In [2]:
'''
    This tutorial shows an example of data anonymization. The code is largely based on the blog post https://databricks.com/blog/2017/02/13/anonymizing-datasets-at-scale-leveraging-databricks-interoperability.html
    Code has been modified to better fit a tutorial setting.
    
'''

'\n    This tutorial shows an example of data anonymization. The code is largley based on the blog https://databricks.com/blog/2017/02/13/anonymizing-datasets-at-scale-leveraging-databricks-interoperability.html\n    Code has been modified to better fit a tutorial setting.\n    \n'

In [3]:
# Install the third-party packages imported by this notebook (Faker, unicodecsv).
# The run recorded below resolved to Faker 1.0.2 and unicodecsv 0.14.1.
!pip install Faker unicodecsv

Collecting Faker
  Downloading https://files.pythonhosted.org/packages/79/36/8e1aa2f775018ea11a897bef32b6f80d78dcb6cc6563744f2e1dcf128b82/Faker-1.0.2-py2.py3-none-any.whl (845kB)
[K    100% |████████████████████████████████| 849kB 1.2MB/s eta 0:00:01
[?25hCollecting unicodecsv
  Downloading https://files.pythonhosted.org/packages/6f/a4/691ab63b17505a26096608cc309960b5a6bdf39e4ba1a793d5f9b1a53270/unicodecsv-0.14.1.tar.gz
Collecting text-unidecode==1.2 (from Faker)
  Downloading https://files.pythonhosted.org/packages/79/42/d717cc2b4520fb09e45b344b1b0b4e81aa672001dd128c180fabc655c341/text_unidecode-1.2-py2.py3-none-any.whl (77kB)
[K    100% |████████████████████████████████| 81kB 4.3MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: unicodecsv
  Running setup.py bdist_wheel for unicodecsv ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/a6/09/e9/e800279c98a0a8c94543f3de6c8a562f60e51363ed26e71283
Successfully built unicodecsv
Installing collected packages

In [4]:
import unicodecsv as csv
from faker import Faker
import pandas as pd
from collections import defaultdict

In [5]:
def anonymize_rows(rows):
    """
    Lazily replace the identifying fields of each row with fake values.

    ``rows`` is an iterable of dictionaries; every dictionary must contain
    the keys "name", "email", "ssn" and "phone_number" (a missing key raises
    KeyError). Each row is modified in place and then yielded, so any other
    keys it carries are passed through untouched.

    Within one call, identical original values always receive the same fake
    replacement, because every generated value is cached per field.
    """
    # Check localization options at https://github.com/joke2k/faker#localization
    fake = Faker()

    # One cache per anonymized field: the first lookup of an original value
    # asks Faker for a replacement, later lookups reuse it. A tuple (rather
    # than a dict) keeps the per-row processing order fixed.
    replacement_caches = (
        ("name", defaultdict(fake.name)),
        ("email", defaultdict(fake.email)),
        ("ssn", defaultdict(fake.ssn)),
        ("phone_number", defaultdict(fake.phone_number)),
    )

    for record in rows:
        for field, cache in replacement_caches:
            # Indexing the defaultdict generates and stores a fake value the
            # first time this original value is seen.
            record[field] = cache[record[field]]

        # Hand the (now anonymized) row back to the caller.
        yield record

In [6]:
def anonymize(source, target):
    """
    Anonymize a CSV file.

    The source argument is a path to a CSV file containing data to anonymize,
    while target is a path to write the anonymized CSV data to. The header of
    the target file is copied from the source file, and every data row is
    passed through anonymize_rows() before being written.
    """
    # Both files are opened in binary mode: `csv` in this notebook is
    # unicodecsv, which does the byte <-> text decoding/encoding itself and
    # therefore fails on text-mode handles under Python 3. The previous 'rU'
    # mode is also rejected by open() from Python 3.11 onwards.
    with open(source, 'rb') as f, open(target, 'wb') as o:
        # Use the DictReader to easily extract fields
        reader = csv.DictReader(f)
        writer = csv.DictWriter(o, reader.fieldnames)
        writer.writeheader()

        # Read and anonymize data, writing to target file.
        for row in anonymize_rows(reader):
            writer.writerow(row)

In [7]:
# Path of the CSV file that holds the original (non-anonymized) data.
input_file = "sample_data/data.csv"
# Path the anonymized copy of that CSV is written to.
output_file = "sample_data/data_anonymized.csv"

In [8]:
# Show the first rows of the data before anonymization.
# NOTE(review): the "Unnamed: 0" column in the recorded output suggests the CSV
# carries an unlabeled leading index column - confirm against the file.
data = pd.read_csv(input_file)
data.head()

Unnamed: 0,name,email,ssn,phone_number,drugs,marital status
0,Griffin Spears,metus.Aenean@justofaucibus.edu,052-89-6340,1-609-529-6386,APAP/Codeine,Married
1,Theodore Kim,aliquam.iaculis@urna.co.uk,591-59-1129,1-519-448-6170,Lovastatin,Married
2,Kieran Fletcher,Quisque.imperdiet@rhoncusProinnisl.org,099-49-9523,1-505-428-2353,Triamterene/Hydrochlorothiazide,Common-Law
3,Griffin Spears,metus.Aenean@justofaucibus.edu,052-89-6340,1-609-529-6386,Amlodipine Besylate,Married
4,Russell Gross,Vestibulum.ante.ipsum@fringillaporttitorvulput...,624-60-2852,1-766-854-8315,Simvastatin,Single


In [9]:
# Run the anonymization: read input_file and write the anonymized rows to output_file.
anonymize(input_file, output_file )

In [10]:
# Show the first rows of the anonymized file for comparison with the original data.
data_anonymized = pd.read_csv(output_file)
data_anonymized.head()

Unnamed: 0,name,email,ssn,phone_number,drugs,marital status
0,Brian Bonilla,johnstewart@montoya.org,643-19-2478,(804)433-1033,APAP/Codeine,Married
1,Adam Clark,peter18@gmail.com,240-56-0081,(068)407-7212,Lovastatin,Married
2,Natalie Brown,paul28@payne.net,073-38-8728,+1-878-245-5746x5773,Triamterene/Hydrochlorothiazide,Common-Law
3,Brian Bonilla,johnstewart@montoya.org,643-19-2478,(804)433-1033,Amlodipine Besylate,Married
4,Carol Lewis,jenny53@moss.com,574-06-2618,001-087-462-1799,Simvastatin,Single
