In [8]:
'''
    This tutorial shows an example of data anonymization. The code is largely based on the blog post https://databricks.com/blog/2017/02/13/anonymizing-datasets-at-scale-leveraging-databricks-interoperability.html
    Code has been modified to better fit a tutorial setting.
    
    Before running the code, you need to create a file named data.csv and upload it to the sample_data folder.
    You can use the sample data file provided here: https://github.com/iman-saleh/ethical-ai/tree/master/sample_data
    
'''

'\n    This tutorial shows an example of data anonymization. The code is largley based on the blog https://databricks.com/blog/2017/02/13/anonymizing-datasets-at-scale-leveraging-databricks-interoperability.html\n    Code has been modified to better fit a tutorial setting.\n    \n'

In [9]:
# Clone the tutorial repository into ./ethical-ai (the note at the top of this
# notebook points to its sample_data folder for an example data file).
# NOTE(review): the paths used later point at sample_data/, not
# ethical-ai/sample_data/, so data.csv still has to be placed under sample_data/.
!git clone https://github.com/iman-saleh/ethical-ai 

Cloning into 'ethical-ai'...
remote: Enumerating objects: 199, done.[K
remote: Counting objects: 100% (199/199), done.[K
remote: Compressing objects: 100% (156/156), done.[K
remote: Total 199 (delta 58), reused 165 (delta 36), pack-reused 0[K
Receiving objects: 100% (199/199), 9.32 MiB | 16.15 MiB/s, done.
Resolving deltas: 100% (58/58), done.


In [2]:
# Install the third-party packages imported in the next cell:
# Faker (used to generate the replacement values) and unicodecsv (CSV reading/writing).
!pip install Faker unicodecsv

Collecting Faker
[?25l  Downloading https://files.pythonhosted.org/packages/f5/7e/41e4efbec4722a6b0fe44acffed08b9477b9334c3e40a7a877291a20e7fe/Faker-1.0.4-py2.py3-none-any.whl (846kB)
[K    100% |████████████████████████████████| 849kB 14.6MB/s 
[?25hCollecting unicodecsv
  Downloading https://files.pythonhosted.org/packages/6f/a4/691ab63b17505a26096608cc309960b5a6bdf39e4ba1a793d5f9b1a53270/unicodecsv-0.14.1.tar.gz
Building wheels for collected packages: unicodecsv
  Building wheel for unicodecsv (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/a6/09/e9/e800279c98a0a8c94543f3de6c8a562f60e51363ed26e71283
Successfully built unicodecsv
Installing collected packages: Faker, unicodecsv
Successfully installed Faker-1.0.4 unicodecsv-0.14.1


In [0]:
import unicodecsv as csv
from faker import Faker
import pandas as pd
from collections import defaultdict

In [0]:
def anonymize_rows(rows):
    """
    Lazily yield anonymized copies of the given rows.

    Rows is an iterable of dictionaries that contain "name", "email", "ssn"
    and "phone_number" fields that need to be anonymized. Each of those
    fields is replaced with a value generated by Faker; every other field is
    passed through unchanged.

    Within one call, the same original value is always replaced by the same
    fake value, so records that belong to the same person remain linked
    after anonymization.

    The input dictionaries are left untouched: each yielded row is a new
    dict, so the caller keeps access to the original data.

    Raises KeyError (while the generator is being consumed) if a row does
    not contain one of the four anonymized fields.
    """
    # Load faker
    # Check localization options at https://github.com/joke2k/faker#localization
    faker = Faker()

    # One lookup table per sensitive field. A defaultdict calls its factory
    # the first time an original value is seen and remembers the result, so
    # repeated originals map to the same fake value.
    # A tuple (not a dict) keeps the per-row generation order fixed:
    # name, email, ssn, phone_number.
    # NOTE(review): Faker is presumably free to return the same value twice,
    # so two different originals could occasionally share a fake - confirm
    # whether that matters for the use case.
    replacements = (
        ("name", defaultdict(faker.name)),
        ("email", defaultdict(faker.email)),
        ("ssn", defaultdict(faker.ssn)),
        ("phone_number", defaultdict(faker.phone_number)),
    )

    # Iterate over the rows and yield anonymized copies one at a time.
    for row in rows:
        # Work on a copy so the caller's row is not overwritten.
        anonymized = dict(row)
        for field, fakes in replacements:
            anonymized[field] = fakes[row[field]]

        # Yield the anonymized row back to the caller
        yield anonymized

In [0]:
def anonymize(source, target):
    """
    Write an anonymized copy of a CSV file.

    The source argument is a path to a CSV file containing data to anonymize,
    while target is a path to write the anonymized CSV data to. The header
    row of the source is written to the target first; every data row is then
    passed through anonymize_rows and written out. The target file is
    created, or truncated if it already exists.
    """
    # Both files are opened in binary mode: the csv module used here is
    # unicodecsv, which decodes/encodes the text itself and therefore expects
    # byte streams. This also avoids the 'U' mode flag, which newer Python
    # versions reject.
    with open(source, 'rb') as f, open(target, 'wb') as o:
        # Use the DictReader to easily extract fields
        reader = csv.DictReader(f)
        writer = csv.DictWriter(o, reader.fieldnames)
        writer.writeheader()

        # Read and anonymize data, writing to target file row by row so the
        # whole file never has to be held in memory.
        for row in anonymize_rows(reader):
            writer.writerow(row)

In [0]:
# Location of the CSV to anonymize and of the anonymized copy to produce.
# Both paths are relative to the notebook's working directory.
input_file = "sample_data/data.csv"
output_file = "sample_data/data_anonymized.csv"

In [10]:
# Show the data before anonymization: load the input CSV and preview its
# first rows, for comparison with the anonymized output shown later.
data = pd.read_csv(input_file)
data.head()

Unnamed: 0,name,email,ssn,phone_number,drugs,marital status
0,Griffin Spears,metus.Aenean@justofaucibus.edu,052-89-6340,1-609-529-6386,APAP/Codeine,Married
1,Theodore Kim,aliquam.iaculis@urna.co.uk,591-59-1129,1-519-448-6170,Lovastatin,Married
2,Kieran Fletcher,Quisque.imperdiet@rhoncusProinnisl.org,099-49-9523,1-505-428-2353,Triamterene/Hydrochlorothiazide,Common-Law
3,Griffin Spears,metus.Aenean@justofaucibus.edu,052-89-6340,1-609-529-6386,Amlodipine Besylate,Married
4,Russell Gross,Vestibulum.ante.ipsum@fringillaporttitorvulput...,624-60-2852,1-766-854-8315,Simvastatin,Single


In [0]:
# Run the anonymization: read input_file and write the anonymized rows to output_file.
anonymize(input_file, output_file )

In [13]:
# Show the data after anonymization: load the file written by anonymize()
# and preview its first rows.
data_anonymized = pd.read_csv(output_file)
data_anonymized.head()

Unnamed: 0,name,email,ssn,phone_number,drugs,marital status
0,Molly Hall,rbenson@yahoo.com,083-07-0855,9734077314,APAP/Codeine,Married
1,Robert Elliott,carrie68@roman.com,285-57-5250,250.717.5453,Lovastatin,Married
2,Kenneth Brown,amandatodd@hart-cox.com,867-76-5145,(055)191-8979x2447,Triamterene/Hydrochlorothiazide,Common-Law
3,Molly Hall,rbenson@yahoo.com,083-07-0855,9734077314,Amlodipine Besylate,Married
4,Andrea Collins,rcollier@gmail.com,466-82-1258,(519)528-9054x423,Simvastatin,Single
