# Setup

In [1]:
# Notebook Setup
# Standard library
import warnings
# Scrambler
from random import shuffle

# Third-party
import numpy as np
import pandas as pd

# Anonymizer Packages

# Anonymize DF
from anonymizedf.anonymizedf import anonymize
# Faker
from faker import Faker

faker = Faker()

warnings.filterwarnings("ignore")

# Generate Fake Dataframe

In [2]:
# Show what a single Faker-generated name looks like
intro_message = "The Faker library can generate fake names. By running 'faker.name()', we get:"
print(intro_message)
faker.name()

The Faker library can generate fake names. By running 'faker.name()', we get:


'Nicole Mosley'

In [6]:
# Create a list of fake names
fake_names = [faker.name() for _ in range(10)]
df = pd.DataFrame(fake_names, columns=['Student'])
# Generate random test scores: one integer per student, drawn from 50-99
# (np.random.randint excludes the upper bound). numpy comes from the import
# cell at the top of the notebook rather than being imported mid-notebook.
df['TestScore'] = np.random.randint(50, 100, len(df))
# Export to CSV (index=False keeps the row index out of the file)
df.to_csv('StudentTestScores.csv', index=False)

In [13]:
# Explore the dataframe: one row per generated student name with its TestScore
df

Unnamed: 0,Student,TestScore
0,Amanda Davis,66
1,Shannon Roberson,80
2,Jordan Cowan,67
3,James Jackson,95
4,Brooke Mendez,90
5,Paul Vaughn,64
6,Jennifer Bell,98
7,Victoria Wallace,92
8,Teresa Goodwin,91
9,Andrea Dixon,89


# Demonstrate Anonymization

In [14]:
# Scenario:
# Suppose a professor wants to share test scores, but does not want each student's
# name to be publicly linked to their score. The identifiable starting table is:
df

Unnamed: 0,Student,TestScore
0,Amanda Davis,66
1,Shannon Roberson,80
2,Jordan Cowan,67
3,James Jackson,95
4,Brooke Mendez,90
5,Paul Vaughn,64
6,Jennifer Bell,98
7,Victoria Wallace,92
8,Teresa Goodwin,91
9,Andrea Dixon,89


## Anonymization Via AnonymizeDF

In [15]:
# Wrap the dataframe with AnonymizeDF; the fake_* calls in the following cells
# are made on this object.
anon = anonymize(df)

In [16]:
# AnonymizeDF can generate fake names; in the displayed result they appear in a
# "Fake_Student" column next to the original "Student" column
anon.fake_names("Student")

Unnamed: 0,Student,TestScore,Fake_Student
0,Amanda Davis,66,Donald Thorpe
1,Shannon Roberson,80,Jane Taylor-Griffin
2,Jordan Cowan,67,Mark Bradley
3,James Jackson,95,Antony Brady
4,Brooke Mendez,90,Stephen Grant
5,Paul Vaughn,64,Nigel Wallace
6,Jennifer Bell,98,Mrs Zoe Pearce
7,Victoria Wallace,92,Dr Raymond Fox
8,Teresa Goodwin,91,Michelle Cook
9,Andrea Dixon,89,Lisa Smith-King


In [17]:
# AnonymizeDF can also generate ID-style replacements; in the displayed result
# the "Fake_Student" column now holds these IDs instead of the fake names
anon.fake_ids("Student")

Unnamed: 0,Student,TestScore,Fake_Student
0,Amanda Davis,66,OUDK95858519242798
1,Shannon Roberson,80,OSWH74372798088742
2,Jordan Cowan,67,LAPA85605875005585
3,James Jackson,95,FDDV70377918619353
4,Brooke Mendez,90,ELIS65225731984362
5,Paul Vaughn,64,HCHU53458707066984
6,Jennifer Bell,98,JHJJ89316053989557
7,Victoria Wallace,92,HLTS04292200988299
8,Teresa Goodwin,91,MQNN80889340360340
9,Andrea Dixon,89,CNSS64591255916757


In [18]:
# AnonymizeDF can also generate numbered category labels; in the displayed result
# the "Fake_Student" column now reads "Student 1", "Student 2", ...
anon.fake_categories("Student")

Unnamed: 0,Student,TestScore,Fake_Student
0,Amanda Davis,66,Student 1
1,Shannon Roberson,80,Student 2
2,Jordan Cowan,67,Student 3
3,James Jackson,95,Student 4
4,Brooke Mendez,90,Student 5
5,Paul Vaughn,64,Student 6
6,Jennifer Bell,98,Student 7
7,Victoria Wallace,92,Student 8
8,Teresa Goodwin,91,Student 9
9,Andrea Dixon,89,Student 10


## Anonymization Via Faker

In [19]:
# Reset dataframe: remove the column added by the AnonymizeDF demo.
# errors='ignore' makes this cell safe to re-run (or to run out of order):
# if the column is already gone the frame is returned unchanged instead of
# raising KeyError.
df = df.drop(columns=['Fake_Student'], errors='ignore')

In [20]:
# Build a lookup from each distinct real student name to a Faker-generated
# name, then attach the replacements as a new column.
faker = Faker()
Faker.seed(4321)
dict_names = dict(
    (real_name, faker.name()) for real_name in df['Student'].unique()
)
df['New Student Name'] = df['Student'].map(dict_names)

In [21]:
# Show the frame with the Faker-generated "New Student Name" column
df

Unnamed: 0,Student,TestScore,New Student Name
0,Amanda Davis,66,Jason Brown
1,Shannon Roberson,80,Jacob Stein
2,Jordan Cowan,67,Cody Brown
3,James Jackson,95,Larry Morales
4,Brooke Mendez,90,Jessica Hendricks
5,Paul Vaughn,64,Brian Moore
6,Jennifer Bell,98,Scott Baker
7,Victoria Wallace,92,Ruth Hoffman
8,Teresa Goodwin,91,Daniel George
9,Andrea Dixon,89,David Moody


In [22]:
# Faker can generate other kinds of fake data as well, e.g. a postal address:
print(faker.address())

54933 Sanchez Circles
Randallmouth, MN 86691


In [23]:
# Faker can also generate random filler text
print(faker.text())

Federal lot next senior. Final artist series attorney office house institution month. Course TV music sort base.


## Word Scrambling

In [24]:
# Reset dataframe: remove the column added by the Faker demo.
# errors='ignore' makes this cell safe to re-run (or to run out of order):
# if the column is already gone the frame is returned unchanged instead of
# raising KeyError.
df = df.drop(columns=['New Student Name'], errors='ignore')

In [34]:
# Create a scrambler function
def word_scrambler(word, rng=None):
    """Return the characters of ``word`` rearranged in a random order.

    Parameters
    ----------
    word : str
        Text to scramble. Every character, spaces included, takes part in
        the shuffle.
    rng : object with a ``shuffle(list)`` method, optional
        Source of randomness, e.g. a seeded ``random.Random`` instance for a
        reproducible scramble. When omitted, the module-level
        ``random.shuffle`` is used, exactly as before.

    Returns
    -------
    str
        A string made of the same characters as ``word`` in shuffled order.
    """
    chars = list(word)
    # shuffle() reorders the list in place and returns None, so it has to run
    # as its own statement before the characters are joined back together.
    if rng is None:
        shuffle(chars)
    else:
        rng.shuffle(chars)
    return ''.join(chars)

In [36]:
# Scramble each name's characters, then strip the spaces that were shuffled
# along with the letters
df['ScrambledName'] = df['Student'].apply(word_scrambler).str.replace(" ","")

In [37]:
# Show the frame with the new "ScrambledName" column
df

Unnamed: 0,Student,TestScore,ScrambledName
0,Amanda Davis,66,ivnaADaadsm
1,Shannon Roberson,80,nSrahoboRnnsneo
2,Jordan Cowan,67,ononawdraJC
3,James Jackson,95,JaomakcssJen
4,Brooke Mendez,90,BekozoeenMdr
5,Paul Vaughn,64,nuauVgPlah
6,Jennifer Bell,98,fJilrnneBele
7,Victoria Wallace,92,litaaoWeaVrccli
8,Teresa Goodwin,91,arwednGoToeis
9,Andrea Dixon,89,nodADirexna


## Anonymization and De-Anonymization Techniques

In [28]:
# Reset dataframe: remove the column added by the scrambling demo.
# errors='ignore' makes this cell safe to re-run (or to run out of order):
# if the column is already gone the frame is returned unchanged instead of
# raising KeyError.
df = df.drop(columns=['ScrambledName'], errors='ignore')

In [29]:
# Create the Fake Student Names
anon = anonymize(df)
anon.fake_names('Student')

# Create a "Key" pairing each real name with its fake replacement.
# index=False keeps the row index out of the file, so reading it back does not
# produce a stray "Unnamed: 0" column (same convention as StudentTestScores.csv).
# This file re-identifies every student, so keep it separate from the shared scores.
dfKey = df[['Student', 'Fake_Student']]
dfKey.to_csv('key.csv', index=False)

# Keep only the fake names: overwrite Student with them, then drop the helper column
df = df.assign(Student = df['Fake_Student'])
df = df.drop(columns='Fake_Student')

df

Unnamed: 0,Student,TestScore
0,Raymond Wilson,66
1,Ashleigh Griffiths,80
2,Suzanne Rose,67
3,Ms Emily Nelson,95
4,Victoria Ryan-Wyatt,90
5,Luke Coles,64
6,Hugh Jordan,98
7,Mr Robin Jones,92
8,Geraldine Davies,91
9,Dr Luke Johnston,89


In [30]:
# Load in the decoder key
dfKey = pd.read_csv('key.csv')
# Return to the original data: look each fake name up in the key
real_name_by_fake = dfKey.set_index('Fake_Student')['Student']
df['Student'] = df['Student'].map(real_name_by_fake)

In [31]:
# Show the frame after mapping the fake names back through the key
df

Unnamed: 0,Student,TestScore
0,Amanda Davis,66
1,Shannon Roberson,80
2,Jordan Cowan,67
3,James Jackson,95
4,Brooke Mendez,90
5,Paul Vaughn,64
6,Jennifer Bell,98
7,Victoria Wallace,92
8,Teresa Goodwin,91
9,Andrea Dixon,89
