# Anonymization with hash values

In [9]:
import hashlib as hl

# The value we want to keep secret.
# hashlib works on bytes, not str; str.encode() uses UTF-8 by default.
secret = "gore_bord@gmail.com".encode()
print(secret)

# Choose a hash function from hashlib.
# Which one to choose depends on what you need -- read the documentation.
# Calling it returns a hash object; printing that shows only its repr.
hasher = hl.sha256(secret)
print(hasher)

# The hexadecimal digest is the readable form of the hash value.
secret_digest = hl.sha256(secret).hexdigest()
print(secret_digest)

# Hashing the same text through a brand-new object yields the same digest.
fresh_digest = hl.sha256("gore_bord@gmail.com".encode()).hexdigest()
print(fresh_digest)




b'gore_bord@gmail.com'
<sha256 HASH object @ 0x7f13511a7cd0>
7d6d9a849ccc7f5febe065ebe3b4f39558fc96ef865e02333dd7b7426ff0c057
7d6d9a849ccc7f5febe065ebe3b4f39558fc96ef865e02333dd7b7426ff0c057


In [12]:
# Two messages that differ by a single character ...
secret1 = "gore_bord@gmail.com".encode()
secret2 = "gore_bord1@gmail.com".encode()

# ... still produce digests that look nothing alike.
for message in (secret1, secret2):
    print(hl.sha256(message).hexdigest())

# secret1 holds the same bytes as `secret` from the first cell,
# so the two digests compare equal.
hl.sha256(secret1).hexdigest() == hl.sha256(secret).hexdigest()


7d6d9a849ccc7f5febe065ebe3b4f39558fc96ef865e02333dd7b7426ff0c057
0290afd54f254c13b9b8c0d6037e4aaf26c65bb025b1815f2ef8a81f3042d7dc


True

# Examples with Pandas

In [19]:
import pandas as pd

# Load the dataset; the path is relative to the notebook's working directory.
csv_path = "../data/BankChurners.csv"
bank = pd.read_csv(csv_path)

# List all column names so we can find the identifier column.
print(bank.columns)

# There are many columns, but the one we want to hash is the client number.
id_column = "CLIENTNUM"
print(bank[id_column].head())

# The ids are read as ints, and an int has no .encode();
# cast them to str so they can be turned into bytes and hashed.
bank[id_column] = bank[id_column].astype(str)
print(bank[id_column].head())

Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',
       'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
      dtype='object')
0    768805383
1    818770008
2    713982108
3    769911858
4    709106358
Name: CLIENTNUM, dtype: int64
0    768805383
1    818770008
2    713982108
3    769911858
4    709106358
Name: CLIENTNUM, dtype: object


In [25]:
# Hash the client numbers.
# Earlier we hashed a single str; now we do the same for every element of a Series.
# Series.apply calls the given function once per element, so no explicit loop
# is needed; the lambda is a small anonymous function that encodes one id to
# bytes and returns its SHA-256 hex digest.
# NOTE: an unsalted SHA-256 of a short numeric id can be reversed by hashing
# every candidate number. For real anonymization, use a keyed hash (hmac) with
# a secret key and drop the original CLIENTNUM column before sharing the data.
hash_series = bank["CLIENTNUM"].apply(
    lambda client_id: hl.sha256(client_id.encode()).hexdigest()
)
print(hash_series.head())

# Add the digests to the frame as column "Hashed clientnum" at position 1,
# i.e. directly after CLIENTNUM.
# DataFrame.insert raises ValueError when the column already exists, which
# would make this cell fail on a second run; overwrite the column in that case
# so the cell can be re-run safely.
if "Hashed clientnum" in bank.columns:
    bank["Hashed clientnum"] = hash_series
else:
    bank.insert(1, "Hashed clientnum", hash_series)


0    c9bbef56f9d8292cb3cfa8ae91f9b9167390e6e4b514d5...
1    7996e2340d70489252370a5df035ec99381c8344cc3511...
2    6fb53dbc743724e086243b5bc288df62b4a6dc1b8bde92...
3    f86b86a1047317685f29c399059b199858685faf5ec6a8...
4    0d239470b0cb57e110cf60bc3865344ee2cdced6e3acdc...
Name: CLIENTNUM, dtype: object


In [26]:
# Sanity check: show the first five rows to confirm the new hash column sits
# next to CLIENTNUM (the original CLIENTNUM column is still in the frame).
bank.head(n=5)

Unnamed: 0,CLIENTNUM,Hashed clientnum,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,c9bbef56f9d8292cb3cfa8ae91f9b9167390e6e4b514d5...,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,7996e2340d70489252370a5df035ec99381c8344cc3511...,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,6fb53dbc743724e086243b5bc288df62b4a6dc1b8bde92...,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,f86b86a1047317685f29c399059b199858685faf5ec6a8...,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,0d239470b0cb57e110cf60bc3865344ee2cdced6e3acdc...,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998
