In [1]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import diffprivlib.models as dpm
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import pandas as pd
import numpy as np

# Load data

In [2]:
trump_data = pd.read_csv('../data/2016-cces-trump.csv')

# Visualize data


In [3]:
trump_data.head

<bound method NDFrame.head of          uid stateabb  inputstate  race  angryracism  whiteadv  fearraces  \
0          1       NH          33     1          2.0       3.0        1.0   
1          2       LA          22     1          1.0       4.0        1.0   
2          3       MO          29     2          NaN       NaN        NaN   
3          4       AL           1     2          NaN       NaN        NaN   
4          5       CO           8     1          2.0       1.0        1.0   
...      ...      ...         ...   ...          ...       ...        ...   
64595  64596       NY          36     2          NaN       NaN        NaN   
64596  64597       CO           8     1          NaN       NaN        NaN   
64597  64598       LA          22     1          NaN       NaN        NaN   
64598  64599       RI          44     1          NaN       NaN        NaN   
64599  64600       AL           1     1          NaN       NaN        NaN   

       racerare  acograc  aemprac  ...  se_lr

### TODO: Figure out what to do about the NaNs. For now, replacing with N(0,1) noise

# Cleaning data

In [4]:
trump_data.dropna(subset=['votetrump'])
trump_data.fillna(0, inplace = True)

In [5]:
for col in trump_data.columns:
    values = [v for v in trump_data[col].values if v!=np.nan]

    if isinstance(values[0], str):
        print(str(values[0]))
        map = {v:i for i, v in enumerate(values)}
        trump_data[col].replace(map, inplace=True)

NH
New Hampshire
Northeast
New England
3: Gen X


In [6]:
y = trump_data['votetrump'].to_numpy()
X = trump_data.loc[:, trump_data.columns!='votetrump'].to_numpy()
X = preprocessing.normalize(X, norm='l2')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Baseline Model

In [8]:
pca = PCA(n_components=5)
tX_train = pca.fit_transform(X_train)
tX_test = pca.transform(X_test)

In [13]:
clf = LogisticRegression(random_state = 0, max_iter = 200).fit(tX_train, y_train)

In [14]:
print(f"Performance on training data {clf.score(tX_train, y_train)}")
print(f"Performance on test data {clf.score(tX_test, y_test)}")

Performance on training data 0.7071620227038183
Performance on test data 0.7071207430340557


# PCA => DP Logistic Regression

In [15]:
pca = PCA(n_components=5)
tX_train = pca.fit_transform(X_train)
tX_test = pca.transform(X_test)



In [18]:
dp_clf = dpm.LogisticRegression(random_state = 0, max_iter = 200).fit(tX_train, y_train)



In [19]:
print(f"Performance on training data {dp_clf.score(tX_train, y_train)}")
print(f"Performance on test data {dp_clf.score(tX_test, y_test)}")

Performance on training data 0.7071620227038183
Performance on test data 0.7071207430340557


# DP PCA => Logistic Regression

In [20]:
dp_pca = dpm.PCA(n_components=5,data_norm= 1)
tX_train = dp_pca.fit_transform(X_train[1:15000,])
tX_test = dp_pca.transform(X_test)



This will result in additional privacy leakage. To ensure differential privacy with no additional privacy loss, specify `range` for each valued returned by np.mean().


In [21]:
dp_pca

PCA(accountant=BudgetAccountant(spent_budget=[(1.0, 0), (1.0, 0)]),
    bounds=(array([ 1.38461362e-05,  4.07288755e-01,  6.31971427e-06,  6.31971427e-06,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -8.37783146e-06,  0.00000000e+00,
       -4.31002934e-06,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00...
       6.95809244e-06, 6.95809117e-06, 6.95809244e-06, 6.93534049e-06,
       6.93237659e-06, 6.93509871e-06, 9.01339830e-05, 3.47364756e-05,
       6.94691716e-06, 2.78191766e-05, 4.16522404e-05, 4.86835591e-05,
       1.17791665e-05, 4.21370483e-06, 7.71971867e-06, 1.37005004e-05,
       7.93064606e-06, 1.01471169e-05, 5.61797424e-06, 6.30487632e-06,
       6.39245644e-06, 7.20512295e-06, 1.00675566e-05])),
    data_norm=1, n_components=5)

In [22]:
clf_dfPCA = LogisticRegression(random_state = 0, max_iter = 200).fit(tX_train, y_train[1:15000])
print(f"Performance on training data {clf_dfPCA.score(tX_train, y_train[1:15000])}")
print(f"Performance on test data {clf_dfPCA.score(tX_test, y_test)}")

Performance on training data 0.7095806387092473
Performance on test data 0.7071207430340557
