In [1]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import diffprivlib.models as dpm
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import pandas as pd
import numpy as np


# Load data

In [2]:
trump_data = pd.read_csv('../data/2016-cces-trump.csv')

# Visualize data


In [3]:
trump_data.head


<bound method NDFrame.head of          uid stateabb  inputstate  race  angryracism  whiteadv  fearraces  \
0          1       NH          33     1          2.0       3.0        1.0   
1          2       LA          22     1          1.0       4.0        1.0   
2          3       MO          29     2          NaN       NaN        NaN   
3          4       AL           1     2          NaN       NaN        NaN   
4          5       CO           8     1          2.0       1.0        1.0   
...      ...      ...         ...   ...          ...       ...        ...   
64595  64596       NY          36     2          NaN       NaN        NaN   
64596  64597       CO           8     1          NaN       NaN        NaN   
64597  64598       LA          22     1          NaN       NaN        NaN   
64598  64599       RI          44     1          NaN       NaN        NaN   
64599  64600       AL           1     1          NaN       NaN        NaN   

       racerare  acograc  aemprac  ...  se_lr

### TODO: Figure out what to do about the NaNs. For now, replacing with N(0,1) noise

# Cleaning data

In [4]:
trump_data.dropna(subset='votetrump')
trump_data.fillna(0, inplace = True)

In [5]:
for col in trump_data.columns:
    values = [v for v in trump_data[col].values if v!=np.nan]

    if isinstance(values[0], str):
        print(str(values[0]))
        map = {v:i for i, v in enumerate(values)}
        trump_data[col].replace(map, inplace=True)

NH
New Hampshire
Northeast
New England
3: Gen X


In [6]:
y = trump_data['votetrump'].to_numpy()
X = trump_data.loc[:, trump_data.columns!='votetrump'].to_numpy()
X = preprocessing.normalize(X, norm='l2')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train = X_train[:100]
y_train = y_train[:100]

# Baseline Model

In [8]:
pca = PCA(n_components=5)
tX_train = pca.fit_transform(X_train)
tX_test = pca.transform(X_test)

In [9]:
clf = LogisticRegression(random_state = 0, max_iter = 200).fit(tX_train, y_train)

In [10]:
print(f"Performance on training data {clf.score(tX_train, y_train)}")
print(f"Performance on test data {clf.score(tX_test, y_test)}")

Performance on training data 0.68
Performance on test data 0.7074303405572755


# PCA => DP Logistic Regression

In [11]:
pca = PCA(n_components=5)
dp_tX_train = pca.fit_transform(X_train)
dp_tX_test = pca.transform(X_test)

In [12]:
dp_clf = dpm.LogisticRegression(data_norm=1, max_iter = 200).fit(dp_tX_train, y_train)

In [13]:
print(f"Performance on training data {dp_clf.score(dp_tX_train, y_train)}")
print(f"Performance on test data {dp_clf.score(dp_tX_test, y_test)}")

Performance on training data 0.48
Performance on test data 0.49572755417956654


# DP PCA => Logistic Regression

In [14]:
dp2_pca = dpm.PCA(n_components=5,data_norm= 1, bounds = (-1, 1))
dp2_tX_train = dp2_pca.fit_transform(X_train)
dp2_tX_test = dp2_pca.transform(X_test)



prob 0.013231997858153422
self._rng.random() 0.4685538760616974
prob 0.013952435975919154
self._rng.random() 0.4324302431817849
prob 0.013315132662663922
self._rng.random() 0.21339742579553322
prob 0.015714368814742537
self._rng.random() 0.2874793221953801
prob 0.015659022859041697
self._rng.random() 0.10224307046518877
prob 0.013302457608746588
self._rng.random() 0.8948994883373231
prob 0.014855850020596488
self._rng.random() 0.9731080265444951
prob 0.013450884518384238
self._rng.random() 0.21539844894483473
prob 0.014352376637061775
self._rng.random() 0.657478377720339
prob 0.013440887438317254
self._rng.random() 0.18634160435465497
prob 0.013997669897831825
self._rng.random() 0.17042454211314928
prob 0.013146908677644611
self._rng.random() 0.6125096489134658
prob 0.013089977312951456
self._rng.random() 0.654635001055179
prob 0.013164795031693939
self._rng.random() 0.6955918770091517
prob 0.013088984873795263
self._rng.random() 0.2852526267362626
prob 0.013650007815503849
self._rng.r

In [16]:
dp2_clf = LogisticRegression(random_state = 0, max_iter = 200).fit(tX_train, y_train)

In [17]:
print(f"Performance on training data {dp2_clf.score(dp2_tX_train, y_train)}")
print(f"Performance on test data {dp2_clf.score(dp2_tX_test, y_test)}")

Performance on training data 0.68
Performance on test data 0.7074303405572755
