In [1]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import diffprivlib.models as dpm
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import pandas as pd
import numpy as np


# Load data

In [2]:
trump_data = pd.read_csv('../data/2016-cces-trump.csv')

# Visualize data


In [3]:
trump_data.head


<bound method NDFrame.head of          uid stateabb  inputstate  race  angryracism  whiteadv  fearraces  \
0          1       NH          33     1          2.0       3.0        1.0   
1          2       LA          22     1          1.0       4.0        1.0   
2          3       MO          29     2          NaN       NaN        NaN   
3          4       AL           1     2          NaN       NaN        NaN   
4          5       CO           8     1          2.0       1.0        1.0   
...      ...      ...         ...   ...          ...       ...        ...   
64595  64596       NY          36     2          NaN       NaN        NaN   
64596  64597       CO           8     1          NaN       NaN        NaN   
64597  64598       LA          22     1          NaN       NaN        NaN   
64598  64599       RI          44     1          NaN       NaN        NaN   
64599  64600       AL           1     1          NaN       NaN        NaN   

       racerare  acograc  aemprac  ...  se_lr

### TODO: Figure out what to do about the NaNs. For now, replacing with 0 noise

# Cleaning data

In [4]:
trump_data.dropna(subset='votetrump')
trump_data.fillna(0, inplace = True)

In [5]:
for col in trump_data.columns:
    values = [v for v in trump_data[col].values if v!=np.nan]

    if isinstance(values[0], str):
        print(str(values[0]))
        map = {v:i for i, v in enumerate(values)}
        trump_data[col].replace(map, inplace=True)

NH
New Hampshire
Northeast
New England
3: Gen X


In [6]:
y = trump_data['votetrump'].to_numpy()
X = trump_data.loc[:, trump_data.columns!='votetrump'].to_numpy()
X = preprocessing.normalize(X, norm='l2')

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train = X_train[1:15000]
y_train = y_train[1:15000]

# Baseline Model

In [40]:
pca = PCA(n_components=5)
tX_train = pca.fit_transform(X_train)
tX_test = pca.transform(X_test)

In [41]:
clf = LogisticRegression(random_state = 0, max_iter = 200).fit(tX_train, y_train)

In [42]:
print(f"Performance on training data {clf.score(tX_train, y_train)}")
print(f"Performance on test data {clf.score(tX_test, y_test)}")

Performance on training data 0.7065137675845057
Performance on test data 0.7081114551083592


# PCA => DP Logistic Regression

In [43]:
pca = PCA(n_components=5)
dp_tX_train = pca.fit_transform(X_train)
dp_tX_test = pca.transform(X_test)

In [44]:
dp_clf = dpm.LogisticRegression(data_norm=1, max_iter = 200).fit(dp_tX_train, y_train)

In [45]:
print(f"Performance on training data {dp_clf.score(dp_tX_train, y_train)}")
print(f"Performance on test data {dp_clf.score(dp_tX_test, y_test)}")

Performance on training data 0.7065137675845057
Performance on test data 0.7081114551083592


# DP PCA => Logistic Regression

In [46]:
dp2_pca = dpm.PCA(n_components=5,data_norm= 1, bounds = (-1, 1))
dp2_tX_train = dp2_pca.fit_transform(X_train)
dp2_tX_test = dp2_pca.transform(X_test)



prob 3.409276857728183e-07
self._rng.random() 0.4691793034305065
prob 3.482328852821268e-07
self._rng.random() 0.3165910357415579
prob 3.864004095800001e-07
self._rng.random() 0.8967564002801857
prob 3.133924020043744e-07
self._rng.random() 0.7024399033355715
prob 5.087649639917684e-07
self._rng.random() 0.488700537980685
prob 5.731306125360196e-07
self._rng.random() 0.9795563351990677
prob 6.099559800904995e-07
self._rng.random() 0.7737669556571758
prob 4.155415845441715e-07
self._rng.random() 0.1618280091816363
prob 4.120685978104495e-07
self._rng.random() 0.7917101531641206
prob 3.741576463896898e-07
self._rng.random() 0.05249042194584885
prob 3.0877700693223714e-07
self._rng.random() 0.46365190487295704
prob 3.009235807119325e-07
self._rng.random() 0.5809455840948127
prob 4.283793875712188e-07
self._rng.random() 0.6290089187008701
prob 3.2678792651462876e-07
self._rng.random() 0.9785632648441297
prob 1.1634806999265506e-06
self._rng.random() 0.6866031413207764
prob 5.79111152406985

KeyboardInterrupt: 

In [15]:
dp2_clf = LogisticRegression(random_state = 0, max_iter = 200).fit(tX_train, y_train)

In [16]:
print(f"Performance on training data {dp2_clf.score(dp2_tX_train, y_train)}")
print(f"Performance on test data {dp2_clf.score(dp2_tX_test, y_test)}")

Performance on training data 0.68
Performance on test data 0.7090402476780185
