## Load data 

In [3]:
import pandas as pd

# Read the two MRP model outputs; both files are parsed with pandas' default settings.
generic, specific = (
    pd.read_csv(csv_path)
    for csv_path in ("Data/generic_mrp_results.csv", "Data/specific_mrp_results.csv")
)

# The 2018 results file is read as cp1252 rather than pandas' default encoding.
result = pd.read_csv("Data/2018_results.csv", encoding="cp1252")

In [4]:
# Keep only the district-level rows of each model output, then discard any
# row that still contains a missing value.
generic = generic.loc[generic.Demographic_Type == 'District'].dropna()
specific = specific.loc[specific.Demographic_Type == 'District'].dropna()

In [5]:
# Display the (rows, columns) shape of each filtered model frame.
generic.shape, specific.shape

((428, 3), (428, 3))

In [6]:
# The two model outputs must list the same districts in the same row order.
# Compare the raw values rather than the Series themselves: a Series == Series
# comparison raises when the index labels differ, and the row count is checked
# explicitly instead of being hard-coded.
assert len(generic) == len(specific), \
    "generic and specific contain different numbers of districts"
assert (generic.Demographic.values == specific.Demographic.values).all(), \
    "generic and specific list districts in a different order"

## Preprocessing

In [7]:
# Keep only the result rows whose district appears in the model output, and
# only the two columns used below: 'REP.' and the district key 'Dist'.

result = result.loc[result.Dist.isin(generic.Demographic), ['REP.', 'Dist']]
result.shape

(428, 2)

In [8]:
# pickup available results 

result = result[result['REP.'] != '—']
result = result[result['REP.'] != 'Unc.']

In [9]:
# Convert percentage strings (trailing '%') into fractions of 1.
# `result` was produced by boolean filtering, so assigning a column on it in
# place triggers pandas' SettingWithCopyWarning; `assign` builds a new frame
# instead and keeps the column order unchanged.

result = result.assign(
    **{'REP.': result['REP.'].str.rstrip('%').astype('float') / 100}
)

result.shape

(388, 2)

In [10]:
# Drop model rows for districts that no longer have a row in `result`.
generic = generic.loc[lambda frame: frame.Demographic.isin(result.Dist)]
specific = specific.loc[lambda frame: frame.Demographic.isin(result.Dist)]

In [11]:
# Put all three frames in the same district order and renumber their rows from
# zero, so that the element-wise comparisons in later cells pair up matching
# districts. `reset_index()` keeps the previous labels in an 'index' column.
specific = specific.sort_values(by=['Demographic']).reset_index()
generic = generic.sort_values(by=['Demographic']).reset_index()
result = result.sort_values(by=['Dist']).reset_index()

# Fail loudly if the rows do not line up, rather than silently scoring
# predictions against the wrong district's result.
assert generic.Demographic.tolist() == result.Dist.tolist(), \
    "generic and result districts are misaligned"
assert specific.Demographic.tolist() == result.Dist.tolist(), \
    "specific and result districts are misaligned"

In [12]:
# The three series compared below: the generic and specific model values
# ('Vote_R') and the 'REP.' column of the results.
y_g, y_s, y_r = generic['Vote_R'], specific['Vote_R'], result['REP.']

## Model Comparison 

In [14]:
# Mean squared error of each model against the results
import numpy as np


def _mean_squared_error(predicted, observed):
    """Return the mean of the squared element-wise differences."""
    return np.mean((predicted - observed)**2)


mse_g = _mean_squared_error(y_g, y_r)
mse_s = _mean_squared_error(y_s, y_r)

In [15]:
# Display the two mean squared errors (generic first, then specific).
mse_g, mse_s

(0.005918537234662504, 0.0047357923785882345)

In [16]:
# accuracy, in terms of whether republican is a majority 
y_g_acc = np.where(y_g > 0.5, 1,0)
y_s_acc = np.where(y_s > 0.5, 1,0)
y_r_acc = np.where(y_r > 0.5, 1,0)

In [17]:
# Fraction of districts where each model's majority label matches the result.
# Taking the mean of the boolean match array gives every accuracy its own
# denominator (the original divided the specific model's count by len(y_g))
# and avoids a Python-level sum over a numpy array.
accuracy_g = np.mean(y_g_acc == y_r_acc)
accuracy_s = np.mean(y_s_acc == y_r_acc)

In [18]:
# Display the two accuracies (generic first, then specific).
accuracy_g, accuracy_s

(0.8582474226804123, 0.8788659793814433)

In [19]:
# Collect both metrics into one table with a row per model.
Comparison = dict(
    Model=['generic', 'specific'],
    Accuracy=[accuracy_g, accuracy_s],
    MSE=[mse_g, mse_s],
)

df = pd.DataFrame(Comparison)

In [20]:
# Display the model comparison table.
df

Unnamed: 0,Model,Accuracy,MSE
0,generic,0.858247,0.005919
1,specific,0.878866,0.004736


In [None]:
# Save the comparison table; the row index is written as the first,
# unnamed column (pandas' default behaviour, spelled out here).
df.to_csv(path_or_buf="Model_comparison.csv", index=True)