# Modeling the 2016 US Presidential Election using Logistic Regression

## Loading the data:

In [3]:
import pandas as pd

# Read the combined dataset; the path is resolved relative to the notebook's
# working directory.
census_data = pd.read_csv("combined_data.csv")

# Predictor columns, listed in the order the model receives them.
# Keep this order stable: code that pairs these names with fitted
# coefficients (e.g. zip(feature_cols, coef)) depends on it.
feature_cols = [
    "BLACK_FEMALE_rate",
    "BLACK_MALE_rate",
    "Percent of adults with a bachelor's degree or higher, 2010-2014",
    "ASIAN_MALE_rate",
    "ASIAN_FEMALE_rate",
    "25-29_rate",
    "age_total_pop",
    "20-24_rate",
    "Deep_Pov_All",
    "30-34_rate",
    "Density per square mile of land area - Population",
    "Density per square mile of land area - Housing units",
    "Unemployment_rate_2015",
    "Deep_Pov_Children",
    "PovertyAllAgesPct2014",
    "TOT_FEMALE_rate",
    "PerCapitaInc",
    "MULTI_FEMALE_rate",
    "35-39_rate",
    "MULTI_MALE_rate",
    "Percent of adults completing some college or associate's degree, 2010-2014",
    "60-64_rate",
    "55-59_rate",
    "65-69_rate",
    "TOT_MALE_rate",
    "85+_rate",
    "70-74_rate",
    "80-84_rate",
    "75-79_rate",
    "Percent of adults with a high school diploma only, 2010-2014",
    "WHITE_FEMALE_rate",
    "WHITE_MALE_rate",
    "Amish",
    "Buddhist",
    "Catholic",
    "Christian Generic",
    "Eastern Orthodox",
    "Hindu",
    "Jewish",
    "Mainline Christian",
    "Mormon",
    "Muslim",
    "Non-Catholic Christian",
    "Other",
    "Other Christian",
    "Other Misc",
    "Pentecostal / Charismatic",
    "Protestant Denomination",
    "Zoroastrian",
]

# Predictor matrix (only the columns named above) and the target column.
# NOTE(review): 'Democrat' is presumably a binary outcome indicator -- confirm
# against the dataset.
X = census_data[feature_cols]
y = census_data["Democrat"]

## K-fold cross-validation:

In [8]:
# import the estimator and the cross-validation helper
# (cross_val_score was previously used without being imported, which raises
# NameError on a fresh kernel)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# set the predictor and target variables
X = census_data[feature_cols]
y = census_data['Democrat']

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model on the full dataset; this fitted estimator is the one whose
# coefficients and predictions are inspected afterwards
logreg.fit(X, y)

# estimate the accuracy of the model using 20-fold cross-validation
# (cross_val_score fits clones of the estimator on each training split, so
# the full-data fit above does not influence these scores)
scores = cross_val_score(logreg, X, y, cv=20, scoring='accuracy')
print(scores.mean())


0.901302848066


## View predictions and coefficients of each feature:

In [27]:
# print logistic regression coefficients of each feature
# coef_ holds one row of coefficients per fitted decision function; row 0 is
# used here (presumably the only row, for a two-class target -- confirm)
coef = logreg.coef_[0]

# materialize the (feature name, coefficient) pairs in a list so that printing
# shows the values on both Python 2 and Python 3 (zip is lazy on Python 3)
zipped = list(zip(feature_cols, coef))

# function-call form of print: valid on Python 3, identical output on Python 2
print(zipped)

# predict the class for every row the model was fitted on and store it next
# to the inputs. NOTE: these are in-sample predictions on the training data,
# not predictions for new observations.
census_data['prediction'] = logreg.predict(X)
census_data.to_csv('census_data_with_predictions.csv')

[('BLACK_FEMALE_rate', 0.0078229309381132022), ('BLACK_MALE_rate', 0.0069043382714705859), ("Percent of adults with a bachelor's degree or higher, 2010-2014", 0.091172051942494059), ('ASIAN_MALE_rate', 0.00023539680415206711), ('ASIAN_FEMALE_rate', 0.0002870191125736659), ('25-29_rate', 0.00029004555631667703), ('age_total_pop', 6.0168087716794342e-07), ('20-24_rate', 5.1489211275486853e-05), ('Deep_Pov_All', 0.036246629158976897), ('30-34_rate', 0.00018329014068936507), ('Density per square mile of land area - Population', 7.6752781320599798e-05), ('Density per square mile of land area - Housing units', 0.002263385952343943), ('Unemployment_rate_2015', 0.052265257048178658), ('Deep_Pov_Children', 0.087733488496808093), ('PovertyAllAgesPct2014', 0.10247151670571386), ('TOT_FEMALE_rate', 0.00019044206485666377), ('PerCapitaInc', -2.6587756477380128e-05), ('MULTI_FEMALE_rate', 7.3127897730474207e-05), ('35-39_rate', 6.9788966925406215e-05), ('MULTI_MALE_rate', 7.0184134117694954e-05), ("