# Texas Politics - Exploratory Analysis 

In [1]:
#import necessary packages for analysis 
import pandas as pd
from pandas import Series, DataFrame
%pylab inline
import numpy as np
import requests
import json
import os
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import Lasso, LassoCV, ElasticNetCV, LassoLarsCV
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, r2_score 

import warnings
warnings.simplefilter("ignore")

Populating the interactive namespace from numpy and matplotlib


## Data Pre-Processing 

In [2]:
#demopgraphic data state of texas
#read in files
demo_2011 = pd.read_csv('2011data.csv')
demo_2015 = pd.read_csv('2015data.csv')

In [3]:
#reducing data to include all ages 
demo_2011 = demo_2011[demo_2011['Age'] == 'All Ages']
demo_2015 = demo_2015[demo_2015['Age'] == 'All Ages']

In [4]:
demo_2015.head()

Unnamed: 0,County,FIPS,Age,Total,Total Male,Total Female,Anglo Total,Anglo Male,Anglo Female,Black Total,Black Male,Black Female,Other Total,Other Male,Other Female,Hispanic Total,Hispanic Male,Hispanic Female
0,STATE OF TEXAS,0,All Ages,27469114,13662417,13806697,11505371,5697147,5808224,3171043,1533464,1637579,1793580,878503,915077,10999120,5553303,5445817
87,ANDERSON COUNTY,1,All Ages,58906,35791,23115,34795,19265,15530,12309,8896,3413,1571,760,811,10231,6870,3361
174,ANDREWS COUNTY,3,All Ages,17920,8964,8956,7915,3857,4058,236,118,118,362,176,186,9407,4813,4594
261,ANGELINA COUNTY,5,All Ages,90373,44660,45713,54422,26684,27738,13427,6408,7019,2434,1151,1283,20090,10417,9673
348,ARANSAS COUNTY,7,All Ages,25110,12379,12731,17078,8303,8775,269,149,120,983,480,503,6780,3447,3333


In [5]:
#need to drop the first row because we don't need total texas numbers
demo_2015=demo_2015.drop(demo_2015.index[0])

In [6]:
#voting results in recent elections in Texas 
#read in file
election_results = pd.read_csv('2016Results.csv')

In [7]:
#filter election data to get only counties in Texas
texas_results = election_results[election_results['State Code']=='TX']
texas_results.head()
#254 counties in the state of texas 

Unnamed: 0,State Code,County Name,County Population,Clinton or Trump State,Clinton,Trump,Total,% Clinton,% Trump,Vote Difference C-T,...,County FIPS Code,CBSA Code,CSA Code,CBSA Title,CBSA Population,Metropolitan/Micropolitan Statistical Area,CSA Title,Metropolitan Division Title,Metropolitan Division Code,Central/Outlying County
2494,TX,Anderson County,57580,Trump,3358,13165,16887,19.89%,77.96%,"(9,807)",...,1.0,37300.0,,"Palestine, TX",,Micropolitan Statistical Area,,,,Central
2495,TX,Andrews County,18105,Trump,836,3925,4926,16.97%,79.68%,"(3,089)",...,3.0,11380.0,,"Andrews, TX",,Micropolitan Statistical Area,,,,Central
2496,TX,Angelina County,88255,Trump,7538,21666,29870,25.24%,72.53%,"(14,128)",...,5.0,31260.0,,"Lufkin, TX",,Micropolitan Statistical Area,,,,Central
2497,TX,Aransas County,25350,Trump,2458,7730,10467,23.48%,73.85%,"(5,272)",...,7.0,18580.0,204.0,"Corpus Christi, TX",452422.0,Metropolitan Statistical Area,"Corpus Christi-Kingsville-Alice, TX",,,Outlying
2498,TX,Archer County,8715,Trump,394,3785,4269,9.23%,88.66%,"(3,391)",...,9.0,48660.0,,"Wichita Falls, TX",150780.0,Metropolitan Statistical Area,,,,Outlying


In [8]:
texas_results["Metropolitan/Micropolitan Statistical Area"].value_counts()

Metropolitan Statistical Area    82
Micropolitan Statistical Area    47
Name: Metropolitan/Micropolitan Statistical Area, dtype: int64

In [9]:
#we need to convert Metro & Micro statistical area to a dummy variable (0,1) so we can use it in the model
dummy = pd.get_dummies(texas_results["Metropolitan/Micropolitan Statistical Area"])
dummy.head() #checking to make sure it worked 

Unnamed: 0,Metropolitan Statistical Area,Micropolitan Statistical Area
2494,0,1
2495,0,1
2496,0,1
2497,1,0
2498,1,0


In [10]:
#making a new dataframe w/all relevent variables for model 
df=pd.DataFrame()
df['county'] = texas_results['County Name']
df['county_pop'] = texas_results[' County Population ']

df['2016_votes'] = texas_results[' Total ']
df['clinton_votes'] = texas_results[' Clinton ']
df['clinton_percent'] = texas_results['% Clinton']
df['trump_votes'] = texas_results[' Trump ']
df['trump_percent'] = texas_results['% Trump']
df['2012_votes'] = texas_results[' 2012 Total Votes ']
df['obama_votes'] = texas_results[' Obama ']
df['obama_percent'] = texas_results['% Obama']
df['romney_votes'] = texas_results[' Romney ']
df['romney_percent'] = texas_results['% Romney']


In [11]:
df.reset_index()

Unnamed: 0,index,county,county_pop,2016_votes,clinton_votes,clinton_percent,trump_votes,trump_percent,2012_votes,obama_votes,obama_percent,romney_votes,romney_percent
0,2494,Anderson County,57580,16887,3358,19.89%,13165,77.96%,16167,3796,23.48%,12235,75.68%
1,2495,Andrews County,18105,4926,836,16.97%,3925,79.68%,4476,794,17.74%,3636,81.23%
2,2496,Angelina County,88255,29870,7538,25.24%,21666,72.53%,28403,7833,27.58%,20301,71.47%
3,2497,Aransas County,25350,10467,2458,23.48%,7730,73.85%,9646,2703,28.02%,6829,70.80%
4,2498,Archer County,8715,4269,394,9.23%,3785,88.66%,4163,525,12.61%,3599,86.45%
5,2499,Armstrong County,1947,1017,70,6.88%,924,90.86%,935,98,10.48%,828,88.56%
6,2500,Atascosa County,48435,13605,4635,34.07%,8598,63.20%,12704,5128,40.37%,7451,58.65%
7,2501,Austin County,29563,12255,2319,18.92%,9637,78.64%,11628,2252,19.37%,9260,79.64%
8,2502,Bailey County,7210,1784,397,22.25%,1343,75.28%,1813,466,25.70%,1336,73.69%
9,2503,Bandera County,21269,10213,1726,16.90%,8159,79.89%,9434,1859,19.71%,7423,78.68%


In [12]:
#add our dummy variables
df = pd.concat([df, dummy], axis=1)
df.head()
df.shape

(254, 14)

In [13]:
#add demographics of counties as columns to a new df 
race=pd.DataFrame()
race['County'] = demo_2015['County']
race['female_percent'] = demo_2015['Total Female']/demo_2015['Total']
race['male_percent'] = demo_2015['Total Male']/demo_2015['Total']
race['white_percent'] = demo_2015['Anglo Total']/demo_2015['Total']
race['black_percent'] = demo_2015['Black Total']/demo_2015['Total']
race['hispanic_percent'] = demo_2015['Hispanic Total']/demo_2015['Total']
race['other_race_percent'] = demo_2015['Other Total']/demo_2015['Total']
race.head()

Unnamed: 0,County,female_percent,male_percent,white_percent,black_percent,hispanic_percent,other_race_percent
87,ANDERSON COUNTY,0.392405,0.607595,0.590687,0.20896,0.173683,0.02667
174,ANDREWS COUNTY,0.499777,0.500223,0.441685,0.01317,0.524944,0.020201
261,ANGELINA COUNTY,0.505826,0.494174,0.602193,0.148573,0.222301,0.026933
348,ARANSAS COUNTY,0.507009,0.492991,0.680127,0.010713,0.270012,0.039148
435,ARCHER COUNTY,0.503612,0.496388,0.890324,0.003065,0.088222,0.018389


In [14]:
#dataframe for 2011 demographics 
race_2011=pd.DataFrame()
race_2011['county'] = demo_2011['County']
race_2011['female_percent'] = demo_2011['Total Female']/demo_2011['Total']
race_2011['male_percent'] = demo_2011['Total Male']/demo_2011['Total']
race_2011['white_percent'] = demo_2011['Anglo Total']/demo_2011['Total']
race_2011['black_percent'] = demo_2011['Black Total']/demo_2011['Total']
race_2011['hispanic_percent'] = demo_2011['Hispanic Total']/demo_2011['Total']
race_2011['other_race_percent'] = demo_2011['Other Total']/demo_2011['Total']
race_2011.head()

Unnamed: 0,county,female_percent,male_percent,white_percent,black_percent,hispanic_percent,other_race_percent
0,STATE OF TEXAS,0.503706,0.496294,0.445665,0.114774,0.382219,0.057341
87,ANDERSON COUNTY,0.392971,0.607029,0.608834,0.208456,0.162344,0.020365
174,ANDREWS COUNTY,0.500685,0.499315,0.47263,0.013375,0.493508,0.020487
261,ANGELINA COUNTY,0.509296,0.490704,0.626216,0.148253,0.203018,0.022513
348,ARANSAS COUNTY,0.504414,0.495586,0.700823,0.0108,0.250793,0.037585


In [15]:
#need to drop the first row because we don't need total texas numbers
race_2011=race_2011.drop(race_2011.index[0])

In [16]:
race.shape

(254, 7)

In [17]:
race_2011.shape

(254, 7)

In [18]:
#need to make county lowercase in both dataframes so we can merge on county 
race_2011['county'] = race_2011["county"].map(lambda x: x.lower())
race['County'] = race["County"].map(lambda x: x.lower())
df['county'] = df['county'].map(lambda x: x.lower())

In [19]:
race[race['County']=='de witt county']

Unnamed: 0,County,female_percent,male_percent,white_percent,black_percent,hispanic_percent,other_race_percent
5394,de witt county,0.474369,0.525631,0.535097,0.089364,0.355469,0.020069


In [20]:
#matching spellings of dewitt county
race.replace(to_replace = 'de witt county', value = 'dewitt county', regex=True, inplace=True)
race_2011.replace(to_replace = 'de witt county', value = 'dewitt county', regex=True, inplace=True)

In [21]:
#merge w/original df 
new_df = df.merge(race, left_on='county', right_on='County')

In [22]:
new_df.head()

Unnamed: 0,county,county_pop,2016_votes,clinton_votes,clinton_percent,trump_votes,trump_percent,2012_votes,obama_votes,obama_percent,...,romney_percent,Metropolitan Statistical Area,Micropolitan Statistical Area,County,female_percent,male_percent,white_percent,black_percent,hispanic_percent,other_race_percent
0,anderson county,57580,16887,3358,19.89%,13165,77.96%,16167,3796,23.48%,...,75.68%,0,1,anderson county,0.392405,0.607595,0.590687,0.20896,0.173683,0.02667
1,andrews county,18105,4926,836,16.97%,3925,79.68%,4476,794,17.74%,...,81.23%,0,1,andrews county,0.499777,0.500223,0.441685,0.01317,0.524944,0.020201
2,angelina county,88255,29870,7538,25.24%,21666,72.53%,28403,7833,27.58%,...,71.47%,0,1,angelina county,0.505826,0.494174,0.602193,0.148573,0.222301,0.026933
3,aransas county,25350,10467,2458,23.48%,7730,73.85%,9646,2703,28.02%,...,70.80%,1,0,aransas county,0.507009,0.492991,0.680127,0.010713,0.270012,0.039148
4,archer county,8715,4269,394,9.23%,3785,88.66%,4163,525,12.61%,...,86.45%,1,0,archer county,0.503612,0.496388,0.890324,0.003065,0.088222,0.018389


In [23]:
new_df.shape

(254, 21)

In [24]:
#function to convert percentages to decimals
def decimal(x):
    return float(x.strip('%'))/100

In [25]:
#apply function to all columns with percentages 
new_df['clinton_percent']=new_df['clinton_percent'].map(decimal)
new_df['trump_percent']=new_df['trump_percent'].map(decimal)
new_df['obama_percent']=new_df['obama_percent'].map(decimal)
new_df['romney_percent']=new_df['romney_percent'].map(decimal)

In [26]:
#reomving commas from columns needed for model 
new_df['2012_votes'] = new_df['2012_votes'].map(lambda x: x.replace(',',''))
new_df['county_pop'] = new_df['county_pop'].map(lambda x: x.replace(',',''))
new_df['2016_votes'] = new_df['2016_votes'].map(lambda x: x.replace(',',''))

In [27]:
new_df.head()

Unnamed: 0,county,county_pop,2016_votes,clinton_votes,clinton_percent,trump_votes,trump_percent,2012_votes,obama_votes,obama_percent,...,romney_percent,Metropolitan Statistical Area,Micropolitan Statistical Area,County,female_percent,male_percent,white_percent,black_percent,hispanic_percent,other_race_percent
0,anderson county,57580,16887,3358,0.1989,13165,0.7796,16167,3796,0.2348,...,0.7568,0,1,anderson county,0.392405,0.607595,0.590687,0.20896,0.173683,0.02667
1,andrews county,18105,4926,836,0.1697,3925,0.7968,4476,794,0.1774,...,0.8123,0,1,andrews county,0.499777,0.500223,0.441685,0.01317,0.524944,0.020201
2,angelina county,88255,29870,7538,0.2524,21666,0.7253,28403,7833,0.2758,...,0.7147,0,1,angelina county,0.505826,0.494174,0.602193,0.148573,0.222301,0.026933
3,aransas county,25350,10467,2458,0.2348,7730,0.7385,9646,2703,0.2802,...,0.708,1,0,aransas county,0.507009,0.492991,0.680127,0.010713,0.270012,0.039148
4,archer county,8715,4269,394,0.0923,3785,0.8866,4163,525,0.1261,...,0.8645,1,0,archer county,0.503612,0.496388,0.890324,0.003065,0.088222,0.018389


In [None]:
#export csv
new_df.to_csv('texas_data.csv', encoding='utf-8')

In [None]:
#export csv 
race_2011.to_csv('texas_data_2011.csv', encoding='utf-8')

## Lasso Regularization&K Fold Cross Validation

In [28]:
#make response variable --> voting republican is 1, voting democrat is 0
  
new_df['Y'] = (new_df['trump_percent'] > new_df['clinton_percent']).astype(int)
    
new_df['Y'].value_counts()

1    227
0     27
Name: Y, dtype: int64

Note here: 227 of Texas counties voted Republican in the 2016 election, versus 27 that voted Democrat 

In [29]:
#set X & Y
x = new_df.drop(['county', 'clinton_votes', 'trump_votes', 'obama_votes', 'romney_votes', 'County', 'Y'], axis=1)
y = new_df['Y']

#split into test & train
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.20, random_state=50)

In [37]:
# K Fold CV using 5 folds 
n_folds = 5
k_fold = KFold(n_folds)
lasso_alphas =  10**np.linspace(10,-6,100)*0.5               
lasso_avg_mae = {}

#finding best alpha (regularization coefficient) to minimize mean abs error 
for alpha in lasso_alphas:
    lasso = linear_model.Lasso(alpha=alpha,tol=.1)
    avg_mae = 0
    for train, test in k_fold.split(x_train):
        lasso.fit(x_train.iloc[train], y_train.iloc[train])
        avg_mae = avg_mae + mean_absolute_error(y_train.iloc[test], np.exp(lasso.predict(x_train.iloc[test]))) 
    lasso_avg_mae[alpha] = avg_mae / n_folds
best_alpha_lasso = min(lasso_avg_mae, key=lasso_avg_mae.get)

print("Best lasso alpha: {}".format(best_alpha_lasso))

Best lasso alpha: 33670.75328875407


In [38]:
#lasso CV 
lassocv = LassoCV(alphas=None, cv=10, max_iter=100000)
lassocv.fit(x_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=10, eps=0.001, fit_intercept=True,
    max_iter=100000, n_alphas=100, n_jobs=1, normalize=False,
    positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

In [72]:
lasso.set_params(alpha=best_alpha_lasso)
lasso.fit(x_train, y_train)
mean_absolute_error(y_test, np.exp(lasso.predict(x_test)))

1.6322675517426248

In [73]:
lassocv.alpha_

72.52572155364

In [42]:
lasso.coef_

array([-3.06989121e-06,  3.84351268e-06, -0.00000000e+00,  0.00000000e+00,
        5.49309071e-06, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00])

Lasso regression shrank a lot of the coefficients to 0, leaving only population and # of votes as variables in the model.  Because I'm interested in the demographic variables, I went ahead with regular MLR below.

## Multiple Linear Regression Model

In [44]:
#fit multiple linear regression model 

lm = linear_model.LinearRegression()
result = lm.fit(x_train, y_train)
y_train_pred = lm.predict(x_train)
y_test_pred = lm.predict(x_test)

In [45]:
#evaluation metrics 
from sklearn.metrics import mean_squared_error

#MSE X_train 
print('The mean squared error in sample is ', mean_squared_error(y_train, y_train_pred))

#MSE X_test
print('The mean squared error out of sample is ', mean_squared_error(y_test, y_test_pred))

The mean squared error in sample is  0.01837672089459872
The mean squared error out of sample is  0.02339484001020101


In [46]:
#coefficients
lm.coef_

array([-1.49440505e-06,  2.79668473e-06, -5.36758573e+00, -3.86030684e+00,
        1.78460223e-06, -1.25386667e+01, -1.21094005e+01,  7.32627364e-02,
       -1.08319790e-02,  4.34009260e-01, -4.34009260e-01, -6.11393454e-02,
        1.80349341e+00,  2.96133637e-01, -2.03848770e+00])

In [71]:
#r-squared coefficient 
r2_score(y_train, y_train_pred)

0.7833849281048287

## Prediction for 2018 Midterm 

In [47]:
#read in data for 2018 projected demographics
demo_2018 = pd.read_csv('2018allcntyagegrp.csv')

In [48]:
demo_2018.head()

Unnamed: 0,FIPS,area_name,year,age_group,total,total_male,total_female,nh_white_total,nh_white_male,nh_white_female,...,nh_black_female,hispanic_total,hispanic_male,hispanic_female,nh_asian_total,nh_asian_male,nh_asian_female,nh_other_total,nh_other_male,nh_other_female
0,0,State of Texas,2010,ALL,25145561,12472280,12673281,11397345,5632646,5764699,...,1494415,9460921,4763753,4697168,948426,460855,487571,452044,222616,229428
1,0,State of Texas,2010,<18,6865824,3510249,3355575,2322661,1193025,1129636,...,397169,3317777,1693278,1624499,231458,117576,113882,183385,92996,90389
2,0,State of Texas,2010,18-24,2572969,1321619,1251350,994473,506474,487999,...,161682,1112368,580812,531556,92536,48072,44464,50155,24506,25649
3,0,State of Texas,2010,25-44,7071855,3544857,3526998,2934239,1478661,1455578,...,438116,2844435,1445962,1398473,339915,163030,176885,111626,53680,57946
4,0,State of Texas,2010,45-64,6033027,2959891,3073136,3385907,1674984,1710923,...,365896,1653420,815272,838148,219216,102787,116429,81653,39913,41740


In [49]:
#need to filter out State of Texas, all years except 2018 and age group ALL
demo_2018 = demo_2018[demo_2018['age_group'] == 'ALL']
demo_2018 = demo_2018[demo_2018['year']==2018]
demo_2018=demo_2018.drop(demo_2018.index[0])

In [51]:
demo_2018.shape

(254, 22)

In [52]:
demo_2018.head()

Unnamed: 0,FIPS,area_name,year,age_group,total,total_male,total_female,nh_white_total,nh_white_male,nh_white_female,...,nh_black_female,hispanic_total,hispanic_male,hispanic_female,nh_asian_total,nh_asian_male,nh_asian_female,nh_other_total,nh_other_male,nh_other_female
294,1,Anderson County,2018,ALL,58293,35999,22294,34179,19249,14930,...,3100,10658,7092,3566,320,158,162,1048,512,536
540,3,Andrews County,2018,ALL,20388,10441,9947,7540,3797,3743,...,112,12263,6354,5909,92,43,49,267,133,134
786,5,Angelina County,2018,ALL,89861,44098,45763,53784,26277,27507,...,7141,20211,10328,9883,855,389,466,1475,709,766
1032,7,Aransas County,2018,ALL,26700,13304,13396,17757,8663,9094,...,127,7700,4005,3695,472,234,238,481,239,242
1278,9,Archer County,2018,ALL,8452,4185,4267,7363,3598,3765,...,19,856,480,376,18,8,10,171,74,97


In [53]:
#matching spellings of de witt county for merging 
demo_2018.replace(to_replace = 'De Witt County', value = 'dewitt county', regex=True, inplace=True)

In [57]:
#merge with new_df to get 
demo_2018['county'] = demo_2018['area_name'].map(lambda x: x.lower())
df_2018 = demo_2018.merge(df, left_on='county', right_on='county')

In [58]:
df_2018.columns

Index(['FIPS', 'area_name', 'year', 'age_group', 'total', 'total_male',
       'total_female', 'nh_white_total', 'nh_white_male', 'nh_white_female',
       'nh_black_total', 'nh_black_male', 'nh_black_female', 'hispanic_total',
       'hispanic_male', 'hispanic_female', 'nh_asian_total', 'nh_asian_male',
       'nh_asian_female', 'nh_other_total', 'nh_other_male', 'nh_other_female',
       'county', 'county_pop', '2016_votes', 'clinton_votes',
       'clinton_percent', 'trump_votes', 'trump_percent', '2012_votes',
       'obama_votes', 'obama_percent', 'romney_votes', 'romney_percent',
       'Metropolitan Statistical Area', 'Micropolitan Statistical Area'],
      dtype='object')

In [59]:
#new dataframe for X input to predict based on updated demographics 
Xnew = pd.DataFrame()
Xnew['county_pop'] = df_2018['total']
Xnew['2016_votes'] = df_2018['2016_votes']
Xnew['clinton_percent'] = df_2018['clinton_percent']
Xnew['trump_percent'] = df_2018['trump_percent']
Xnew['2012_votes'] = df_2018['2012_votes']
Xnew['obama_percent'] = df_2018['obama_percent']
Xnew['romney_percent'] = df_2018['romney_percent']
Xnew['Metropolitan Statistical Area'] = df_2018['Metropolitan Statistical Area']
Xnew['Micropolitan Statistical Area'] = df_2018['Micropolitan Statistical Area']
Xnew['female_percent'] = df_2018['total_female']/df_2018['total']
Xnew['male_percent'] = df_2018['total_male']/df_2018['total']
Xnew['white_percent'] =  df_2018['nh_white_total']/df_2018['total']
Xnew['black_percent'] = df_2018['nh_black_total']/df_2018['total']
Xnew['hispanic_percent'] = df_2018['hispanic_total']/df_2018['total']
Xnew['other_race_percent'] = (df_2018['nh_other_total'] + df_2018['nh_asian_total'])/df_2018['total']

In [60]:
#apply function to remove %
Xnew['clinton_percent']=Xnew['clinton_percent'].map(decimal)
Xnew['trump_percent']=Xnew['trump_percent'].map(decimal)
Xnew['obama_percent']=Xnew['obama_percent'].map(decimal)
Xnew['romney_percent']=Xnew['romney_percent'].map(decimal)

In [61]:
#reomving commas from columns needed for model 
Xnew['2012_votes'] = Xnew['2012_votes'].map(lambda x: x.replace(',',''))
Xnew['2016_votes'] = Xnew['2016_votes'].map(lambda x: x.replace(',',''))

In [62]:
Xnew.head()

Unnamed: 0,county_pop,2016_votes,clinton_percent,trump_percent,2012_votes,obama_percent,romney_percent,Metropolitan Statistical Area,Micropolitan Statistical Area,female_percent,male_percent,white_percent,black_percent,hispanic_percent,other_race_percent
0,58293,16887,0.1989,0.7796,16167,0.2348,0.7568,0,1,0.382447,0.617553,0.586331,0.207366,0.182835,0.023468
1,20388,4926,0.1697,0.7968,4476,0.1774,0.8123,0,1,0.487885,0.512115,0.369825,0.011085,0.601481,0.017608
2,89861,29870,0.2524,0.7253,28403,0.2758,0.7147,0,1,0.509264,0.490736,0.598524,0.150633,0.224914,0.025929
3,26700,10467,0.2348,0.7385,9646,0.2802,0.708,1,0,0.501723,0.498277,0.665056,0.010861,0.28839,0.035693
4,8452,4269,0.0923,0.8866,4163,0.1261,0.8645,1,0,0.504851,0.495149,0.871155,0.005206,0.101278,0.022362


In [98]:
Xnew.to_csv('demo_2018.csv', encoding='utf-8')

In [63]:
x.columns

Index(['county_pop', '2016_votes', 'clinton_percent', 'trump_percent',
       '2012_votes', 'obama_percent', 'romney_percent',
       'Metropolitan Statistical Area', 'Micropolitan Statistical Area',
       'female_percent', 'male_percent', 'white_percent', 'black_percent',
       'hispanic_percent', 'other_race_percent'],
      dtype='object')

In [64]:
Xnew.columns

Index(['county_pop', '2016_votes', 'clinton_percent', 'trump_percent',
       '2012_votes', 'obama_percent', 'romney_percent',
       'Metropolitan Statistical Area', 'Micropolitan Statistical Area',
       'female_percent', 'male_percent', 'white_percent', 'black_percent',
       'hispanic_percent', 'other_race_percent'],
      dtype='object')

In [65]:
#predict y's for new data points
ynew = lm.predict(Xnew)

In [66]:
#round everything to get 1's or 0's 
ynew = abs(ynew.round(decimals=0))

In [67]:
unique, counts = numpy.unique(ynew, return_counts=True)
dict(zip(unique, counts))

{0.0: 23, 1.0: 231}

### After updated demographic variables for 2018, my model predicts that 231 Texas counties will vote Republican and 23 counties will vote Democrat 

###  Actual result of 2018 Senate midtern election: Texas 222 counties voted Republican (Cruz), 32 counties voted Democrat (O'Rourke)