In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

In [2]:
# Read the csv file into a pandas DataFrame for transforming (xform all the non-numeric data for ML prep)
xform_pd = pd.read_csv('Voter_final_clean.csv')
xform_pd.head(2)

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted
0,2000,Alabama,59.00%,1912592,3241682,2411000,15.30%,489573
1,2004,Alabama,57.40%,1890317,3292608,2418000,26.00%,863695


In [3]:
from sklearn.preprocessing import LabelEncoder 
labelencoder= LabelEncoder() #initializing an object of class LabelEncoder
xform_pd['State'] = labelencoder.fit_transform(xform_pd['State']) #fitting and transforming the desired categorical column.
xform_pd.head(2)

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted
0,2000,0,59.00%,1912592,3241682,2411000,15.30%,489573
1,2004,0,57.40%,1890317,3292608,2418000,26.00%,863695


In [4]:
labelencoder.inverse_transform(xform_pd['State'])

array(['Alabama', 'Alabama', 'Alabama', 'Alabama', 'Alabama', 'Alaska',
       'Alaska', 'Alaska', 'Alaska', 'Alaska', 'Arizona', 'Arizona',
       'Arizona', 'Arizona', 'Arizona', 'Arkansas', 'Arkansas',
       'Arkansas', 'Arkansas', 'Arkansas', 'California', 'California',
       'California', 'California', 'California', 'Colorado', 'Colorado',
       'Colorado', 'Colorado', 'Colorado', 'Connecticut', 'Connecticut',
       'Connecticut', 'Connecticut', 'Connecticut', 'Delaware',
       'Delaware', 'Delaware', 'Delaware', 'Delaware',
       'DistrictOfColumbia', 'DistrictOfColumbia', 'DistrictOfColumbia',
       'DistrictOfColumbia', 'DistrictOfColumbia', 'Florida', 'Florida',
       'Florida', 'Florida', 'Florida', 'Georgia', 'Georgia', 'Georgia',
       'Georgia', 'Georgia', 'Hawaii', 'Hawaii', 'Hawaii', 'Hawaii',
       'Hawaii', 'Idaho', 'Idaho', 'Idaho', 'Idaho', 'Idaho', 'Illinois',
       'Illinois', 'Illinois', 'Illinois', 'Illinois', 'Indiana',
       'Indiana', 'Indiana', 'I

In [5]:
#add new column
xform_pd["Re-election"] = 0 

In [6]:
xform_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Year                           255 non-null    int64 
 1   State                          255 non-null    int32 
 2   General:VEP Turnout Rate       255 non-null    object
 3   General:Total Ballots Counted  255 non-null    object
 4   VEP                            255 non-null    object
 5   Total registered               255 non-null    object
 6   Primary:VEP Turnout Rate       255 non-null    object
 7   Primary:Total Ballots Counted  255 non-null    object
 8   Re-election                    255 non-null    int64 
dtypes: int32(1), int64(2), object(6)
memory usage: 17.1+ KB


In [7]:
# label election year 0 and relection year 1
xform_pd.loc[xform_pd["Year"].isin([2004,2012]),"Re-election"] = 1
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.00%,1912592,3241682,2411000,15.30%,489573,0
1,2004,0,57.40%,1890317,3292608,2418000,26.00%,863695,1
2,2008,0,61.00%,2105622,3454510,2438000,32.20%,1088835,0
3,2012,0,59.00%,2088138,3539217,2556000,24.80%,871025,1
4,2016,0,59.10%,2134061,3609447,2526000,35.00%,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.40%,221685,361078,240000,2.00%,7096,0
251,2004,50,66.30%,245789,370785,265000,2.00%,7545,1
252,2008,50,63.10%,256035,405732,270000,2.60%,9953,0
253,2012,50,59.00%,250701,425142,268000,0.30%,1308,1


In [8]:
#convert data type, remove % and replace
# https://www.geeksforgeeks.org/python-pandas-series-astype-to-convert-data-type-of-series/
#      data["Salary"]= data["Salary"].astype(int) 
#      data["Number"]= data["Number"].astype(str)

xform_pd["General:VEP Turnout Rate"]= xform_pd["General:VEP Turnout Rate"].str.replace("%","").astype(float) 
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.0,1912592,3241682,2411000,15.30%,489573,0
1,2004,0,57.4,1890317,3292608,2418000,26.00%,863695,1
2,2008,0,61.0,2105622,3454510,2438000,32.20%,1088835,0
3,2012,0,59.0,2088138,3539217,2556000,24.80%,871025,1
4,2016,0,59.1,2134061,3609447,2526000,35.00%,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.4,221685,361078,240000,2.00%,7096,0
251,2004,50,66.3,245789,370785,265000,2.00%,7545,1
252,2008,50,63.1,256035,405732,270000,2.60%,9953,0
253,2012,50,59.0,250701,425142,268000,0.30%,1308,1


In [9]:
xform_pd["Primary:VEP Turnout Rate"]= xform_pd["Primary:VEP Turnout Rate"].str.replace("%","").astype(float) 
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.0,1912592,3241682,2411000,15.3,489573,0
1,2004,0,57.4,1890317,3292608,2418000,26.0,863695,1
2,2008,0,61.0,2105622,3454510,2438000,32.2,1088835,0
3,2012,0,59.0,2088138,3539217,2556000,24.8,871025,1
4,2016,0,59.1,2134061,3609447,2526000,35.0,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.4,221685,361078,240000,2.0,7096,0
251,2004,50,66.3,245789,370785,265000,2.0,7545,1
252,2008,50,63.1,256035,405732,270000,2.6,9953,0
253,2012,50,59.0,250701,425142,268000,0.3,1308,1


In [10]:
xform_pd["General:Total Ballots Counted"]= xform_pd["General:Total Ballots Counted"].str.replace(",","").astype(int) 
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.0,1912592,3241682,2411000,15.3,489573,0
1,2004,0,57.4,1890317,3292608,2418000,26.0,863695,1
2,2008,0,61.0,2105622,3454510,2438000,32.2,1088835,0
3,2012,0,59.0,2088138,3539217,2556000,24.8,871025,1
4,2016,0,59.1,2134061,3609447,2526000,35.0,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.4,221685,361078,240000,2.0,7096,0
251,2004,50,66.3,245789,370785,265000,2.0,7545,1
252,2008,50,63.1,256035,405732,270000,2.6,9953,0
253,2012,50,59.0,250701,425142,268000,0.3,1308,1


In [11]:
xform_pd["VEP"]= xform_pd["VEP"].str.replace(",","").astype(int) 
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.0,1912592,3241682,2411000,15.3,489573,0
1,2004,0,57.4,1890317,3292608,2418000,26.0,863695,1
2,2008,0,61.0,2105622,3454510,2438000,32.2,1088835,0
3,2012,0,59.0,2088138,3539217,2556000,24.8,871025,1
4,2016,0,59.1,2134061,3609447,2526000,35.0,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.4,221685,361078,240000,2.0,7096,0
251,2004,50,66.3,245789,370785,265000,2.0,7545,1
252,2008,50,63.1,256035,405732,270000,2.6,9953,0
253,2012,50,59.0,250701,425142,268000,0.3,1308,1


In [12]:
xform_pd["Total registered"]= xform_pd["Total registered"].str.replace(",","").astype(int) 
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.0,1912592,3241682,2411000,15.3,489573,0
1,2004,0,57.4,1890317,3292608,2418000,26.0,863695,1
2,2008,0,61.0,2105622,3454510,2438000,32.2,1088835,0
3,2012,0,59.0,2088138,3539217,2556000,24.8,871025,1
4,2016,0,59.1,2134061,3609447,2526000,35.0,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.4,221685,361078,240000,2.0,7096,0
251,2004,50,66.3,245789,370785,265000,2.0,7545,1
252,2008,50,63.1,256035,405732,270000,2.6,9953,0
253,2012,50,59.0,250701,425142,268000,0.3,1308,1


In [13]:
xform_pd["Primary:Total Ballots Counted"]= xform_pd["Primary:Total Ballots Counted"].str.replace(",","").astype(int) 
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.0,1912592,3241682,2411000,15.3,489573,0
1,2004,0,57.4,1890317,3292608,2418000,26.0,863695,1
2,2008,0,61.0,2105622,3454510,2438000,32.2,1088835,0
3,2012,0,59.0,2088138,3539217,2556000,24.8,871025,1
4,2016,0,59.1,2134061,3609447,2526000,35.0,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.4,221685,361078,240000,2.0,7096,0
251,2004,50,66.3,245789,370785,265000,2.0,7545,1
252,2008,50,63.1,256035,405732,270000,2.6,9953,0
253,2012,50,59.0,250701,425142,268000,0.3,1308,1


In [14]:
# prepare transformed data file for Machine Learning
con_xform_pd = xform_pd

In [17]:
# https://github.com/pandas-profiling/pandas-profiling, 
#       used "pip install pandas-profiling[notebook]" -- failed with error
#       used "conda install -c conda-forge pandas-profiling"  -- successful
profile = ProfileReport(con_xform_pd, title='Pandas Profiling Report', explorative=True)

TypeError: _plot_histogram() got an unexpected keyword argument 'title'

In [None]:
profile.to_file("your_report_SVR.html")

In [None]:
ml_data = con_xform_pd.drop(columns= ["General:VEP Turnout Rate", "Primary:VEP Turnout Rate"])
ml_data.head(2)

In [None]:
ml_data.to_csv("ml_data.csv", index=False)

In [None]:
# Assign X (data) and y (target)

### BEGIN SOLUTION
r = ml_data['Re-election']
X = ml_data.drop(columns=['General:Total Ballots Counted'])
y = (ml_data["General:Total Ballots Counted"]/1000000).values.reshape(-1, 1)
print(X.shape, y.shape, r.shape)
### END SOLUTION

In [None]:
X.head(2)

In [None]:
# Split the data into training and testing

### BEGIN SOLUTION
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
### END SOLUTION

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
### BEGIN SOLUTION
X_scaler = StandardScaler().fit(X_train[["VEP","Total registered","Primary:Total Ballots Counted"]])

### END SOLUTION
X_scaler 

In [None]:
# Transform the training and testing data using the X_scaler and y_scaler models

### BEGIN SOLUTION
X_train_scaled = X_scaler.transform(X_train[["VEP","Total registered","Primary:Total Ballots Counted"]])
X_test_scaled = X_scaler.transform(X_test[["VEP","Total registered","Primary:Total Ballots Counted"]])

### END SOLUTION
X_train_scaled

In [None]:
X_train[["VEP","Total registered","Primary:Total Ballots Counted"]] = X_train_scaled
X_test[["VEP","Total registered","Primary:Total Ballots Counted"]] = X_test_scaled

In [None]:
X_test

In [None]:
# Create a LinearRegression model and fit it to the scaled training data

### BEGIN SOLUTION
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
### END SOLUTION

In [None]:
# Used X_test_scaled, y_test_scaled, and model.predict(X_test_scaled) to calculate MSE and R2

### BEGIN SOLUTION
from sklearn.metrics import mean_squared_error

predictions = model.predict(X_test)

MSE = mean_squared_error(y_test, predictions)
r2 = model.score(X_test, y_test)
### END SOLUTION

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# Make predictions using a fitted model
# Plot the difference between the model predicted values and actual y values, versus the model predicted values
# Hint: You can predict values of X training and testing data using the model.predict() method on a fitted model

### BEGIN SOLUTION
predictions = model.predict(X_test)
plt.scatter(model.predict(X_train), model.predict(X_train) - y_train, c="blue", label="Training Data")
plt.scatter(model.predict(X_test), model.predict(X_test) - y_test, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test.min(), xmax=y_test.max())
plt.title("Residual Plot")
plt.show()
### END SOLUTION

In [None]:
X_test_predict = X_test

In [None]:
X_test_predict["Predictions"] = model.predict(X_test)

In [None]:
X_test_predict["Actuals"] = y_test

In [None]:
X_test_predict["Residual Testing"] = X_test_predict["Predictions"]-X_test_predict["Actuals"] 

In [None]:
X_test_predict.head(2)

In [None]:
 X_test_predict_sort = X_test_predict.sort_values(by=["Residual Testing"], ascending=False)
 X_test_predict_sort.head(5)

In [None]:
X_test_train = model.predict(X_train) - y_train
X_test_train