In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

In [2]:
# Read the csv file into a pandas DataFrame for transforming (xform all the non-numeric data for ML prep)
xform_pd = pd.read_csv('Voter_final_clean.csv')
xform_pd.head(2)

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted
0,2000,Alabama,59.00%,1912592,3241682,2411000,15.30%,489573
1,2004,Alabama,57.40%,1890317,3292608,2418000,26.00%,863695


In [3]:
from sklearn.preprocessing import LabelEncoder 
labelencoder= LabelEncoder() #initializing an object of class LabelEncoder
xform_pd['State'] = labelencoder.fit_transform(xform_pd['State']) #fitting and transforming the desired categorical column.
xform_pd.head(2)

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted
0,2000,0,59.00%,1912592,3241682,2411000,15.30%,489573
1,2004,0,57.40%,1890317,3292608,2418000,26.00%,863695


In [4]:
labelencoder.inverse_transform(xform_pd['State'])

array(['Alabama', 'Alabama', 'Alabama', 'Alabama', 'Alabama', 'Alaska',
       'Alaska', 'Alaska', 'Alaska', 'Alaska', 'Arizona', 'Arizona',
       'Arizona', 'Arizona', 'Arizona', 'Arkansas', 'Arkansas',
       'Arkansas', 'Arkansas', 'Arkansas', 'California', 'California',
       'California', 'California', 'California', 'Colorado', 'Colorado',
       'Colorado', 'Colorado', 'Colorado', 'Connecticut', 'Connecticut',
       'Connecticut', 'Connecticut', 'Connecticut', 'Delaware',
       'Delaware', 'Delaware', 'Delaware', 'Delaware',
       'DistrictOfColumbia', 'DistrictOfColumbia', 'DistrictOfColumbia',
       'DistrictOfColumbia', 'DistrictOfColumbia', 'Florida', 'Florida',
       'Florida', 'Florida', 'Florida', 'Georgia', 'Georgia', 'Georgia',
       'Georgia', 'Georgia', 'Hawaii', 'Hawaii', 'Hawaii', 'Hawaii',
       'Hawaii', 'Idaho', 'Idaho', 'Idaho', 'Idaho', 'Idaho', 'Illinois',
       'Illinois', 'Illinois', 'Illinois', 'Illinois', 'Indiana',
       'Indiana', 'Indiana', 'I

In [5]:
#add new column
xform_pd["Re-election"] = 0 

In [6]:
xform_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Year                           255 non-null    int64 
 1   State                          255 non-null    int32 
 2   General:VEP Turnout Rate       255 non-null    object
 3   General:Total Ballots Counted  255 non-null    object
 4   VEP                            255 non-null    object
 5   Total registered               255 non-null    object
 6   Primary:VEP Turnout Rate       255 non-null    object
 7   Primary:Total Ballots Counted  255 non-null    object
 8   Re-election                    255 non-null    int64 
dtypes: int32(1), int64(2), object(6)
memory usage: 17.1+ KB


In [7]:
# label election year 0 and relection year 1
xform_pd.loc[xform_pd["Year"].isin([2004,2012]),"Re-election"] = 1
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.00%,1912592,3241682,2411000,15.30%,489573,0
1,2004,0,57.40%,1890317,3292608,2418000,26.00%,863695,1
2,2008,0,61.00%,2105622,3454510,2438000,32.20%,1088835,0
3,2012,0,59.00%,2088138,3539217,2556000,24.80%,871025,1
4,2016,0,59.10%,2134061,3609447,2526000,35.00%,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.40%,221685,361078,240000,2.00%,7096,0
251,2004,50,66.30%,245789,370785,265000,2.00%,7545,1
252,2008,50,63.10%,256035,405732,270000,2.60%,9953,0
253,2012,50,59.00%,250701,425142,268000,0.30%,1308,1


In [8]:
#convert data type, remove % and replace
# https://www.geeksforgeeks.org/python-pandas-series-astype-to-convert-data-type-of-series/
#      data["Salary"]= data["Salary"].astype(int) 
#      data["Number"]= data["Number"].astype(str)

xform_pd["General:VEP Turnout Rate"]= xform_pd["General:VEP Turnout Rate"].str.replace("%","").astype(float) 
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.0,1912592,3241682,2411000,15.30%,489573,0
1,2004,0,57.4,1890317,3292608,2418000,26.00%,863695,1
2,2008,0,61.0,2105622,3454510,2438000,32.20%,1088835,0
3,2012,0,59.0,2088138,3539217,2556000,24.80%,871025,1
4,2016,0,59.1,2134061,3609447,2526000,35.00%,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.4,221685,361078,240000,2.00%,7096,0
251,2004,50,66.3,245789,370785,265000,2.00%,7545,1
252,2008,50,63.1,256035,405732,270000,2.60%,9953,0
253,2012,50,59.0,250701,425142,268000,0.30%,1308,1


In [9]:
xform_pd["Primary:VEP Turnout Rate"]= xform_pd["Primary:VEP Turnout Rate"].str.replace("%","").astype(float) 
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.0,1912592,3241682,2411000,15.3,489573,0
1,2004,0,57.4,1890317,3292608,2418000,26.0,863695,1
2,2008,0,61.0,2105622,3454510,2438000,32.2,1088835,0
3,2012,0,59.0,2088138,3539217,2556000,24.8,871025,1
4,2016,0,59.1,2134061,3609447,2526000,35.0,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.4,221685,361078,240000,2.0,7096,0
251,2004,50,66.3,245789,370785,265000,2.0,7545,1
252,2008,50,63.1,256035,405732,270000,2.6,9953,0
253,2012,50,59.0,250701,425142,268000,0.3,1308,1


In [10]:
xform_pd["General:Total Ballots Counted"]= xform_pd["General:Total Ballots Counted"].str.replace(",","").astype(int) 
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.0,1912592,3241682,2411000,15.3,489573,0
1,2004,0,57.4,1890317,3292608,2418000,26.0,863695,1
2,2008,0,61.0,2105622,3454510,2438000,32.2,1088835,0
3,2012,0,59.0,2088138,3539217,2556000,24.8,871025,1
4,2016,0,59.1,2134061,3609447,2526000,35.0,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.4,221685,361078,240000,2.0,7096,0
251,2004,50,66.3,245789,370785,265000,2.0,7545,1
252,2008,50,63.1,256035,405732,270000,2.6,9953,0
253,2012,50,59.0,250701,425142,268000,0.3,1308,1


In [11]:
xform_pd["VEP"]= xform_pd["VEP"].str.replace(",","").astype(int) 
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.0,1912592,3241682,2411000,15.3,489573,0
1,2004,0,57.4,1890317,3292608,2418000,26.0,863695,1
2,2008,0,61.0,2105622,3454510,2438000,32.2,1088835,0
3,2012,0,59.0,2088138,3539217,2556000,24.8,871025,1
4,2016,0,59.1,2134061,3609447,2526000,35.0,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.4,221685,361078,240000,2.0,7096,0
251,2004,50,66.3,245789,370785,265000,2.0,7545,1
252,2008,50,63.1,256035,405732,270000,2.6,9953,0
253,2012,50,59.0,250701,425142,268000,0.3,1308,1


In [12]:
xform_pd["Total registered"]= xform_pd["Total registered"].str.replace(",","").astype(int) 
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.0,1912592,3241682,2411000,15.3,489573,0
1,2004,0,57.4,1890317,3292608,2418000,26.0,863695,1
2,2008,0,61.0,2105622,3454510,2438000,32.2,1088835,0
3,2012,0,59.0,2088138,3539217,2556000,24.8,871025,1
4,2016,0,59.1,2134061,3609447,2526000,35.0,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.4,221685,361078,240000,2.0,7096,0
251,2004,50,66.3,245789,370785,265000,2.0,7545,1
252,2008,50,63.1,256035,405732,270000,2.6,9953,0
253,2012,50,59.0,250701,425142,268000,0.3,1308,1


In [13]:
xform_pd["Primary:Total Ballots Counted"]= xform_pd["Primary:Total Ballots Counted"].str.replace(",","").astype(int) 
xform_pd

Unnamed: 0,Year,State,General:VEP Turnout Rate,General:Total Ballots Counted,VEP,Total registered,Primary:VEP Turnout Rate,Primary:Total Ballots Counted,Re-election
0,2000,0,59.0,1912592,3241682,2411000,15.3,489573,0
1,2004,0,57.4,1890317,3292608,2418000,26.0,863695,1
2,2008,0,61.0,2105622,3454510,2438000,32.2,1088835,0
3,2012,0,59.0,2088138,3539217,2556000,24.8,871025,1
4,2016,0,59.1,2134061,3609447,2526000,35.0,1260551,0
...,...,...,...,...,...,...,...,...,...
250,2000,50,61.4,221685,361078,240000,2.0,7096,0
251,2004,50,66.3,245789,370785,265000,2.0,7545,1
252,2008,50,63.1,256035,405732,270000,2.6,9953,0
253,2012,50,59.0,250701,425142,268000,0.3,1308,1


In [14]:
# ======================================================
# ======================================================

# prepare transformed data file for Machine Learning
con_xform_pd = xform_pd

In [1]:
# https://github.com/pandas-profiling/pandas-profiling, 
#       used "pip install pandas-profiling[notebook]" -- failed with error
#       used "conda install -c conda-forge pandas-profiling"  -- successful
#profile = ProfileReport(con_xform_pd, title='Pandas Profiling Report', explorative=True)

In [48]:
# profile.to_file("your_report_Hubbel.html")

In [17]:
ml_data = con_xform_pd.drop(columns= ["General:VEP Turnout Rate", "Primary:VEP Turnout Rate"])
ml_data.head(2)

Unnamed: 0,Year,State,General:Total Ballots Counted,VEP,Total registered,Primary:Total Ballots Counted,Re-election
0,2000,0,1912592,3241682,2411000,489573,0
1,2004,0,1890317,3292608,2418000,863695,1


In [18]:
ml_data.to_csv("ml_data_Hubbel.csv", index=False)

In [28]:
# ======================================================
# ======================================================
ml_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   Year                           255 non-null    int64
 1   State                          255 non-null    int32
 2   General:Total Ballots Counted  255 non-null    int32
 3   VEP                            255 non-null    int32
 4   Total registered               255 non-null    int32
 5   Primary:Total Ballots Counted  255 non-null    int32
 6   Re-election                    255 non-null    int64
dtypes: int32(5), int64(2)
memory usage: 9.1 KB


In [29]:

# load and summarize the dataset
from numpy import loadtxt
# load data
#dataset = loadtxt('ml_data_Hubbel.csv', delimiter=",")
#dataset.info()


In [34]:
### BEGIN SOLUTION
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import HuberRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor

# Assign X (data) and y (target)
# split into inputs and outputs
# X, y = dataset[:, :-1], dataset[:, -1]
# summarize dataset
# print(X.shape, y.shape)

#   r = ml_data['Re-election']
X = ml_data.drop(columns=['General:Total Ballots Counted'])
y = (ml_data["General:Total Ballots Counted"]/1000000).values.reshape(-1, 1)
# print(X.shape, y.shape, r.shape)
print(X.shape, y.shape)
### END SOLUTION

(255, 6) (255, 1)


In [35]:
X.head(2)

Unnamed: 0,Year,State,VEP,Total registered,Primary:Total Ballots Counted,Re-election
0,2000,0,3241682,2411000,489573,0
1,2004,0,3292608,2418000,863695,1


In [None]:
# Split the data into training and testing

### BEGIN SOLUTION
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
### END SOLUTION

In [36]:
# prepare the model with input scaling
pipeline = Pipeline(
    steps=[('normalize', MinMaxScaler()), ('model', HuberRegressor())]
    )
pipeline

Pipeline(memory=None,
         steps=[('normalize', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('model',
                 HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True,
                                max_iter=100, tol=1e-05, warm_start=False))],
         verbose=False)

In [39]:
# prepare the model with target scaling
model = TransformedTargetRegressor(
    regressor=pipeline, transformer=MinMaxScaler()
    )
model

TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
                           regressor=Pipeline(memory=None,
                                              steps=[('normalize',
                                                      MinMaxScaler(copy=True,
                                                                   feature_range=(0,
                                                                                  1))),
                                                     ('model',
                                                      HuberRegressor(alpha=0.0001,
                                                                     epsilon=1.35,
                                                                     fit_intercept=True,
                                                                     max_iter=100,
                                                                     tol=1e-05,
                                                                 

In [40]:
# evaluate model
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(
    model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
scores

array([-0.10261851, -0.15835043, -0.16871926, -0.13770621, -0.18926916,
       -0.11502964, -0.17384114, -0.16362395, -0.23922   , -0.13131329])

In [42]:
# normalizing input and output variables for regression.
from numpy import mean
from numpy import absolute
# convert scores to positive
scores = absolute(scores)
scores

array([0.10261851, 0.15835043, 0.16871926, 0.13770621, 0.18926916,
       0.11502964, 0.17384114, 0.16362395, 0.23922   , 0.13131329])

In [43]:
# Summarize the result

# If multioutput is 'raw_values', 
#        then mean absolute error is returned for each output separately. 
# If multioutput is 'uniform_average' or an ndarray of weights, 
#        then the weighted average of all output errors is returned. 
# MAE output is non-negative floating point.

s_mean = mean(scores)
print('Mean MAE: %.3f' % (s_mean))

Mean MAE: 0.158


In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
### BEGIN SOLUTION
X_scaler = StandardScaler().fit(
    X_train[["VEP","Total registered","Primary:Total Ballots Counted"]]
    )

### END SOLUTION
X_scaler 

StandardScaler(copy=True, with_mean=True, with_std=True)

In [24]:
# Transform the training and testing data using the X_scaler and y_scaler models

### BEGIN SOLUTION
X_train_scaled = X_scaler.transform(X_train[["VEP","Total registered","Primary:Total Ballots Counted"]])
X_test_scaled = X_scaler.transform(X_test[["VEP","Total registered","Primary:Total Ballots Counted"]])

### END SOLUTION
X_train_scaled

array([[-5.16074871e-01, -5.32449062e-01, -4.98119942e-01],
       [-5.23315427e-01, -5.42022419e-01, -5.93002082e-01],
       [ 1.57315001e-01,  3.04687818e-01, -3.05908989e-01],
       [ 1.07494758e+00,  1.12622367e+00,  6.44868360e-01],
       [ 1.44318420e-01,  1.31303686e-01,  2.58707319e-02],
       [ 7.81275377e-01,  8.98590517e-01,  1.24426859e+00],
       [-6.48832418e-01, -6.81367948e-01, -5.30566238e-01],
       [ 1.15523177e+00,  1.33506468e+00,  2.00141396e+00],
       [-6.33281163e-01, -6.85977342e-01, -6.36475453e-01],
       [ 1.90378327e-01,  2.69585509e-01,  7.25272815e-01],
       [-1.54131430e-01, -7.50553412e-02, -5.77408820e-01],
       [-7.39275386e-02,  2.49330535e-02, -4.43237566e-02],
       [ 7.55488931e-01,  9.64540309e-01,  2.07181751e-01],
       [-6.50248124e-01, -7.02996643e-01, -3.61850612e-01],
       [-2.44026596e-01, -1.88162781e-01, -5.14510930e-01],
       [-4.22961691e-02, -2.36874219e-03, -6.06785925e-01],
       [-8.77867757e-01, -8.83117581e-01

In [25]:
X_train[["VEP","Total registered","Primary:Total Ballots Counted"]] = X_train_scaled
X_test[["VEP","Total registered","Primary:Total Ballots Counted"]] = X_test_scaled

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [30]:
X_test

Unnamed: 0,Year,State,VEP,Total registered,Primary:Total Ballots Counted,Re-election
190,2000,38,1.179477,1.045027,1.190641,0
6,2004,1,-0.884044,-0.909710,-0.702975,1
79,2016,15,-0.445182,-0.440616,-0.438909,0
205,2000,41,-0.861013,-0.894818,-0.627641,0
117,2008,23,-0.098280,0.011105,-0.501564,0
...,...,...,...,...,...,...
60,2000,12,-0.782411,-0.826387,-0.620767,0
101,2004,20,-0.084629,-0.079310,-0.221108,1
172,2008,34,-0.871465,-0.886663,-0.694039,0
248,2012,49,0.013855,0.148323,0.127111,1


In [None]:
### BEGIN SOLUTION
from sklearn.metrics import mean_squared_error

predictions = model.predict(X_test)

MSE = mean_squared_error(y_test, predictions)
r2 = model.score(X_test, y_test)
### END SOLUTION

print(f"MSE: {MSE}, R2: {r2}")

NameError: name 'model' is not defined