In [1]:
# Importing all necessary dependencies
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
import psycopg2
from getpass import getpass

In [2]:
# more imports
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px

In [3]:
# Fallback data source: read the survey data from the local CSV file when you
# are not connected to the database.
df = pd.read_csv(filepath_or_buffer='clean_survey_data_v2.csv')

In [4]:
# Ask for the Postgres database password interactively instead of hard-coding it
password2 = getpass(prompt='database password')

database password········


In [5]:
# Getting the data from the database (Postgres): loads the whole survey_data table into `df`.
# Make sure you use your local database name, user and host (should be localhost by default).
import pandas.io.sql as psql
connection = psycopg2.connect(
            host="localhost",
            database="Canadian_Election_Study",
            user="postgres",
            password=password2)
try:
    df = psql.read_sql('SELECT * FROM survey_data', connection)
finally:
    # The connection is only needed for this one query; close it explicitly so it is
    # released even when the query fails.
    connection.close()

In [6]:
# Filtering out the 'Don't Know' or non specific answers, then renaming the party label
# 'Bloc Qu<e9>b<e9>cois' (the key matched by replace) to 'Bloc Quebecois'.
non_specific_answers = ["Don't know/ Prefer not to answer", "Another party (please specify)"]
# .copy() makes the filtered result an independent frame, so the column assignment below
# is unambiguous and does not go through a view of the original data.
df = df[~df['cps19_votechoice'].isin(non_specific_answers)].copy()
# Assign the relabelled column back explicitly: Series.replace(..., inplace=True) on
# df['cps19_votechoice'] is chained assignment, which is not guaranteed to update df.
df['cps19_votechoice'] = df['cps19_votechoice'].replace({'Bloc Qu<e9>b<e9>cois':'Bloc Quebecois'})
df.head(5)

Unnamed: 0,cps19_responseid,cps19_citizenship,cps19_yob,cps19_gender,cps19_province,cps19_education,cps19_demsat,cps19_interest_gen_1,cps19_interest_elxn_1,cps19_v_likely,...,cps19_language_vietnamese,cps19_language_no_answer,cps19_language_aborginal,cps_19_language_other,cps19_employment,cps19_union,cps19_children,cps19_income_number,cps19_marital,cps19_household
1,R_3j7fAVYfVCewi3H,Canadian citizen,2000,A woman,Ontario,Some university,Fairly satisfied,10.0,10.0,Certain to vote,...,No,No,No,No,Student and working for pay,No,No,30000.0,Never Married,5
2,R_brdMqsPTvQ5t1tL,Canadian citizen,2000,A woman,Ontario,Completed secondary/ high school,Fairly satisfied,8.0,8.0,Certain to vote,...,No,No,No,No,Working for pay part-time,No,No,13000.0,Never Married,2
3,R_Wumhl7QEMURFqZH,Canadian citizen,2000,A man,Ontario,Completed secondary/ high school,Fairly satisfied,9.0,8.0,Certain to vote,...,No,No,No,No,Student and working for pay,No,No,55000.0,Never Married,3
4,R_3EH051N9vLmOOHM,Canadian citizen,2000,A woman,Ontario,Completed secondary/ high school,Fairly satisfied,10.0,10.0,Certain to vote,...,No,No,No,No,Student,No,No,190000.0,Never Married,4
5,R_1FynCZH7i2j3zEY,Canadian citizen,1999,A man,Ontario,Completed secondary/ high school,Fairly satisfied,10.0,10.0,Certain to vote,...,No,No,No,Hebrew,Working for pay part-time,No,No,10000.0,Never Married,2


In [50]:
# Dropping unnecessary columns.
# Every drop is applied in a single non-mutating step, so re-running this cell always
# rebuilds clean_df from df instead of raising KeyError on columns that a previous
# in-place drop already removed.
columns_to_drop = [
    # respondent id and free-text / no-answer columns
    'cps19_responseid', 'cps19_ethnicity_no_answer', 'cps19_ethnicity_other_1',
    'cps19_ethnicity_other_2', 'cps19_language_no_answer', 'cps_19_language_other',
    # language columns
    'cps19_language_arabic', 'cps19_language_chinese',
    'cps19_language_filipino', 'cps19_language_german',
    'cps19_language_indian', 'cps19_language_italian',
    'cps19_language_korean', 'cps19_language_pakistani',
    'cps19_language_persian', 'cps19_language_russian',
    'cps19_language_spanish', 'cps19_language_tamil',
    'cps19_language_vietnamese', 'cps19_language_aborginal',
    # citizenship
    'cps19_citizenship',
    # ethnicity columns
    'cps19_ethnicity_first_nat',
    'cps19_ethnicity_british', 'cps19_ethnicity_chinese',
    'cps19_ethnicity_dutch', 'cps19_ethnicity_english',
    'cps19_ethnicity_french', 'cps19_ethnicity_french_can',
    'cps19_ethnicity_german', 'cps19_ethnicity_hispanic',
    'cps19_ethnicity_indian', 'cps19_ethnicity_inuk_inuit',
    'cps19_ethnicity_irish', 'cps19_ethnicity_italian',
    'cps19_ethnicity_metis', 'cps19_ethnicity_polish',
    'cps19_ethnicity_quebecois', 'cps19_ethnicity_scottish',
    'cps19_ethnicity_ukranian',
    # remaining columns that were dropped one at a time
    'cps19_fed_gov_sat',
    'cps19_lib_promises',
]
clean_df = df.drop(columns=columns_to_drop)

In [56]:
# Number of respondents per party in the cleaned data
clean_df.loc[:, 'cps19_votechoice'].value_counts()

Liberal Party         4201
Conservative Party    3767
ndp                   1851
Green Party           1075
Bloc Quebecois         630
People's Party         288
Name: cps19_votechoice, dtype: int64

In [26]:
# Creating a Right/Left Dataframe splitting the parties into Right and Left instead of each specific party.
# The frame is built as an independent copy: a plain `right_left_df = clean_df` only aliases
# clean_df, so relabelling in place would also overwrite the party names in clean_df that
# the later Liberal-vs-ndp analysis filters on.
# Bloc Quebecois is removed as it's a one issue party and negatively affects the rest of the data.
right_left_df = clean_df[clean_df.cps19_votechoice != 'Bloc Quebecois'].copy()

# Collapse the remaining parties into one Left group and one Right group
party_to_bloc = {
    'Liberal Party': 'Lib/ndp/green',
    'ndp': 'Lib/ndp/green',
    'Green Party': 'Lib/ndp/green',
    'Conservative Party': 'Con/PP',
    "People's Party": 'Con/PP',
}
right_left_df['cps19_votechoice'] = right_left_df['cps19_votechoice'].replace(party_to_bloc)

In [30]:
# Number of respondents in each of the two (left / right) groups
right_left_df.loc[:, 'cps19_votechoice'].value_counts()

Lib/ndp/green    7127
Con/PP           4055
Name: cps19_votechoice, dtype: int64

In [31]:
# Feature matrix: every column except the vote choice (the target)
X = right_left_df.drop(columns=['cps19_votechoice'])

In [32]:
# One-hot encode every text (object dtype) feature with get_dummies()
dummies = [column for column, dtype in X.dtypes.items() if dtype == 'object']
X = pd.get_dummies(data=X, columns=dummies)
X.shape

(11182, 189)

In [34]:
# Target vector: the left/right vote choice of each respondent
y = right_left_df.cps19_votechoice
y.shape

(11182,)

In [35]:
# Split features and target into training and test sets with a fixed seed;
# test_size=0.25 spells out the library default that applies when no size is given.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=28,
)
Counter(y_train)

Counter({'Lib/ndp/green': 5353, 'Con/PP': 3033})

In [36]:
# Class counts in the test split
Counter(y_test)

Counter({'Con/PP': 1022, 'Lib/ndp/green': 1774})

In [38]:
# Fit a BalancedRandomForestClassifier (100 trees, fixed seed) on the training data
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(random_state=1, n_estimators=100)
brf.fit(X=X_train, y=y_train)

BalancedRandomForestClassifier(random_state=1)

In [39]:
# Predict the test set and report the balanced accuracy score
y_pred = brf.predict(X=X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.8456758527722683

In [40]:
# Rank the encoded features by the importance the fitted forest assigns to them
feat_importances = pd.Series(
    data=brf.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
# Lift the row limit so the full ranking is printed
pd.set_option("display.max_rows", None)
print(feat_importances)

cps19_lr_scale                                                                           0.079299
cps19_snclav_Not at all well                                                             0.065775
cps19_econ_fed_Worse                                                                     0.046015
cps19_groups_therm_4                                                                     0.034782
cps19_prov_gov_sat_Not at all satisfied                                                  0.032152
cps19_refugees_Fewer refugees                                                            0.030293
cps19_spend_env_Spend more                                                               0.029374
cps19_ownfinanc_fed_Worse                                                                0.024593
cps19_groups_therm_1                                                                     0.024154
cps19_econ_retro_Got worse                                                               0.024108
cps19_spend_imm_min_

In [57]:
# Creating the lib/ndp only dataframe.
# This binds a second name to clean_df (an alias, not a copy); the filtering in the
# next cell rebinds lib_ndp_df to new frames.
lib_ndp_df = clean_df

In [58]:
# Keep only the rows whose vote choice is not one of the other parties
other_parties = ['Bloc Quebecois', 'Conservative Party', 'Green Party', "People's Party"]
lib_ndp_df = lib_ndp_df[~lib_ndp_df['cps19_votechoice'].isin(other_parties)]

In [59]:
# Number of respondents per remaining party
lib_ndp_df.loc[:, 'cps19_votechoice'].value_counts()

Liberal Party    4201
ndp              1851
Name: cps19_votechoice, dtype: int64

In [60]:
# Feature matrix: every column except the vote choice (the target)
X = lib_ndp_df.drop(columns=['cps19_votechoice'])
X.shape

(6052, 44)

In [61]:
# Target vector: the party chosen by each respondent
y = lib_ndp_df.cps19_votechoice

In [62]:
# One-hot encode every text (object dtype) feature with get_dummies()
dummies = [column for column, dtype in X.dtypes.items() if dtype == 'object']
X = pd.get_dummies(data=X, columns=dummies)
X.shape

(6052, 189)

In [63]:
# Split features and target into training and test sets with a fixed seed;
# test_size=0.25 spells out the library default that applies when no size is given.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=28,
)
Counter(y_train)

Counter({'Liberal Party': 3144, 'ndp': 1395})

In [65]:
# Fit a BalancedRandomForestClassifier (100 trees, fixed seed) on the training data
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(random_state=1, n_estimators=100)
brf.fit(X=X_train, y=y_train)

BalancedRandomForestClassifier(random_state=1)

In [66]:
# Predict the test set and report the balanced accuracy score
y_pred = brf.predict(X=X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.7312839217248419

In [67]:
# Rank the encoded features by the importance the fitted forest assigns to them
feat_importances = pd.Series(
    data=brf.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
# Lift the row limit so the full ranking is printed
pd.set_option("display.max_rows", None)
print(feat_importances)

cps19_groups_therm_5                                                                     0.042405
cps19_yob                                                                                0.042137
cps19_income_number                                                                      0.035957
cps19_snclav_Not at all well                                                             0.031425
cps19_groups_therm_4                                                                     0.031139
cps19_econ_fed_Better                                                                    0.028295
cps19_lr_scale                                                                           0.027246
cps19_groups_therm_3                                                                     0.026826
cps19_groups_therm_2                                                                     0.025710
cps19_groups_therm_1                                                                     0.024750
cps19_interest_gen_1

## Clustering Analysis

In [27]:
# Rescale every encoded feature with MinMaxScaler before running PCA.
# `dummies` was last bound to a list of column names by the get_dummies() cells above,
# which MinMaxScaler cannot fit, so the encoded feature matrix is rebuilt explicitly here.
# NOTE(review): the cell that originally produced this matrix is not in the notebook;
# this assumes it was the one-hot encoding of clean_df -- confirm (in particular whether
# cps19_votechoice should be part of the clustering features).
dummies = pd.get_dummies(clean_df)
data_scaled = MinMaxScaler().fit_transform(dummies)
print(data_scaled)

[[0.98765432 1.         1.         ... 1.         0.         0.        ]
 [0.98765432 0.8        0.8        ... 0.         0.         1.        ]
 [0.98765432 0.9        0.8        ... 0.         0.         0.        ]
 ...
 [0.75308642 0.7        0.6        ... 0.         0.         0.        ]
 [0.45679012 0.8        0.9        ... 0.         0.         0.        ]
 [0.87654321 0.8        0.8        ... 0.         0.         0.        ]]


In [28]:
# Fit a PCA that keeps enough components to explain 95% of the variance of the
# scaled data, then project the scaled data onto those components.
pca = PCA(n_components=0.95).fit(data_scaled)
reduced = pca.transform(data_scaled)

In [29]:
# Inspect the PCA-transformed data
reduced

array([[-1.64079541, -0.5502324 ,  1.59910587, ...,  0.06997976,
        -0.02430385, -0.13628978],
       [ 0.06132046, -0.71788866,  0.96958823, ..., -0.13973024,
         0.05845579,  0.15821406],
       [ 0.8711823 , -0.3102759 ,  0.69872579, ..., -0.15093701,
         0.06698659, -0.12218421],
       ...,
       [-0.28782484,  1.96354745, -0.20501786, ..., -0.26306415,
         0.18242836, -0.15280002],
       [-1.06311638,  1.70154262, -1.19000938, ..., -0.53468526,
         0.20458856, -0.63514211],
       [ 2.16345293,  0.18820945,  1.03230782, ..., -0.28867613,
         0.1443293 ,  0.17755354]])

In [40]:
# Reduce the encoded survey features to three principal components.
# random_state is pinned because scikit-learn may select the randomized SVD solver for
# an input of this size, which would otherwise give slightly different components per run.
# NOTE(review): this runs on the unscaled `dummies` matrix (unlike the PCA fitted on
# data_scaled above) and rebinds `pca`; `survey_pca` is not used by the later cells
# visible here -- confirm this cell is still needed.
pca = PCA(n_components=3, random_state=0)
survey_pca = pca.fit_transform(dummies)
survey_pca

array([[-6.13794164e+04, -8.28547273e+00, -7.97067185e+01],
       [-7.83794166e+04, -1.14447210e+01, -4.69936741e+01],
       [-3.63794172e+04, -1.02184238e+01,  6.66269873e+01],
       ...,
       [ 8.32058337e+03, -8.75162242e+00, -5.06205463e+01],
       [ 5.86205830e+04, -1.04068567e+01,  6.79360952e+00],
       [-2.93794174e+04, -1.21605096e+01,  7.75392985e+01]])

In [30]:
# Wrap the principal components in a DataFrame that shares df's index, so each row
# can be matched to its survey respondent.
pcs_df = pd.DataFrame(reduced, index=df.index)
pcs_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,83,84,85,86,87,88,89,90,91,92
1,-1.640795,-0.550232,1.599106,0.678852,-0.385271,-0.243675,-0.789281,-0.282779,-1.353168,0.224241,...,0.00767,-0.017931,-0.14994,-0.042816,0.123578,-7.9e-05,0.158081,0.06998,-0.024304,-0.13629
2,0.06132,-0.717889,0.969588,-1.247994,-1.671025,0.275632,0.644154,0.447882,-0.515109,0.148403,...,-0.149313,-0.069356,-0.018642,-0.039658,-0.013591,-0.188013,0.003429,-0.13973,0.058456,0.158214
3,0.871182,-0.310276,0.698726,-0.920029,0.706398,0.423011,-2.17935,1.342329,0.307033,-0.571784,...,-0.017529,-0.039094,-0.089139,-0.060951,-0.034886,0.029153,-0.06199,-0.150937,0.066987,-0.122184
4,-0.807417,-0.387583,1.805666,0.219846,-0.860312,-0.448553,-1.068617,-0.339758,-0.63308,0.783322,...,-0.079156,-0.084417,-0.150157,-0.134444,0.421827,-0.038113,-0.181143,0.072567,0.198737,0.343894
5,2.458146,0.402594,0.672011,0.190909,-0.095224,1.223992,-0.688247,0.79484,-0.208003,-0.699209,...,-0.334489,0.040719,-0.064281,-0.014206,0.035489,-0.16161,-0.1523,0.04852,-0.075948,-0.210286


In [31]:
# Elbow curve: fit KMeans for k = 1..10 on the principal components and plot the
# inertia of each fit against k.
k = list(range(1, 11))
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pcs_df).inertia_
    for n_clusters in k
]
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [44]:
# Cluster the principal components into four groups and label every respondent
model = KMeans(n_clusters=4, random_state=0).fit(pcs_df)
predictions = model.predict(pcs_df)

In [45]:
# Attach each respondent's cluster label as a new "class" column.
# assign() returns a new frame instead of writing into `df`, which was produced by
# boolean filtering; in-place column assignment on such a frame triggers
# SettingWithCopyWarning. The dict form is needed because `class` is a Python keyword.
df = df.assign(**{"class": predictions})
df.head()

Unnamed: 0,cps19_responseid,cps19_citizenship,cps19_yob,cps19_gender,cps19_province,cps19_education,cps19_demsat,cps19_interest_gen_1,cps19_interest_elxn_1,cps19_v_likely,...,cps19_language_no_answer,cps19_language_aborginal,cps_19_language_other,cps19_employment,cps19_union,cps19_children,cps19_income_number,cps19_marital,cps19_household,class
1,R_3j7fAVYfVCewi3H,Canadian citizen,2000,A woman,Ontario,Some university,Fairly satisfied,10.0,10.0,Certain to vote,...,No,No,No,Student and working for pay,No,No,30000.0,Never Married,5,0
2,R_brdMqsPTvQ5t1tL,Canadian citizen,2000,A woman,Ontario,Completed secondary/ high school,Fairly satisfied,8.0,8.0,Certain to vote,...,No,No,No,Working for pay part-time,No,No,13000.0,Never Married,2,0
3,R_Wumhl7QEMURFqZH,Canadian citizen,2000,A man,Ontario,Completed secondary/ high school,Fairly satisfied,9.0,8.0,Certain to vote,...,No,No,No,Student and working for pay,No,No,55000.0,Never Married,3,0
4,R_3EH051N9vLmOOHM,Canadian citizen,2000,A woman,Ontario,Completed secondary/ high school,Fairly satisfied,10.0,10.0,Certain to vote,...,No,No,No,Student,No,No,190000.0,Never Married,4,3
5,R_1FynCZH7i2j3zEY,Canadian citizen,1999,A man,Ontario,Completed secondary/ high school,Fairly satisfied,10.0,10.0,Certain to vote,...,No,No,Hebrew,Working for pay part-time,No,No,10000.0,Never Married,2,0


In [39]:
# Save the survey data together with the cluster labels
df.to_csv(path_or_buf="clusters.csv")

In [46]:
# Number of respondents assigned to each cluster
df.loc[:, "class"].value_counts()

0    8944
3    2813
2      50
1       5
Name: class, dtype: int64