In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn import tree

In [3]:
# Loading data
file_path = Path("./clean_survey_data.csv")
df_cps = pd.read_csv(file_path)
df_cps.head()

Unnamed: 0.1,Unnamed: 0,cps19_ResponseId,cps19_citizenship,cps19_yob,cps19_gender,cps19_province,cps19_education,cps19_demsat,cps19_interest_gen_1,cps19_interest_elxn_1,...,cps19_language_vietnamese,cps19_language_no_answer,cps19_language_aborginal,cps_19_language_other,cps19_employment,cps19_union,cps19_children,cps19_income_number,cps19_marital,cps19_household
0,4,R_27WeMQ1asip2cMD,Canadian citizen,2000,A woman,Ontario,Completed secondary/ high school,Fairly satisfied,8.0,6.0,...,No,No,No,No,Working for pay part-time,No,No,56000.0,Never Married,6.0
1,13,R_3j7fAVYfVCewi3H,Canadian citizen,2000,A woman,Ontario,Some university,Fairly satisfied,10.0,10.0,...,No,No,No,No,Student and working for pay,No,No,30000.0,Never Married,5.0
2,20,R_brdMqsPTvQ5t1tL,Canadian citizen,2000,A woman,Ontario,Completed secondary/ high school,Fairly satisfied,8.0,8.0,...,No,No,No,No,Working for pay part-time,No,No,13000.0,Never Married,2.0
3,21,R_Wumhl7QEMURFqZH,Canadian citizen,2000,A man,Ontario,Completed secondary/ high school,Fairly satisfied,9.0,8.0,...,No,No,No,No,Student and working for pay,No,No,55000.0,Never Married,3.0
4,26,R_3EH051N9vLmOOHM,Canadian citizen,2000,A woman,Ontario,Completed secondary/ high school,Fairly satisfied,10.0,10.0,...,No,No,No,No,Student,No,No,190000.0,Never Married,4.0


In [4]:
target = ["cps19_votechoice"]

In [5]:
X = df_cps.copy()
X = X.drop("cps19_votechoice", axis=1)
X.head()
# Define the target set.
y = df_cps["cps19_votechoice"].values
y[:10]

array(["Don't know/ Prefer not to answer", 'Liberal Party', 'ndp',
       'Conservative Party', 'Liberal Party', 'Conservative Party',
       'Liberal Party', 'Green Party', 'ndp', 'Conservative Party'],
      dtype=object)

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
encoded_df = X.copy()
for colName in encoded_df:
    encoded_df[colName] = le.fit_transform(encoded_df[colName])
    
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()
cps_encoded_scaled = data_scaler.fit_transform(encoded_df)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(cps_encoded_scaled, y, random_state=78, train_size=0.80)
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(10582, 86)
(2646, 86)
(10582,)
(2646,)


In [8]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train, y_train)

In [9]:
predictions = model.predict(X_test)

In [10]:
correct = 0
incorrect = 0
for prediction, actual in zip(predictions, y_test):
    if prediction == actual:
        correct += 1
    else:
        incorrect += 1
        
print("correct count:", correct)
print("incorrect count:", incorrect)
print("success rate:", correct / (correct + incorrect))

correct count: 1241
incorrect count: 1405
success rate: 0.4690098261526833


In [11]:
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=["Liberal Party", "Conservative Party", "ndp", "Don't know/ Prefer not to answer", "Green Party", "Bloc Qu<e9>b<e9>cois", "People's Party", "Another Party (please specify)"], columns=["Liberal Party", "Conservative Party", "ndp", "Don't know/ Prefer not to answer", "Green Party", "Bloc Qu<e9>b<e9>cois", "People's Party", "Another party (please specify)"])
cm_df

Unnamed: 0,Liberal Party,Conservative Party,ndp,Don't know/ Prefer not to answer,Green Party,Bloc Qu<e9>b<e9>cois,People's Party,Another party (please specify)
Liberal Party,1,1,5,0,2,2,0,2
Conservative Party,1,36,17,12,9,17,4,14
ndp,5,24,498,64,40,68,28,57
Don't know/ Prefer not to answer,3,22,55,40,23,58,5,28
Green Party,2,12,36,23,37,44,3,57
Bloc Qu<e9>b<e9>cois,2,40,49,80,55,525,8,111
People's Party,0,2,29,4,3,4,7,6
Another Party (please specify),6,14,58,36,45,108,2,97
