In [None]:
# Import our dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Fetch the cleaned 2015 and 2016 datasets from the project's GitHub repository
url_2015 = "https://raw.githubusercontent.com/hajaf/final-Project/Clarissa/cleaned_2015.csv"
url_2016 = "https://raw.githubusercontent.com/hajaf/final-Project/Clarissa/cleaned_2016.csv"
clean2015, clean2016 = (pd.read_csv(url) for url in (url_2015, url_2016))

In [None]:
# Preview the first five rows of the 2015 data as loaded
clean2015.head(n=5)

Unnamed: 0,Country,Region,Ranking,Score,Economy,Family,Health,Freedom,Trust,Generosity,Dystopia_Residual
0,Switzerland,Western Europe,1,7.587,1.397,1.35,0.941,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,1.302,1.402,0.948,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,1.325,1.361,0.875,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,1.459,1.331,0.885,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,1.326,1.323,0.906,0.63297,0.32957,0.45811,2.45176


In [None]:
# Preview the first five rows of the 2016 data as loaded
clean2016.head(n=5)

Unnamed: 0,Country,Region,Ranking,Score,Economy,Family,Health,Freedom,Trust,Generosity,Dystopia_Residual
0,Denmark,Western Europe,1,7.526,1.442,1.164,0.795,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,1.527,1.145,0.863,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,1.427,1.183,0.867,0.56624,0.14975,0.47678,2.83137
3,Norway,Western Europe,4,7.498,1.577,1.127,0.796,0.59609,0.35776,0.37895,2.66465
4,Finland,Western Europe,5,7.413,1.406,1.135,0.811,0.57104,0.41004,0.25492,2.82596


## Drop unnecessary columns

In [None]:
# Columns left out of the feature set for both years.
columns_to_drop = ["Country", "Region", "Ranking", "Dystopia_Residual"]

# Rebind instead of mutating in place, and tolerate labels that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
# Trade-off: errors="ignore" also means a misspelled label is skipped silently.
clean2015 = clean2015.drop(columns=columns_to_drop, errors="ignore")
clean2016 = clean2016.drop(columns=columns_to_drop, errors="ignore")

We decided to keep only the columns that realistically reflect what a person might use to measure the happiness of a country.

In [None]:
# Confirm the unwanted columns are gone from the 2015 frame
clean2015.head(n=5)

Unnamed: 0,Score,Economy,Family,Health,Freedom,Trust,Generosity
0,7.587,1.397,1.35,0.941,0.66557,0.41978,0.29678
1,7.561,1.302,1.402,0.948,0.62877,0.14145,0.4363
2,7.527,1.325,1.361,0.875,0.64938,0.48357,0.34139
3,7.522,1.459,1.331,0.885,0.66973,0.36503,0.34699
4,7.427,1.326,1.323,0.906,0.63297,0.32957,0.45811


In [None]:
# Confirm the unwanted columns are gone from the 2016 frame
clean2016.head(n=5)

Unnamed: 0,Score,Economy,Family,Health,Freedom,Trust,Generosity
0,7.526,1.442,1.164,0.795,0.57941,0.44453,0.36171
1,7.509,1.527,1.145,0.863,0.58557,0.41203,0.28083
2,7.501,1.427,1.183,0.867,0.56624,0.14975,0.47678
3,7.498,1.577,1.127,0.796,0.59609,0.35776,0.37895
4,7.413,1.406,1.135,0.811,0.57104,0.41004,0.25492


We attempted to encode the 'Region' column; however, it did not improve the accuracy of the linear regression models, so the encoding step is no longer included in the preprocessing.

## Slice the Data

In [None]:
# Split each year's frame into a target vector (Score) and a feature matrix
# (every remaining column), both as NumPy arrays
y_clean2015 = clean2015["Score"].to_numpy()
X_clean2015 = clean2015.drop("Score", axis=1).to_numpy()

y_clean2016 = clean2016["Score"].to_numpy()
X_clean2016 = clean2016.drop("Score", axis=1).to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of each year's rows for testing; the fixed seed keeps the split repeatable
split_options = {"test_size": 0.25, "random_state": 1}

# 2015
X_2015_train, X_2015_test, y_2015_train, y_2015_test = train_test_split(
    X_clean2015, y_clean2015, **split_options
)
# 2016
X_2016_train, X_2016_test, y_2016_train, y_2016_test = train_test_split(
    X_clean2016, y_clean2016, **split_options
)

## Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the 2015 training split
reg = LinearRegression()
reg.fit(X=X_2015_train, y=y_2015_train)

LinearRegression()

In [None]:
# Score the model on the held-out 2015 data
# (for scikit-learn regressors, score() is the R^2 coefficient of determination)
r2_linear_2015 = reg.score(X_2015_test, y_2015_test)
print("print accuracy:", r2_linear_2015)

print accuracy: 0.7490915998735155


In [None]:
# Re-train the same estimator object on the 2016 training split;
# from here on `reg` holds the 2016 fit
reg.fit(X=X_2016_train, y=y_2016_train)

LinearRegression()

In [None]:
# Score the model on the held-out 2016 data
# (for scikit-learn regressors, score() is the R^2 coefficient of determination)
r2_linear_2016 = reg.score(X_2016_test, y_2016_test)
print("print accuracy:", r2_linear_2016)

print accuracy: 0.7323748081718856


## Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build a 100-tree forest with a fixed seed and train it on the 2015 split
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_2015_train, y_2015_train)

# Report the score on the held-out 2015 data
r2_forest_2015 = rf.score(X_2015_test, y_2015_test)
print("accuracy :", r2_forest_2015)

accuracy : 0.7319475144104186


In [None]:
# 2016 data
# Re-train the same forest object on the 2016 split; from here on `rf` holds the 2016 fit
rf.fit(X_2016_train, y_2016_train)

# Report the score on the held-out 2016 data
r2_forest_2016 = rf.score(X_2016_test, y_2016_test)
print("accuracy :", r2_forest_2016)

accuracy : 0.690199358009666


In [109]:
# Pull the per-feature importances from the fitted random forest.
# NOTE(review): `rf` is re-fitted on the 2016 split in the preceding cell, so when
# the notebook is run top to bottom these values describe the 2016 model —
# confirm that is the intended year.
importances = rf.feature_importances_
# Bare last expression so the array is shown as the cell output
importances

array([0.52598541, 0.11427736, 0.18714415, 0.1171723 , 0.03476758,
       0.0206532 ])

In [114]:
# Rank the features by importance, highest first.
# The names come from the same frame and column order used to build
# X_clean2016 rather than a hand-typed list, so the labels cannot drift out of
# step with the columns the forest was trained on.
# NOTE(review): assumes `rf` currently holds the 2016 fit — confirm.
columns = clean2016.drop(columns=["Score"]).columns.tolist()
sorted(zip(rf.feature_importances_, columns), reverse=True)

[(0.525985412316079, 'Economy'),
 (0.18714415075519017, 'Health'),
 (0.11717229767017519, 'Freedom'),
 (0.1142773628204708, 'Family'),
 (0.03476757859499093, 'Trust'),
 (0.02065319784309403, 'Generosity')]

## SGD Regressor

In [None]:
from sklearn.linear_model import SGDRegressor

# NOTE: despite the variable name, SGDRegressor is a linear model trained with
# stochastic gradient descent (squared-error loss unless `loss` is changed),
# not a support vector machine. The name `svm` is kept so other cells that
# reference it keep working.
# SGD shuffles the training data randomly, so the seed is pinned here; without
# it the score changes on every run, unlike the seeded train/test split and
# random forest earlier in the notebook.
svm = SGDRegressor(random_state=42)
svm.fit(X_2015_train, y_2015_train)

# Evaluate on the held-out 2015 data
print("accuracy:", svm.score(X_2015_test, y_2015_test))

accuracy: 0.7578245616774909


In [None]:
# 2016 data: re-train the same SGD estimator on the 2016 training split
svm.fit(X=X_2016_train, y=y_2016_train)

# Evaluate on the held-out 2016 data
score_sgd_2016 = svm.score(X_2016_test, y_2016_test)
print("accuracy:", score_sgd_2016)

accuracy: 0.6912387793725638
