# 1 The objective of this notebook is to train and test a baseline model - Logistic Regression

Reference metric: 73%

In [70]:
#---------Importing libraries---------#

#---Data analysis---#
import pandas as pd
import numpy as np


#---Data splitting---#
from sklearn.model_selection import train_test_split
#cross validation
from sklearn.model_selection import cross_val_score

#---classification models---#
from sklearn.linear_model import LogisticRegression

#---evaluation---#
from sklearn.metrics import accuracy_score, classification_report

#---utils---#
import os


## 1.1 Raw data

In [71]:
# Read the raw survey file from the local data/ folder.
df = pd.read_csv('data/ACME-HappinessSurvey2020.csv')

# Column 'Y' is the target; every other column is used as a feature.
y = df['Y']
X = df.drop('Y', axis=1)

# Hold out 20% of the rows for testing (80% train); random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [72]:
# Fit a logistic regression with library defaults (only the seed is set) on the
# training split; fit() hands back the estimator, so it can be chained.
LR = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = LR.predict(X_test)

# Hold-out accuracy, printed as a percentage with two decimals.
print(
    'Base line Accuracy score: {:.2%} '.format(
        accuracy_score(y_test, y_pred)
    )
)

# Per-class precision / recall / f1 on the same hold-out predictions.
print(classification_report(y_test, y_pred))

Base line Accuracy score: 46.15% 
              precision    recall  f1-score   support

           0       0.56      0.33      0.42        15
           1       0.41      0.64      0.50        11

    accuracy                           0.46        26
   macro avg       0.48      0.48      0.46        26
weighted avg       0.49      0.46      0.45        26



Okay, our baseline is 46% accuracy. Let's see if we can do better.

## 1.2 Data version 1

In [73]:
# Read data version 1 from the local data/ folder.
df_v1 = pd.read_csv('data/base_v1.csv')

# Column 'Y' is the target; every other column is used as a feature.
y = df_v1['Y']
X = df_v1.drop('Y', axis=1)

# Hold out 20% of the rows for testing (80% train); random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display (rows, columns) of the loaded frame as the cell output.
df_v1.shape

(109, 7)

In [74]:
# Fit a logistic regression with library defaults (only the seed is set) on the
# training split; fit() hands back the estimator, so it can be chained.
LR = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = LR.predict(X_test)

# Hold-out accuracy, printed as a percentage with two decimals.
print(
    'Base line Accuracy score: {:.2%} '.format(
        accuracy_score(y_test, y_pred)
    )
)

# Per-class precision / recall / f1 on the same hold-out predictions.
print(classification_report(y_test, y_pred))

Base line Accuracy score: 72.73% 
              precision    recall  f1-score   support

           0       0.83      0.50      0.62        10
           1       0.69      0.92      0.79        12

    accuracy                           0.73        22
   macro avg       0.76      0.71      0.71        22
weighted avg       0.75      0.73      0.71        22



72.7%! That is roughly 27 percentage points above the 46.2% baseline obtained on the raw data. Let's see if we can do better with other models.

In [75]:
# Feature importances, read from the coefficients of the fitted model.

# First row of the fitted coefficient array.
importance = LR.coef_[0]
# Print each feature name next to its coefficient (two decimals).
# NOTE(review): assumes X still holds the columns LR was fitted on - confirm
# the cells were run in order.
for feature_name, coefficient in zip(X.columns, importance):
    print('Feature: %s, Score: %.2f' % (feature_name, coefficient))


Feature: X1, Score: 0.43
Feature: X2, Score: -0.12
Feature: X3, Score: 0.23
Feature: X4, Score: -0.09
Feature: X5, Score: 0.07
Feature: X6, Score: 0.35


As expected from the correlation map.

### 1.2.1 Training with cross-validation

In [87]:
# Training with cross-validation, k=5, on data version 1.
#
# The inputs are taken from df_v1 directly instead of the notebook-wide X / y:
# later cells rebind X and y to other data versions, so running this cell out
# of order would otherwise score a different dataset than this section is about.
X_cv = df_v1.drop(columns=['Y'])
y_cv = df_v1['Y']

LR = LogisticRegression(random_state=42)
# One score per fold (5 folds); the estimator passed in is left unfitted.
scores = cross_val_score(LR, X_cv, y_cv, cv=5)
print('Cross-Validation Accuracy Scores', scores)

# Summarise the folds so the result can be compared with the single hold-out
# accuracy reported for this data version.
print('Cross-Validation Accuracy: {:.2%} (+/- {:.2%})'.format(scores.mean(), scores.std()))


Cross-Validation Accuracy Scores [0.54545455 0.63636364 0.72727273 0.63636364 0.33333333]


## 1.3 Data version 2

In [77]:
# Read data version 2 from the local data/ folder.
df_v2 = pd.read_csv('data/base_v2.csv')

# Column 'Y' is the target; every other column is used as a feature.
y = df_v2['Y']
X = df_v2.drop('Y', axis=1)

# Hold out 20% of the rows for testing (80% train); random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display (rows, columns) of the loaded frame as the cell output.
df_v2.shape

(109, 5)

In [78]:
# Fit a logistic regression with library defaults (only the seed is set) on the
# training split; fit() hands back the estimator, so it can be chained.
LR = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = LR.predict(X_test)

# Hold-out accuracy, printed as a percentage with two decimals.
print(
    'Base line Accuracy score: {:.2%} '.format(
        accuracy_score(y_test, y_pred)
    )
)

# Per-class precision / recall / f1 on the same hold-out predictions.
print(classification_report(y_test, y_pred))

Base line Accuracy score: 63.64% 
              precision    recall  f1-score   support

           0       0.60      0.60      0.60        10
           1       0.67      0.67      0.67        12

    accuracy                           0.64        22
   macro avg       0.63      0.63      0.63        22
weighted avg       0.64      0.64      0.64        22



Okay, removing the features that are poorly correlated with the target did not help: accuracy dropped from 72.73% to 63.64%. (Why?)

## 1.3 Data version 3

In [79]:
# Read data version 3 from the local data/ folder.
df = pd.read_csv('data/base_v3.csv')

# Column 'Y' is the target; every other column is used as a feature.
y = df['Y']
X = df.drop('Y', axis=1)

# Hold out 20% of the rows for testing (80% train); random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display (rows, columns) of the loaded frame as the cell output.
df.shape

(109, 7)

In [80]:
# Fit a logistic regression with library defaults (only the seed is set) on the
# training split; fit() hands back the estimator, so it can be chained.
LR = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = LR.predict(X_test)

# Hold-out accuracy, printed as a percentage with two decimals.
print(
    'Base line Accuracy score: {:.2%} '.format(
        accuracy_score(y_test, y_pred)
    )
)

# Per-class precision / recall / f1 on the same hold-out predictions.
print(classification_report(y_test, y_pred))

Base line Accuracy score: 54.55% 
              precision    recall  f1-score   support

           0       0.50      0.30      0.37        10
           1       0.56      0.75      0.64        12

    accuracy                           0.55        22
   macro avg       0.53      0.53      0.51        22
weighted avg       0.53      0.55      0.52        22



## 1.4 Data version 4

In [81]:
# Read data version 4 from the local data/ folder.
df = pd.read_csv('data/base_v4.csv')

# Column 'Y' is the target; every other column is used as a feature.
y = df['Y']
X = df.drop('Y', axis=1)

# Hold out 20% of the rows for testing (80% train); random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display (rows, columns) of the loaded frame as the cell output.
df.shape

(109, 5)

In [82]:
# Fit a logistic regression with library defaults (only the seed is set) on the
# training split; fit() hands back the estimator, so it can be chained.
LR = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = LR.predict(X_test)

# Hold-out accuracy, printed as a percentage with two decimals.
print(
    'Base line Accuracy score: {:.2%} '.format(
        accuracy_score(y_test, y_pred)
    )
)

# Per-class precision / recall / f1 on the same hold-out predictions.
print(classification_report(y_test, y_pred))

Base line Accuracy score: 59.09% 
              precision    recall  f1-score   support

           0       0.57      0.40      0.47        10
           1       0.60      0.75      0.67        12

    accuracy                           0.59        22
   macro avg       0.59      0.57      0.57        22
weighted avg       0.59      0.59      0.58        22



## 1.5 Data version 5

In [83]:
# Read data version 5 from the local data/ folder.
df = pd.read_csv('data/base_v5.csv')

# Column 'Y' is the target; every other column is used as a feature.
y = df['Y']
X = df.drop('Y', axis=1)

# Hold out 20% of the rows for testing (80% train); random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display (rows, columns) of the loaded frame as the cell output.
df.shape

(109, 6)

In [84]:
# Fit a logistic regression with library defaults (only the seed is set) on the
# training split; fit() hands back the estimator, so it can be chained.
LR = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = LR.predict(X_test)

# Hold-out accuracy, printed as a percentage with two decimals.
print(
    'Base line Accuracy score: {:.2%} '.format(
        accuracy_score(y_test, y_pred)
    )
)

# Per-class precision / recall / f1 on the same hold-out predictions.
print(classification_report(y_test, y_pred))

Base line Accuracy score: 68.18% 
              precision    recall  f1-score   support

           0       0.71      0.50      0.59        10
           1       0.67      0.83      0.74        12

    accuracy                           0.68        22
   macro avg       0.69      0.67      0.66        22
weighted avg       0.69      0.68      0.67        22



Good try, but not good enough: 68.18% is still below the 72.73% obtained with data version 1.