In [55]:
### 1. Data import and general settings
import pandas as pd 
import numpy as np

from sagemaker import get_execution_role

role = get_execution_role()
bucket = 'kgml-data'
data_key='KGML_2.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)
df=pd.read_csv(data_location,sep=';',decimal=",")


pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',500)

In [56]:
######## Chapter 6.2: Data Preparation #########

###      Includes following: 
#        1. Data Type conversions
#        2. Handling Missing Values - 6.2.1 
#        3. Variable Selection - 6.2.1 
#        4. Training/test split - 7.1.2, has to be done before undersampling. 
#        5. Balancing Target Distribution - 6.2.3 
#           5.1 SMOTE 
#           5.2 ADASYN
#           5.3 Under Sampling 

### 1. Data Type conversions: 

# Comvertimg variable types & setting number of decimals: 

# Transform int64 variables to categorical vairables 
df['MPostnr'] = df['MPostnr'].astype('category')
df['MBy'] = df['MBy'].astype('category')
df['AEPostnr'] = df['AEPostnr'].astype('category')
df['AEBy'] = df['AEBy'].astype('category')
df['BPostnr'] = df['BPostnr'].astype('category')
df['BBy'] = df['BBy'].astype('category')
df['B2Postnr'] = df['B2Postnr'].astype('category')
df['B2By'] = df['B2By'].astype('category')
df['B3Postnr'] = df['B3Postnr'].astype('category')
df['B3By'] = df['B3By'].astype('category')
df['Alder']=df['Alder'].astype('category')

# Transform integer variables to floats 
df['IndexKvartal']=df['IndexKvartal'].astype('float64')
df['IndexAar']=df['IndexAar'].astype('float64')
df['offentligeYdelser']=df['offentligeYdelser'].astype('float64')
df['privateIndtaegter']=df['privateIndtaegter'].astype('float64')

# Transform string/character variables to categorical variables 
df['MBy']=df['MBy'].cat.codes
df['AEBy']=df['AEBy'].cat.codes
df['BBy']=df['BBy'].cat.codes
df['B2By']=df['B2By'].cat.codes
df['B3By']=df['B3By'].cat.codes

# Set all cell values in DF to include only two decimals: 
df=df.round(2)

### 2. Handling Missing Values

# Set empty cells ("") to NAN
# df=df.replace(r'^\s*$', np.nan, regex=True)

### 3. Variable Selection 

# Variables being dropped due to correlation 
df=df.drop(['id','AEPostnr','BPostnr','B2Postnr','B3Postnr','MPostnr','IndexKvartal','B3By','B3Alder',
            'B2Alder','UDKTypeEkstraBTilskudEnlig','KTypePensKom','MKommune,,,,'],axis=1)


df=df.replace({'KÃ¿n': {'Mand':0, 'Kvinde':1}})
df=df.replace({'Alder': {'0 til 9':1,'10 til 19':1, '20 til 29':1, '30 til 39':2, '40 til 49':3, 
                         '50 til 59':4, '60 til 69':5, '70 til 79':5, '80 til 89':5, 
                         '90 til 90':5}})

In [57]:
### 4. Creating training/test split: 

from sklearn.model_selection import train_test_split

x=df.iloc[:,1:]
y=df.iloc[:,0]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state = 2)

print('Number of observations and columns in x train', x_train.shape)
print('Number of observations and columns in y train', y_train.shape)
print('Number of observations and columns in x test' , x_test.shape)
print('Number of observations and columns in y test' , y_test.shape)

Number of observations and columns in x train (727, 34)
Number of observations and columns in y train (727,)
Number of observations and columns in x test (486, 34)
Number of observations and columns in y test (486,)


In [58]:
### 5. Balancing dataset 
# From above it is clear that our distribution is highly skewed - SMOTE:  
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler

In [59]:
## 5.1 SMOTE 
sm=SMOTE(random_state=2)
x_train_smo,y_train_smo=sm.fit_sample(x_train,y_train)

## 5.2 ADASYN
sma=ADASYN(random_state=3)
x_train_ada,y_train_ada=sma.fit_sample(x_train,y_train)

## 5.3 Under Sampling
rus=RandomUnderSampler(random_state=4)
x_train_us,y_train_us=rus.fit_sample(x_train,y_train)
x_test_us,y_test_us=rus.fit_sample(x_test,y_test)

In [60]:
## Converting variables back to Data frames: 
# y_train_smo=pd.DataFrame(y_train_smo)
# y_train_smo =y_train_smo.rename(columns={0: 'Target'})
# 
# y_train_ada=pd.DataFrame(y_train_ada)
# y_train_ada =y_train_ada.rename(columns={0: 'Target'})
# 
# y_train_us=pd.DataFrame(y_train_us)
# y_train_us =y_train_us.rename(columns={0: 'Target'})

x_train_smo=pd.DataFrame(x_train_smo,columns=[x_test])
x_train_ada=pd.DataFrame(x_train_ada,columns=[x_test])
x_train_us=pd.DataFrame(x_train_us,columns=[x_test])
x_test_us=pd.DataFrame(x_test_us,columns=[x_test])

In [61]:
### 2. Logistic Regression 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Fitting Models:  
log_smo = LogisticRegression(solver='liblinear',penalty='l1',max_iter=4000, random_state=42)
log_smo.fit(x_train_smo, y_train_smo)
   
log_ada = LogisticRegression(solver='liblinear',penalty='l1',max_iter=4000, random_state=42)
log_ada.fit(x_train_ada, y_train_ada)

log_us = LogisticRegression(solver='liblinear',penalty='l1',max_iter=4000, random_state=42)
log_us.fit(x_train_us, y_train_us) 


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=4000,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [62]:
# Creating Predictions: 
y_pred_smo = log_smo.predict(x_test)
print('Accuracy of logistic regression classifier - SMO on test set: {:.3f}'.format(accuracy_score(y_test, y_pred_smo)))

y_pred_ada = log_ada.predict(x_test)
print('Accuracy of logistic regression classifier - ADA on test set: {:.3f}'.format(accuracy_score(y_test, y_pred_ada)))

y_pred_us = log_us.predict(x_test_us)
print('Accuracy of logistic regression classifier - US on test set: {:.3f}'.format(accuracy_score(y_test_us, y_pred_us)))

Accuracy of logistic regression classifier - SMO on test set: 0.693
Accuracy of logistic regression classifier - ADA on test set: 0.700
Accuracy of logistic regression classifier - US on test set: 0.458


In [63]:
# Confusion matrices
print('Confusion matrix - Logistic Regression with Smote:')
print(confusion_matrix(y_test, y_pred_smo))

print('\n','Confusion matrix - Logistic Regression with Adasyn:')
print(confusion_matrix(y_test, y_pred_ada))

print('\n','Confusion matrix - Logistic Regression with UnderSampling:')
print(confusion_matrix(y_test_us, y_pred_us))


Confusion matrix - Logistic Regression with Smote:
[[326 136]
 [ 13  11]]

 Confusion matrix - Logistic Regression with Adasyn:
[[328 134]
 [ 12  12]]

 Confusion matrix - Logistic Regression with UnderSampling:
[[ 9 15]
 [11 13]]


In [64]:
# Accuracy measures: 
from sklearn.metrics import classification_report
print('Performance Measures - Logistic Regression with Smote:')
print(classification_report(y_test, y_pred_smo)) 

print('\n','Performance Measures - Logistic Regression with Adasyn:')
print(classification_report(y_test,y_pred_ada))

print('\n','Performance Measures - Logistic Regression with US:')
print(classification_report(y_test_us,y_pred_us))


Performance Measures - Logistic Regression with Smote:
              precision    recall  f1-score   support

           0       0.96      0.71      0.81       462
           1       0.07      0.46      0.13        24

    accuracy                           0.69       486
   macro avg       0.52      0.58      0.47       486
weighted avg       0.92      0.69      0.78       486


 Performance Measures - Logistic Regression with Adasyn:
              precision    recall  f1-score   support

           0       0.96      0.71      0.82       462
           1       0.08      0.50      0.14        24

    accuracy                           0.70       486
   macro avg       0.52      0.60      0.48       486
weighted avg       0.92      0.70      0.78       486


 Performance Measures - Logistic Regression with US:
              precision    recall  f1-score   support

           0       0.45      0.38      0.41        24
           1       0.46      0.54      0.50        24

    accuracy   