In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split, learning_curve
from copy import deepcopy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, roc_curve)
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [56]:
# read in the data as a pandas dataframe
df = pd.read_csv("data.csv", skiprows = 0)


In [50]:
size = df.shape
print ("\033[1m" + "Rows     : " +  "\033[0m", size[0])
print ("\033[1m" + "Columns  : " + "\033[0m", size[1])
print ("\033[1m" + "\nColumn Names : \n" + "\033[0m", df.columns.tolist())
print ("\033[1m" + "\nNull values :  " + "\033[0m", df.isnull().sum().values.sum())

[1mRows     : [0m 19
[1mColumns  : [0m 17
[1m
Column Names : 
[0m ['col0', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'col12', 'col13', 'col14', 'y1', 'y2']
[1m
Null values :  [0m 0


In [3]:
## converting categorical variables into their dummy variables respectively
## to avoid perfect multicollinearity, we will drop a column - essentially only creating k - 1 columns 
## see https://towardsdatascience.com/one-hot-encoding-multicollinearity-and-the-dummy-variable-trap-b5840be3c41a for more info
def one_hot_encoding(df,cat_columns):
    #iterate through the categorical columns
    for cat in cat_columns:
        cat_value_list = sorted(df[cat].unique())
        length_list = len(cat_value_list)
        for x in range(0,length_list - 1):
            listValue = cat_value_list[x]
            encoded_name = str(cat) + "_encoded_" + str(listValue)
            df[encoded_name] = 0
            df.loc[df[cat] == listValue, encoded_name] = 1
        del df[cat]
    return df

In [52]:
cts_features = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'col12', 'col13', 'col14']


In [54]:
## Y1

df_logreg = deepcopy(df)
df_logreg = df_logreg[['col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'col12', 'col13', 'col14', 'y1']]

# Splitting 66/33
Y = df_logreg["y1"]
X = deepcopy(df_logreg)
X = one_hot_encoding(X, cts_features)
X_train, X_test = train_test_split(X, test_size= (1/3), random_state = 123)

# merge back for hot encoding  model performs better 
Y_train = X_train["y1"]
del X_train["y1"]

Y_test = X_test["y1"]
del X_test["y1"]

lr = LogisticRegression(fit_intercept=True, penalty='l2', random_state=123)
log_reg_model = lr.fit(X_train, Y_train)
y_pred = log_reg_model.predict(X_test)

print("Performance Metrix for Logistics Regression (Imbalanced)")
print("Accuracy                 : ", accuracy_score(Y_test, y_pred))
print("F1 Score                 : ", f1_score(Y_test, y_pred))
print("Recall                   : ", recall_score(Y_test, y_pred))
print("Area under Curve         : ", roc_auc_score(Y_test,y_pred))
print("\n","Confusion Matrix","\n", confusion_matrix(Y_test, y_pred),"\n")


Performance Metrix for Logistics Regression (Imbalanced)
Accuracy                 :  0.5714285714285714
F1 Score                 :  0.0
Recall                   :  0.0
Area under Curve         :  0.5

 Confusion Matrix 
 [[4 0]
 [3 0]] 



In [55]:
## Pc Check

df_logreg = deepcopy(df)
df_logreg = df_logreg[['col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'col12', 'col13', 'col14', 'y2']]

# Splitting 66/33
Y = df_logreg["y2"]
X = deepcopy(df_logreg)
X = one_hot_encoding(X, cts_features)
X_train, X_test = train_test_split(X, test_size= (1/3), random_state = 123)

# merge back for hot encoding  model performs better 
Y_train = X_train["y2"]
del X_train["y2"]

Y_test = X_test["y2"]
del X_test["y2"]

lr = LogisticRegression(fit_intercept=True, penalty='l2', random_state=123)
log_reg_model = lr.fit(X_train, Y_train)
y_pred = log_reg_model.predict(X_test)

print("Performance Metrix for Logistics Regression (Imbalanced)")
print("Accuracy                 : ", accuracy_score(Y_test, y_pred))
print("F1 Score                 : ", f1_score(Y_test, y_pred))
print("Recall                   : ", recall_score(Y_test, y_pred))
print("Area under Curve         : ", roc_auc_score(Y_test,y_pred))
print("\n","Confusion Matrix","\n", confusion_matrix(Y_test, y_pred),"\n")


Performance Metrix for Logistics Regression (Imbalanced)
Accuracy                 :  0.7142857142857143
F1 Score                 :  0.0
Recall                   :  0.0
Area under Curve         :  0.5

 Confusion Matrix 
 [[5 0]
 [2 0]] 

