# Imports

In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pickle


# Preparing dataset

In [105]:
# Raw column codes in default.csv mapped to the readable names used by the
# rest of the notebook. The dict's key order also fixes the column order of
# the resulting frame.
column_names = {
    'R0024200': '2.5$',
    'R0024300': '3.5$',
    'R0024400': '5.0$',
    'R0017200': 'graduation',
    'R0218001': 'childrens',
    'R0214700': 'race',
    'R0217501': 'married',
}

# Keep only the mapped columns, then rename them in one pass.
df = pd.read_csv('default.csv')[list(column_names)].rename(columns=column_names)
df.head()

Unnamed: 0,2.5$,3.5$,5.0$,graduation,childrens,race,married
0,0,1,-4,12,0,3,0
1,0,0,0,9,0,3,0
2,0,0,0,11,0,3,0
3,0,0,0,11,0,3,0
4,0,0,1,14,0,3,0


In [106]:
# Collapse the three indicator columns into a single `wage` class label:
#   1 -> '2.5$' == 1, 2 -> '3.5$' == 1, 3 -> '5.0$' == 1, 0 -> none of them.
# np.select evaluates the conditions in order and takes the first match, so
# a row with several indicators set gets the same label an if/elif chain
# would give it, without a Python-level loop over every row.
# NOTE(review): any value other than 1 (e.g. the -4 visible in the preview of
# the loaded frame) counts as "not set" and can end up in class 0 -- confirm
# that this is the intended handling of such codes.
wage_conditions = [
    df['2.5$'] == 1,
    df['3.5$'] == 1,
    df['5.0$'] == 1,
]
wage = np.select(wage_conditions, [1, 2, 3], default=0)

df['wage'] = wage

# Keep only the model inputs and the target, in a fixed column order.
df = df[['race', 'graduation', 'childrens', 'married', 'wage']]
df.head()

Unnamed: 0,race,graduation,childrens,married,wage
0,3,12,0,0,2
1,3,9,0,0,0
2,3,11,0,0,0
3,3,11,0,0,0
4,3,14,0,0,3


# Model training

In [107]:
# Separate the predictors from the target, then hold out 25% of the rows for
# evaluation. Stratifying on y keeps the class proportions the same in both
# splits; the fixed random_state makes the split repeatable.
feature_columns = ['race', 'graduation', 'childrens', 'married']
X = df[feature_columns]
y = df['wage']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=40,
    stratify=y,
)

In [108]:
# Logistic Regression Classification
# Fit the classifier on the training split only; the test split is reserved
# for evaluation.
log_reg = LogisticRegression(random_state=40, solver='liblinear')
log_reg.fit(X_train, y_train)

# Predict on both splits so the train and test metrics can be compared.
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)

# zero_division=0 reports precision/F1 as 0 for a class that never appears
# among the predictions, instead of emitting an undefined-metric warning for
# every affected score.
model_report_train = classification_report(y_train, y_train_pred, zero_division=0)
model_report_test = classification_report(y_test, y_test_pred, zero_division=0)

print('Classification Report for Train:\n',model_report_train)
print('Classification Report for Test:\n',model_report_test)



Classification Report for Train:
               precision    recall  f1-score   support

           0       0.37      0.77      0.50      2774
           1       0.38      0.34      0.36      2108
           2       0.31      0.23      0.27      2486
           3       0.00      0.00      0.00      2146

    accuracy                           0.36      9514
   macro avg       0.27      0.33      0.28      9514
weighted avg       0.27      0.36      0.29      9514

Classification Report for Test:
               precision    recall  f1-score   support

           0       0.37      0.77      0.50       925
           1       0.39      0.35      0.37       703
           2       0.34      0.25      0.29       829
           3       0.00      0.00      0.00       715

    accuracy                           0.37      3172
   macro avg       0.27      0.34      0.29      3172
weighted avg       0.28      0.37      0.30      3172



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Save model

In [109]:
# Serialize the fitted classifier to disk with pickle so it can be reloaded
# later. Pickle files should only be loaded from trusted sources.
model_path = 'model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(log_reg, model_file)