# Setting up environment and importing data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf

In [2]:
# All assignment CSVs are read from one folder; keeping the location in a single
# constant means a moved data directory only needs one edit.
DATA_DIR = "dsa3101-ay2122-sem1-assignment3"

sample = pd.read_csv(f"{DATA_DIR}/sample.csv")
sample2 = pd.read_csv(f"{DATA_DIR}/sample2.csv")
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 32 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                3000 non-null   int64  
 1   CHK_ACCT          3000 non-null   int64  
 2   DURATION          3000 non-null   int64  
 3   HISTORY           3000 non-null   int64  
 4   NEW_CAR           3000 non-null   int64  
 5   USED_CAR          3000 non-null   int64  
 6   FURNITURE         3000 non-null   int64  
 7   RADIO_TV          3000 non-null   int64  
 8   EDUCATION         2985 non-null   float64
 9   RETRAINING        3000 non-null   int64  
 10  AMOUNT            2994 non-null   float64
 11  SAV_ACCT          3000 non-null   int64  
 12  EMPLOYMENT        3000 non-null   int64  
 13  INSTALL_RATE      3000 non-null   int64  
 14  MALE_DIV          3000 non-null   int64  
 15  MALE_SINGLE       3000 non-null   int64  
 16  MALE_MAR_or_WID   3000 non-null   int64  


Drop the ID column, which is only a row identifier and not a useful predictor. The test IDs are kept aside in `IDs` first.

In [4]:
# Set the test identifiers aside, then strip the ID column from both frames.
IDs = test['ID'].to_frame()
train, test = (frame.drop(columns='ID') for frame in (train, test))
train.head()

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
0,3,16,2,0,0,0,0,0.0,1,3226.0,...,41,0,0.0,1,1,2.0,1,0,0,1
1,1,13,4,0,0,0,0,0.0,0,3158.0,...,42,0,0.0,0,1,2.0,1,1,0,1
2,1,16,2,1,0,0,0,0.0,0,1269.0,...,23,1,0.0,1,1,2.0,1,0,0,1
3,1,8,2,0,0,0,1,0.0,0,2575.0,...,23,0,1.0,0,1,1.0,1,0,0,0
4,0,16,2,1,0,0,0,0.0,0,5908.0,...,34,0,1.0,1,1,2.0,2,1,0,1


# Data Cleaning

## Check for duplicates

In [5]:
sum(train.duplicated())

0

No duplicates.

## Check for NAs

In [6]:
train.isnull().sum()

CHK_ACCT             0
DURATION             0
HISTORY              0
NEW_CAR              0
USED_CAR             0
FURNITURE            0
RADIO_TV             0
EDUCATION           15
RETRAINING           0
AMOUNT               6
SAV_ACCT             0
EMPLOYMENT           0
INSTALL_RATE         0
MALE_DIV             0
MALE_SINGLE          0
MALE_MAR_or_WID      0
CO_APPLICANT         0
GUARANTOR            0
PRESENT_RESIDENT     0
REAL_ESTATE          0
PROP_UNKN_NONE       0
AGE                  0
OTHER_INSTALL        0
RENT                24
OWN_RES              0
NUM_CREDITS          0
JOB                 24
NUM_DEPENDENTS       0
TELEPHONE            0
FOREIGN              0
RESPONSE             0
dtype: int64

NAs appear in the Education, Amount, Rent and Job columns.
Here I presume that applicants left these fields blank to hide an answer they deemed 'undesirable' (e.g. the credit is not for education).
Hence I make the following changes:
- Education : 0 (Credit is not for Education)
- Amount : Remove 
- Rent : 1 (Applicant rents)
- Job : 0 (Unemployed / Unskilled)

In [7]:
# Missing EDUCATION is filled with 0 (credit is not for education).
train = train.fillna({'EDUCATION': 0})

In [8]:
# Missing RENT is filled with 1 (applicant rents).
train = train.fillna({'RENT': 1})

In [9]:
# Missing JOB is filled with 0 (unemployed / unskilled).
train = train.fillna({'JOB': 0})

In [10]:
# Remove only the rows whose AMOUNT is missing, then renumber the rows from 0.
# Restricting dropna() to this column keeps the cell in line with its stated
# purpose even if another column still holds (or later gains) missing values.
train = train.dropna(subset=['AMOUNT']).reset_index(drop=True)
train

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
0,3,16,2,0,0,0,0,0.0,1,3226.0,...,41,0,0.0,1,1,2.0,1,0,0,1
1,1,13,4,0,0,0,0,0.0,0,3158.0,...,42,0,0.0,0,1,2.0,1,1,0,1
2,1,16,2,1,0,0,0,0.0,0,1269.0,...,23,1,0.0,1,1,2.0,1,0,0,1
3,1,8,2,0,0,0,1,0.0,0,2575.0,...,23,0,1.0,0,1,1.0,1,0,0,0
4,0,16,2,1,0,0,0,0.0,0,5908.0,...,34,0,1.0,1,1,2.0,2,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2989,3,15,1,0,0,0,1,0.0,0,1569.0,...,34,1,0.0,1,1,1.0,2,0,0,1
2990,0,18,2,0,0,0,1,0.0,0,1936.0,...,23,0,1.0,0,2,1.0,1,0,0,1
2991,0,30,2,0,1,0,0,0.0,0,9999999.0,...,40,0,0.0,1,1,3.0,1,1,0,1
2992,3,12,2,0,0,0,1,0.0,0,804.0,...,38,0,0.0,1,1,2.0,1,0,0,1


## Inspecting the numerical columns

In [11]:
train['DURATION'].value_counts().to_dict()

{24: 257,
 12: 234,
 18: 151,
 9: 145,
 10: 136,
 15: 125,
 23: 119,
 11: 116,
 6: 109,
 13: 105,
 8: 94,
 36: 88,
 25: 82,
 22: 80,
 14: 80,
 16: 70,
 7: 69,
 17: 69,
 21: 68,
 19: 53,
 20: 45,
 30: 44,
 48: 42,
 5: 37,
 33: 37,
 4: 28,
 42: 28,
 26: 28,
 40: 27,
 39: 27,
 38: 26,
 34: 26,
 41: 23,
 37: 23,
 45: 22,
 35: 21,
 44: 21,
 29: 20,
 32: 20,
 31: 20,
 49: 19,
 27: 18,
 28: 17,
 46: 15,
 47: 13,
 43: 11,
 60: 10,
 53: 9,
 56: 9,
 50: 8,
 3: 7,
 1: 6,
 51: 5,
 54: 5,
 55: 5,
 2: 4,
 52: 4,
 58: 3,
 62: 3,
 59: 2,
 61: 2,
 68: 1,
 65: 1,
 63: 1,
 72: 1}

In [12]:
#train['DURATION'] = train['DURATION']/12

In [13]:
train.head()

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
0,3,16,2,0,0,0,0,0.0,1,3226.0,...,41,0,0.0,1,1,2.0,1,0,0,1
1,1,13,4,0,0,0,0,0.0,0,3158.0,...,42,0,0.0,0,1,2.0,1,1,0,1
2,1,16,2,1,0,0,0,0.0,0,1269.0,...,23,1,0.0,1,1,2.0,1,0,0,1
3,1,8,2,0,0,0,1,0.0,0,2575.0,...,23,0,1.0,0,1,1.0,1,0,0,0
4,0,16,2,1,0,0,0,0.0,0,5908.0,...,34,0,1.0,1,1,2.0,2,1,0,1


Duration column looks okay.

In [14]:
train['AMOUNT'].value_counts().to_dict()

{9999999.0: 59,
 1532.0: 5,
 1582.0: 5,
 909.0: 5,
 1082.0: 5,
 2096.0: 4,
 1940.0: 4,
 1231.0: 4,
 1092.0: 4,
 1262.0: 4,
 1928.0: 4,
 1169.0: 4,
 802.0: 4,
 1920.0: 4,
 1047.0: 4,
 1444.0: 4,
 1344.0: 4,
 1123.0: 4,
 3195.0: 4,
 1769.0: 4,
 1605.0: 3,
 1110.0: 3,
 1871.0: 3,
 1424.0: 3,
 1287.0: 3,
 960.0: 3,
 2337.0: 3,
 2303.0: 3,
 1564.0: 3,
 1048.0: 3,
 1085.0: 3,
 731.0: 3,
 1390.0: 3,
 1274.0: 3,
 1040.0: 3,
 1413.0: 3,
 914.0: 3,
 2009.0: 3,
 1559.0: 3,
 958.0: 3,
 1238.0: 3,
 2133.0: 3,
 1603.0: 3,
 1993.0: 3,
 1108.0: 3,
 1773.0: 3,
 1368.0: 3,
 1064.0: 3,
 1052.0: 3,
 1222.0: 3,
 1474.0: 3,
 2028.0: 3,
 1471.0: 3,
 1756.0: 3,
 1884.0: 3,
 937.0: 3,
 2148.0: 3,
 1365.0: 3,
 1553.0: 3,
 2728.0: 3,
 867.0: 3,
 1199.0: 3,
 1258.0: 3,
 1659.0: 3,
 3777.0: 3,
 1437.0: 3,
 1106.0: 3,
 1217.0: 3,
 5595.0: 3,
 6055.0: 3,
 1516.0: 3,
 1282.0: 3,
 2122.0: 3,
 1371.0: 3,
 1721.0: 3,
 2600.0: 3,
 3617.0: 3,
 1563.0: 3,
 1737.0: 3,
 2164.0: 3,
 966.0: 3,
 1992.0: 3,
 1663.0: 3,
 2241.0: 

Some rows contain negative amount, while some are absurdly high (9999999).

Let threshold be the max amount in test data.

In [15]:
# Keep only plausible loan amounts: non-negative and no larger than the largest
# amount present in the test set.
# Series.max() is used rather than the builtin max(): it skips NaN instead of
# letting a missing value poison the comparison, and it runs vectorised.
amount_cap = test['AMOUNT'].max()
# between() is inclusive on both ends, i.e. 0 <= AMOUNT <= amount_cap.
train = train[train['AMOUNT'].between(0, amount_cap)]
train

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
0,3,16,2,0,0,0,0,0.0,1,3226.0,...,41,0,0.0,1,1,2.0,1,0,0,1
1,1,13,4,0,0,0,0,0.0,0,3158.0,...,42,0,0.0,0,1,2.0,1,1,0,1
2,1,16,2,1,0,0,0,0.0,0,1269.0,...,23,1,0.0,1,1,2.0,1,0,0,1
3,1,8,2,0,0,0,1,0.0,0,2575.0,...,23,0,1.0,0,1,1.0,1,0,0,0
4,0,16,2,1,0,0,0,0.0,0,5908.0,...,34,0,1.0,1,1,2.0,2,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2988,3,13,2,0,0,0,1,0.0,0,1409.0,...,64,0,0.0,1,1,2.0,1,0,0,1
2989,3,15,1,0,0,0,1,0.0,0,1569.0,...,34,1,0.0,1,1,1.0,2,0,0,1
2990,0,18,2,0,0,0,1,0.0,0,1936.0,...,23,0,1.0,0,2,1.0,1,0,0,1
2992,3,12,2,0,0,0,1,0.0,0,804.0,...,38,0,0.0,1,1,2.0,1,0,0,1


70 rows removed.

In [16]:
train['INSTALL_RATE'].value_counts().to_dict()

{3: 1076, 2: 761, 4: 540, 1: 475, 0: 69}

In [17]:
test['INSTALL_RATE'].value_counts().to_dict()

{3: 373, 2: 275, 4: 177, 1: 175}

The test set has no rows with an installment rate of 0, so the 69 training rows with INSTALL_RATE = 0 are dropped.

In [18]:
# Discard applicants whose instalment rate is recorded as 0.
has_install_rate = train['INSTALL_RATE'].ne(0)
train = train[has_install_rate]
train

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
0,3,16,2,0,0,0,0,0.0,1,3226.0,...,41,0,0.0,1,1,2.0,1,0,0,1
1,1,13,4,0,0,0,0,0.0,0,3158.0,...,42,0,0.0,0,1,2.0,1,1,0,1
2,1,16,2,1,0,0,0,0.0,0,1269.0,...,23,1,0.0,1,1,2.0,1,0,0,1
4,0,16,2,1,0,0,0,0.0,0,5908.0,...,34,0,1.0,1,1,2.0,2,1,0,1
5,1,17,2,0,0,0,1,0.0,0,742.0,...,31,0,1.0,1,1,2.0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2988,3,13,2,0,0,0,1,0.0,0,1409.0,...,64,0,0.0,1,1,2.0,1,0,0,1
2989,3,15,1,0,0,0,1,0.0,0,1569.0,...,34,1,0.0,1,1,1.0,2,0,0,1
2990,0,18,2,0,0,0,1,0.0,0,1936.0,...,23,0,1.0,0,2,1.0,1,0,0,1
2992,3,12,2,0,0,0,1,0.0,0,804.0,...,38,0,0.0,1,1,2.0,1,0,0,1


In [19]:
train['AGE'].value_counts().to_dict()

{23: 137,
 27: 132,
 28: 125,
 24: 121,
 25: 119,
 26: 114,
 33: 110,
 37: 99,
 35: 97,
 34: 94,
 31: 91,
 32: 91,
 30: 87,
 36: 86,
 22: 84,
 38: 81,
 29: 78,
 39: 75,
 40: 68,
 42: 68,
 43: 55,
 21: 53,
 44: 51,
 41: 49,
 49: 43,
 46: 40,
 20: 39,
 45: 38,
 48: 37,
 47: 37,
 50: 29,
 999: 28,
 51: 27,
 57: 25,
 52: 25,
 55: 23,
 53: 23,
 66: 21,
 54: 21,
 61: 21,
 58: 21,
 19: 19,
 59: 17,
 64: 16,
 56: 16,
 18: 14,
 62: 14,
 63: 13,
 60: 12,
 67: 9,
 65: 8,
 70: 8,
 68: 7,
 72: 6,
 69: 6,
 17: 5,
 74: 5,
 16: 4,
 71: 2,
 77: 2,
 76: 2,
 14: 1,
 78: 1,
 73: 1,
 75: 1}

Age of 999 seems unreasonable, hence I remove the 28 rows.

In [20]:
# 999 is treated as an invalid AGE entry; every other row is kept.
valid_age = train['AGE'].ne(999)
train = train[valid_age]
train

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
0,3,16,2,0,0,0,0,0.0,1,3226.0,...,41,0,0.0,1,1,2.0,1,0,0,1
1,1,13,4,0,0,0,0,0.0,0,3158.0,...,42,0,0.0,0,1,2.0,1,1,0,1
2,1,16,2,1,0,0,0,0.0,0,1269.0,...,23,1,0.0,1,1,2.0,1,0,0,1
4,0,16,2,1,0,0,0,0.0,0,5908.0,...,34,0,1.0,1,1,2.0,2,1,0,1
5,1,17,2,0,0,0,1,0.0,0,742.0,...,31,0,1.0,1,1,2.0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2988,3,13,2,0,0,0,1,0.0,0,1409.0,...,64,0,0.0,1,1,2.0,1,0,0,1
2989,3,15,1,0,0,0,1,0.0,0,1569.0,...,34,1,0.0,1,1,1.0,2,0,0,1
2990,0,18,2,0,0,0,1,0.0,0,1936.0,...,23,0,1.0,0,2,1.0,1,0,0,1
2992,3,12,2,0,0,0,1,0.0,0,804.0,...,38,0,0.0,1,1,2.0,1,0,0,1


In [21]:
train['NUM_CREDITS'].value_counts().to_dict()

{1: 2272, 2: 475, 3: 73, 4: 4}

In [22]:
test['NUM_CREDITS'].value_counts().to_dict()

{1: 863, 2: 128, 3: 8, 4: 1}

Number of credits looks okay.

In [23]:
train['NUM_DEPENDENTS'].value_counts().to_dict()

{1: 2367, 2: 457}

In [24]:
test['NUM_DEPENDENTS'].value_counts().to_dict()

{1: 891, 2: 109}

Number of dependents looks okay.

## Inspecting categorical columns : Is the data corroborative?

There are 3 columns regarding the marital status of the applicant
- The three indicators are mutually exclusive, so their sum should never exceed 1. A sum above 1 indicates a problematic response (the applicant should fall under exactly one of the three marital statuses). One likely cause is the applicant treating "single" as also covering "divorced"; in that case we change the 1 in the single column to 0.

In [25]:
train[train['MALE_DIV'] + train['MALE_SINGLE'] + train['MALE_MAR_or_WID'] == 3]

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
70,1,32,4,0,1,1,0,0.0,0,7709.0,...,30,0,0.0,1,2,3.0,1,1,0,0
106,0,21,4,0,0,0,1,0.0,0,5004.0,...,48,0,0.0,1,1,3.0,1,1,0,0
1242,3,12,2,1,1,0,1,0.0,0,2623.0,...,24,0,1.0,1,1,2.0,1,1,0,1
1628,0,33,0,0,0,0,0,0.0,1,6657.0,...,26,0,0.0,1,1,3.0,2,1,0,0
2049,3,24,2,0,0,0,0,0.0,0,877.0,...,22,0,0.0,1,1,1.0,1,1,0,1


In [26]:
test[test['MALE_DIV'] + test['MALE_MAR_or_WID'] + test['MALE_SINGLE'] == 3]

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,PROP_UNKN_NONE,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN


Hard to ascertain what their status is, or explain why these people have indicated 'Yes' to all 3 options. So we drop these 5 rows

No such instances in the test set as well. Drop.

In [27]:
# Drop applicants who ticked all three marital-status indicators at once.
marital_cols = ['MALE_DIV', 'MALE_SINGLE', 'MALE_MAR_or_WID']
all_three_flagged = train[marital_cols].sum(axis=1).eq(3)
train = train.drop(index=train.index[all_three_flagged])
train

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
0,3,16,2,0,0,0,0,0.0,1,3226.0,...,41,0,0.0,1,1,2.0,1,0,0,1
1,1,13,4,0,0,0,0,0.0,0,3158.0,...,42,0,0.0,0,1,2.0,1,1,0,1
2,1,16,2,1,0,0,0,0.0,0,1269.0,...,23,1,0.0,1,1,2.0,1,0,0,1
4,0,16,2,1,0,0,0,0.0,0,5908.0,...,34,0,1.0,1,1,2.0,2,1,0,1
5,1,17,2,0,0,0,1,0.0,0,742.0,...,31,0,1.0,1,1,2.0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2988,3,13,2,0,0,0,1,0.0,0,1409.0,...,64,0,0.0,1,1,2.0,1,0,0,1
2989,3,15,1,0,0,0,1,0.0,0,1569.0,...,34,1,0.0,1,1,1.0,2,0,0,1
2990,0,18,2,0,0,0,1,0.0,0,1936.0,...,23,0,1.0,0,2,1.0,1,0,0,1
2992,3,12,2,0,0,0,1,0.0,0,804.0,...,38,0,0.0,1,1,2.0,1,0,0,1


Combinations for score of 2 : 
1. Single + Divorced 
2. Single + Married / Widowed
3. Divorced + Married / widowed

Person might have misunderstood 'Single' if he indicated 1 for both Divorced and Single variable. Hence I might have to change the input for Single to 0 for these instances

In [28]:
train[train['MALE_DIV'] + train['MALE_SINGLE'] ==2 ]

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
40,3,14,4,0,0,0,0,0.0,0,6603.0,...,23,0,0.0,1,1,2.0,1,1,0,1
77,0,22,3,0,0,0,0,0.0,0,4256.0,...,44,0,0.0,1,1,1.0,1,0,0,1
156,0,9,4,1,0,0,0,0.0,0,2866.0,...,55,1,0.0,0,1,2.0,2,1,0,1
192,3,27,4,0,0,1,0,0.0,0,7115.0,...,32,0,0.0,1,1,2.0,1,1,0,1
229,0,25,2,0,0,0,0,0.0,0,2723.0,...,31,0,0.0,1,1,2.0,1,1,0,1
270,2,16,3,0,0,0,0,0.0,0,11717.0,...,34,0,1.0,0,1,1.0,1,0,0,0
298,1,37,2,1,1,0,1,0.0,0,6649.0,...,38,0,0.0,0,1,2.0,1,0,0,0
299,1,10,4,0,0,0,0,0.0,0,1563.0,...,23,1,0.0,1,1,1.0,1,0,0,1
322,2,23,2,1,0,1,0,0.0,0,5080.0,...,34,0,0.0,1,1,1.0,2,1,0,1
338,0,20,2,0,0,1,0,0.0,0,557.0,...,34,0,0.0,1,1,3.0,1,0,0,1


In [29]:
test[test['MALE_DIV'] + test['MALE_SINGLE'] == 2]

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,PROP_UNKN_NONE,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN
142,1,8,2,0,0,0,0,0,0,3549,...,0,41,1,1,1,1,2,1,0,0
388,3,10,2,0,0,0,1,0,0,2366,...,0,40,0,0,1,1,2,1,0,0
451,0,40,4,0,0,0,1,0,0,2617,...,0,36,0,0,1,1,2,1,1,0
489,3,5,0,0,0,1,0,0,0,5850,...,0,37,0,0,1,1,1,1,0,0
508,3,12,2,0,0,1,0,0,0,4348,...,0,43,0,1,0,1,2,2,0,0
682,1,8,2,0,0,0,0,0,0,1439,...,0,42,0,0,1,2,2,1,0,0
686,3,50,4,0,0,0,0,0,0,2095,...,0,35,0,0,1,1,1,2,0,1
850,1,21,4,0,0,0,0,0,0,3065,...,1,41,0,0,1,1,2,1,1,0
878,3,23,4,0,0,0,0,0,0,676,...,0,33,1,0,1,1,1,2,0,0
937,1,13,3,0,0,1,0,0,0,5816,...,0,47,1,0,1,2,1,1,0,0


But there are 11 such instances in test set. Don't drop or change.

In [30]:
# An applicant flagged as both divorced and single is treated as divorced:
# clear MALE_SINGLE on those rows with one boolean-mask assignment.
# NOTE(review): only `train` is recoded here; confirm the test rows with this
# combination are handled consistently before predicting.
div_and_single = (train['MALE_DIV'] + train['MALE_SINGLE']) == 2
train.loc[div_and_single, 'MALE_SINGLE'] = 0

Single + Married / Widowed :
- Likely scenario is similar to that of Single + Divorce, where a widowed person misunderstood single. Hence I make the same changes as before

In [31]:
train[train['MALE_MAR_or_WID'] + train['MALE_SINGLE'] ==2 ]

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
135,0,18,4,0,0,1,0,0.0,0,9626.0,...,25,1,0.0,0,2,2.0,2,0,0,1
142,0,43,3,1,0,0,0,0.0,0,16554.0,...,44,0,0.0,1,1,2.0,1,1,0,0
340,3,7,3,0,0,0,0,0.0,0,2338.0,...,63,0,0.0,1,1,2.0,1,1,0,1
357,3,10,2,1,0,0,0,0.0,0,2448.0,...,33,0,0.0,1,1,2.0,1,1,0,1
378,0,5,2,0,0,1,0,0.0,0,1329.0,...,35,0,1.0,0,1,2.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2087,0,38,1,0,0,1,1,0.0,0,5791.0,...,37,0,1.0,0,1,1.0,1,0,0,1
2158,0,21,2,0,0,1,0,0.0,0,6428.0,...,33,0,0.0,1,1,1.0,1,0,0,0
2213,1,5,3,0,0,0,0,0.0,0,3294.0,...,28,0,1.0,1,1,2.0,2,0,0,1
2242,0,13,4,0,0,1,0,0.0,0,10027.0,...,23,0,0.0,0,2,1.0,1,1,0,1


In [32]:
test[test['MALE_MAR_or_WID'] + test['MALE_SINGLE'] == 2]

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,PROP_UNKN_NONE,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN
25,0,41,0,0,0,0,0,0,0,9516,...,0,31,0,0,0,1,2,1,1,0
146,1,20,4,1,0,0,0,0,0,1212,...,0,60,0,0,1,1,1,1,0,0
149,3,56,4,0,0,0,0,0,0,2312,...,0,39,0,0,0,1,3,2,0,0
157,3,7,4,0,0,0,1,0,0,6795,...,0,44,0,0,1,1,1,1,0,0
277,3,23,4,1,0,0,0,0,0,426,...,0,23,0,0,1,1,2,1,0,1
423,3,54,2,0,0,0,0,0,0,2931,...,0,40,0,0,1,2,2,1,1,0
471,3,16,4,0,0,0,0,0,1,846,...,0,43,0,0,1,1,1,2,0,0
685,3,16,2,0,0,0,0,0,0,1566,...,0,32,1,0,0,1,1,1,0,0
692,0,27,2,0,0,0,1,0,0,892,...,0,40,0,0,1,1,2,1,0,0
709,0,12,2,0,0,0,0,0,0,6447,...,0,25,0,0,1,1,2,1,0,0


Again, 15 such instances in the test set. Don't drop or change

In [33]:
# An applicant flagged as both married/widowed and single is treated as
# married/widowed: clear MALE_SINGLE on those rows with one boolean-mask assignment.
# NOTE(review): only `train` is recoded here; confirm the test rows with this
# combination are handled consistently before predicting.
married_and_single = (train['MALE_MAR_or_WID'] + train['MALE_SINGLE']) == 2
train.loc[married_and_single, 'MALE_SINGLE'] = 0

Lastly, 'Widowed' vs 'Divorced' is hard to ascertain so I remove these rows.

In [34]:
train[train['MALE_DIV'] + train['MALE_MAR_or_WID'] ==2]

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
102,3,18,2,0,0,1,0,0.0,0,1011.0,...,21,0,0.0,0,1,2.0,1,0,0,1
791,3,17,1,1,0,1,1,0.0,0,1564.0,...,21,1,1.0,1,1,2.0,1,1,0,1
1239,3,32,2,0,0,0,0,0.0,0,3528.0,...,43,0,1.0,0,1,1.0,1,0,0,0
1539,3,12,2,0,0,0,1,0.0,0,1564.0,...,40,1,0.0,1,1,2.0,1,0,0,1
1669,3,23,2,0,0,0,0,0.0,0,1866.0,...,23,0,1.0,0,1,2.0,1,1,0,1
2080,3,14,4,0,0,1,0,0.0,0,3019.0,...,45,0,0.0,1,1,1.0,1,1,0,1
2261,0,15,3,0,0,0,0,0.0,0,984.0,...,43,0,0.0,1,1,2.0,1,0,0,1


In [35]:
test[test['MALE_DIV'] + test['MALE_MAR_or_WID'] == 2]

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,PROP_UNKN_NONE,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN
117,3,11,4,0,0,0,1,0,0,1991,...,0,41,0,0,1,1,2,1,1,0


1 such instance in test set. Hence should be safe to drop.

In [36]:
# Drop applicants flagged as both divorced and married/widowed.
div_and_married = train['MALE_DIV'].add(train['MALE_MAR_or_WID']).eq(2)
train = train.drop(index=train.index[div_and_married])
train

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
0,3,16,2,0,0,0,0,0.0,1,3226.0,...,41,0,0.0,1,1,2.0,1,0,0,1
1,1,13,4,0,0,0,0,0.0,0,3158.0,...,42,0,0.0,0,1,2.0,1,1,0,1
2,1,16,2,1,0,0,0,0.0,0,1269.0,...,23,1,0.0,1,1,2.0,1,0,0,1
4,0,16,2,1,0,0,0,0.0,0,5908.0,...,34,0,1.0,1,1,2.0,2,1,0,1
5,1,17,2,0,0,0,1,0.0,0,742.0,...,31,0,1.0,1,1,2.0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2988,3,13,2,0,0,0,1,0.0,0,1409.0,...,64,0,0.0,1,1,2.0,1,0,0,1
2989,3,15,1,0,0,0,1,0.0,0,1569.0,...,34,1,0.0,1,1,1.0,2,0,0,1
2990,0,18,2,0,0,0,1,0.0,0,1936.0,...,23,0,1.0,0,2,1.0,1,0,0,1
2992,3,12,2,0,0,0,1,0.0,0,804.0,...,38,0,0.0,1,1,2.0,1,0,0,1


There are also 2 columns regarding the applicant's home : Whether he/she rents and whether he/she owns a residence.

Hence the only combination that would not make sense is an applicant replying yes to both.

In [37]:
test[test['RENT'] + test['OWN_RES'] == 2]

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,PROP_UNKN_NONE,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN
77,1,41,2,0,0,1,1,0,0,5141,...,0,22,0,1,1,1,3,1,1,0
132,3,14,2,0,0,1,0,0,0,1393,...,1,36,0,1,1,1,1,1,1,0
142,1,8,2,0,0,0,0,0,0,3549,...,0,41,1,1,1,1,2,1,0,0
268,3,12,2,0,0,0,1,0,0,3108,...,0,25,0,1,1,1,2,1,1,0
308,0,6,2,0,1,1,1,0,0,1703,...,0,44,0,1,1,1,1,1,1,0
339,0,23,2,0,0,0,1,0,0,2401,...,0,26,0,1,1,1,2,1,0,0
383,3,24,2,1,0,0,1,0,1,6824,...,0,42,1,1,1,1,1,1,0,0
406,1,9,2,1,0,0,0,0,0,2975,...,0,44,0,1,1,1,1,1,0,0
418,0,23,2,0,0,1,0,0,0,1031,...,0,22,0,1,1,1,1,1,0,0
436,1,25,2,0,0,1,1,0,0,3958,...,1,29,0,1,1,1,3,1,1,0


But there are 24 instances in the test set. Don't drop

In [38]:
# Remove applicants flagged as both renting and owning their residence, then
# renumber the remaining rows from 0.
# NOTE(review): the markdown just before this cell says "Don't drop" because the
# test set contains this combination too, yet the rows are dropped here —
# confirm which of the two is intended.
rents_and_owns = (train['RENT'] + train['OWN_RES']).eq(2)
train = train[~rents_and_owns].reset_index(drop=True)
train

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
0,3,16,2,0,0,0,0,0.0,1,3226.0,...,41,0,0.0,1,1,2.0,1,0,0,1
1,1,13,4,0,0,0,0,0.0,0,3158.0,...,42,0,0.0,0,1,2.0,1,1,0,1
2,1,16,2,1,0,0,0,0.0,0,1269.0,...,23,1,0.0,1,1,2.0,1,0,0,1
3,3,9,4,0,0,0,0,0.0,0,3801.0,...,37,0,0.0,1,1,2.0,1,0,0,1
4,3,6,4,0,0,0,1,0.0,0,4084.0,...,38,0,0.0,1,1,2.0,2,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683,3,13,2,0,0,0,1,0.0,0,1409.0,...,64,0,0.0,1,1,2.0,1,0,0,1
2684,3,15,1,0,0,0,1,0.0,0,1569.0,...,34,1,0.0,1,1,1.0,2,0,0,1
2685,0,18,2,0,0,0,1,0.0,0,1936.0,...,23,0,1.0,0,2,1.0,1,0,0,1
2686,3,12,2,0,0,0,1,0.0,0,804.0,...,38,0,0.0,1,1,2.0,1,0,0,1


# Split into training and validation data

In [39]:
# Select the predictors and the target by column label rather than by fixed
# positions, so the split stays correct if columns are added, removed or
# reordered upstream. RESPONSE is the target; every other column is a predictor.
X = train.drop(columns='RESPONSE')
y = train['RESPONSE']

In [40]:
# Hold out 20% of the cleaned training rows for validation; the fixed seed keeps
# the split reproducible. X_test / y_test are this hold-out, not the `test` frame.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [41]:
len(X_train), len(X_test), len(y_train), len(y_test)

(2150, 538, 2150, 538)

# Logistic Regression

In [42]:
all_predictors = "+".join(X_train.columns)
print(all_predictors)

CHK_ACCT+DURATION+HISTORY+NEW_CAR+USED_CAR+FURNITURE+RADIO_TV+EDUCATION+RETRAINING+AMOUNT+SAV_ACCT+EMPLOYMENT+INSTALL_RATE+MALE_DIV+MALE_SINGLE+MALE_MAR_or_WID+CO_APPLICANT+GUARANTOR+PRESENT_RESIDENT+REAL_ESTATE+PROP_UNKN_NONE+AGE+OTHER_INSTALL+RENT+OWN_RES+NUM_CREDITS+JOB+NUM_DEPENDENTS+TELEPHONE+FOREIGN


In [113]:
# Preview of the predictors joined with the target; the same frame is rebuilt
# as `train_data` in the following cell.
# NOTE(review): this cell's execution count (113) is out of sequence with its
# neighbours (42 and 43) — re-run the notebook top to bottom before sharing.
pd.concat([X_train, y_train], axis = 1)

Unnamed: 0,CHK_ACCT,DURATION,HISTORY,NEW_CAR,USED_CAR,FURNITURE,RADIO_TV,EDUCATION,RETRAINING,AMOUNT,...,AGE,OTHER_INSTALL,RENT,OWN_RES,NUM_CREDITS,JOB,NUM_DEPENDENTS,TELEPHONE,FOREIGN,RESPONSE
610,3,18,3,0,0,1,0,1.0,0,2043.0,...,34,0,0.0,1,1,1.0,1,1,0,1
2004,0,24,2,0,0,1,0,0.0,0,4020.0,...,27,1,0.0,1,1,2.0,1,0,0,1
1406,1,5,4,1,0,0,0,0.0,0,1425.0,...,35,0,0.0,0,1,1.0,2,0,0,0
1213,3,10,4,1,0,0,1,0.0,0,450.0,...,46,0,0.0,1,1,2.0,2,0,0,1
57,3,16,0,1,0,0,0,0.0,0,985.0,...,27,0,0.0,1,1,2.0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,2,11,2,1,0,0,0,0.0,0,587.0,...,43,0,0.0,0,1,1.0,1,0,0,1
1095,3,23,3,0,0,0,1,0.0,0,1100.0,...,22,0,0.0,1,1,1.0,1,0,0,1
1130,1,7,2,0,0,0,0,0.0,0,3990.0,...,25,0,0.0,1,1,1.0,1,1,0,0
1294,3,9,4,1,0,0,0,0.0,0,1047.0,...,31,0,0.0,1,2,2.0,2,0,0,1


In [43]:
# Fit a logit of RESPONSE on every predictor, using the training split only.
train_data = pd.concat([X_train, y_train], axis=1)
f = f"RESPONSE ~ {all_predictors}"
logitfit = smf.logit(formula=f, data=train_data).fit()
print(logitfit.summary())

Optimization terminated successfully.
         Current function value: 0.543203
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2119
Method:                           MLE   Df Model:                           30
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1223
Time:                        09:33:49   Log-Likelihood:                -1167.9
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 2.326e-51
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.6655      0.399     -1.670      0.095      -1.447       0.116
CHK_ACCT   

## Removing high p value variables

In [44]:
# Columns excluded from the right-hand side of the formula. It starts with the
# target and grows with every call to remove_name_formula().
droplist = ['RESPONSE']


def remove_name_formula(toremove, data=None):
    """Exclude more predictors and return the updated logit formula.

    The names in ``toremove`` are added to the module-level ``droplist``, so
    exclusions accumulate across calls. The formula is then built from every
    column of ``data`` that is not in ``droplist``, in the frame's column order.

    Parameters
    ----------
    toremove : str or iterable of str
        Column name(s) to exclude from now on. A bare string is treated as a
        single column name.
    data : pandas.DataFrame, optional
        Frame whose columns supply the predictor names. Defaults to the
        module-level ``train_data``.

    Returns
    -------
    str
        A formula of the form ``'RESPONSE ~ col1+col2+...'``.

    Raises
    ------
    KeyError
        If a name in ``droplist`` is not a column of the frame.
    """
    global droplist
    if isinstance(toremove, str):
        # Without this, a bare string would fail on list concatenation.
        toremove = [toremove]

    # Build a new list and skip names already excluded, so re-running a cell
    # does not pile up duplicate entries.
    updated = list(droplist)
    for name in toremove:
        if name not in updated:
            updated.append(name)
    droplist = updated

    frame = train_data if data is None else data
    predictors = "+".join(frame.drop(columns=droplist).columns)
    return 'RESPONSE ~ ' + predictors

In [45]:
# Elimination step: add NEW_CAR to the dropped columns, then refit the logit
# model on the predictors that remain and print its summary.
newf = remove_name_formula(['NEW_CAR'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.543212
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2120
Method:                           MLE   Df Model:                           29
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1223
Time:                        09:34:04   Log-Likelihood:                -1167.9
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 6.986e-52
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.6775      0.394     -1.720      0.086      -1.450       0.

In [46]:
# Elimination step: add FURNITURE to the dropped columns, then refit the logit
# model on the predictors that remain and print its summary.
newf = remove_name_formula(['FURNITURE'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.543213
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2121
Method:                           MLE   Df Model:                           28
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1223
Time:                        09:34:13   Log-Likelihood:                -1167.9
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 2.024e-52
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.6759      0.389     -1.737      0.082      -1.438       0.

In [47]:
# Elimination step: add EDUCATION to the dropped columns, then refit the logit
# model on the predictors that remain and print its summary.
newf = remove_name_formula(['EDUCATION'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.543223
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2122
Method:                           MLE   Df Model:                           27
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1223
Time:                        09:34:25   Log-Likelihood:                -1167.9
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 5.878e-53
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.6754      0.389     -1.736      0.083      -1.438       0.

In [48]:
# Elimination step: add RENT to the dropped columns, then refit the logit
# model on the predictors that remain and print its summary.
newf = remove_name_formula(['RENT'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.543241
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2123
Method:                           MLE   Df Model:                           26
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1223
Time:                        09:34:40   Log-Likelihood:                -1168.0
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 1.701e-53
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.7170      0.360     -1.992      0.046      -1.423      -0.

In [49]:
# Elimination step: add CO_APPLICANT to the dropped columns, then refit the
# logit model on the predictors that remain and print its summary.
newf = remove_name_formula(['CO_APPLICANT'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.543297
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2124
Method:                           MLE   Df Model:                           25
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1222
Time:                        09:34:59   Log-Likelihood:                -1168.1
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 5.203e-54
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.7302      0.359     -2.034      0.042      -1.434      -0.

In [50]:
# Elimination step: add MALE_DIV to the dropped columns, then refit the logit
# model on the predictors that remain and print its summary.
newf = remove_name_formula(['MALE_DIV'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.543407
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2125
Method:                           MLE   Df Model:                           24
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1220
Time:                        09:35:10   Log-Likelihood:                -1168.3
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 1.738e-54
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.7134      0.358     -1.992      0.046      -1.415      -0.

In [51]:
# Elimination step: add NUM_CREDITS to the dropped columns, then refit the
# logit model on the predictors that remain and print its summary.
newf = remove_name_formula(['NUM_CREDITS'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.543562
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2126
Method:                           MLE   Df Model:                           23
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1217
Time:                        09:35:17   Log-Likelihood:                -1168.7
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 6.231e-55
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.6551      0.351     -1.868      0.062      -1.343       0.

In [52]:
# Elimination step: add MALE_MAR_or_WID to the dropped columns, then refit the
# logit model on the predictors that remain and print its summary.
newf = remove_name_formula(['MALE_MAR_or_WID'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.543748
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2127
Method:                           MLE   Df Model:                           22
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1214
Time:                        09:35:31   Log-Likelihood:                -1169.1
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 2.329e-55
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.6134      0.347     -1.765      0.078      -1.294       0.

In [53]:
# Elimination step: add NUM_DEPENDENTS to the dropped columns, then refit the
# logit model on the predictors that remain and print its summary.
newf = remove_name_formula(['NUM_DEPENDENTS'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.543986
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2128
Method:                           MLE   Df Model:                           21
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1211
Time:                        09:35:41   Log-Likelihood:                -1169.6
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 9.470e-56
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.7635      0.315     -2.426      0.015      -1.380      -0.

In [54]:
# Elimination step: add PRESENT_RESIDENT to the dropped columns, then refit the
# logit model on the predictors that remain and print its summary.
newf = remove_name_formula(['PRESENT_RESIDENT'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.544342
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2129
Method:                           MLE   Df Model:                           20
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1205
Time:                        09:36:03   Log-Likelihood:                -1170.3
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 4.783e-56
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -0.9123      0.291     -3.137      0.002      -1.482      -0.342
CH

In [55]:
# Elimination step: add JOB to the dropped columns, then refit the logit
# model on the predictors that remain and print its summary.
newf = remove_name_formula(['JOB'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.544708
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2130
Method:                           MLE   Df Model:                           19
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1199
Time:                        09:36:08   Log-Likelihood:                -1171.1
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 2.412e-56
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -1.0638      0.265     -4.020      0.000      -1.582      -0.545
CH

In [56]:
# Elimination step: add TELEPHONE to the dropped columns, then refit the logit
# model on the predictors that remain and print its summary.
newf = remove_name_formula(['TELEPHONE'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.545022
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2131
Method:                           MLE   Df Model:                           18
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1194
Time:                        09:36:15   Log-Likelihood:                -1171.8
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 1.069e-56
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -1.0672      0.265     -4.034      0.000      -1.586      -0.549
CH

In [57]:
# Elimination step: add FOREIGN to the dropped columns, then refit the logit
# model on the predictors that remain and print its summary.
newf = remove_name_formula(['FOREIGN'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.545467
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2132
Method:                           MLE   Df Model:                           17
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1187
Time:                        09:36:21   Log-Likelihood:                -1172.8
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 6.048e-57
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -1.0281      0.263     -3.911      0.000      -1.543      -0.513
CH

In [58]:
# Elimination step: add RETRAINING to the dropped columns, then refit the logit
# model on the predictors that remain and print its summary.
newf = remove_name_formula(['RETRAINING'])
logitfit = smf.logit(
    formula=newf,
    data=train_data,
).fit()
print(logitfit.summary())

<class 'list'>
Optimization terminated successfully.
         Current function value: 0.545938
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               RESPONSE   No. Observations:                 2150
Model:                          Logit   Df Residuals:                     2133
Method:                           MLE   Df Model:                           16
Date:                Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1179
Time:                        09:36:35   Log-Likelihood:                -1173.8
converged:                       True   LL-Null:                       -1330.7
Covariance Type:            nonrobust   LLR p-value:                 3.515e-57
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -1.0272      0.263     -3.910      0.000      -1.542      -0.512
CH

All remaining variables have a p-value of 0.05 or less.

In [59]:
# Score both splits with the most recently fitted logit model and report
# (training AUC, validation AUC).
train_log_preds = np.array(
    logitfit.predict(X_train),
    dtype=float,
)  # predictions on training set
test_log_preds = np.array(
    logitfit.predict(X_test),
    dtype=float,
)  # predictions on validation set
(
    metrics.roc_auc_score(y_train, train_log_preds),
    metrics.roc_auc_score(y_test, test_log_preds),
)

(0.7305978485223769, 0.7156737737463079)

# SMOTE Resampling

In [64]:
from imblearn.over_sampling import SMOTE
# Resample the training split only (the validation split is left untouched);
# the fixed random_state keeps the resampled set the same on every run.
smt = SMOTE(random_state=3101)
X_train_sm, y_train_sm = smt.fit_resample(
    X_train,
    y_train,
)

# Linear Discriminant Analysis

In [60]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [75]:
# Fit LDA (default settings) on the SMOTE-resampled training data.
lda = LinearDiscriminantAnalysis()
lda.fit(
    X_train_sm,
    y_train_sm,
)

LinearDiscriminantAnalysis()

In [76]:
# Keep the second predict_proba column as the score for each split.
train_lda_proba = lda.predict_proba(X_train_sm)
test_lda_proba = lda.predict_proba(X_test)
train_lda_preds = train_lda_proba[:, 1]
test_lda_preds = test_lda_proba[:, 1]

In [77]:
# (AUC on the resampled training data, AUC on the validation split)
(
    metrics.roc_auc_score(y_train_sm, train_lda_preds),
    metrics.roc_auc_score(y_test, test_lda_preds),
)

(0.8435590594372318, 0.6810852688154688)

# Support Vector Machine

In [65]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [81]:
# Linear-kernel SVM on standardized features, fitted on the SMOTE-resampled
# training data. With probability=True, SVC fits an internal cross-validated
# calibration that shuffles the data randomly, so random_state is pinned
# (same seed as the SMOTE step) to make predict_proba reproducible across runs.
svm = make_pipeline(
    StandardScaler(),
    SVC(kernel='linear', probability=True, C=50, random_state=3101),
)
svm.fit(X_train_sm, y_train_sm)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=50, kernel='linear', probability=True))])

In [82]:
# Keep the second predict_proba column as the score for each split.
svm_train_proba = svm.predict_proba(X_train_sm)
svm_test_proba = svm.predict_proba(X_test)
svm_train_preds = svm_train_proba[:, 1]
svm_test_preds = svm_test_proba[:, 1]

In [83]:
# (AUC on the resampled training data, AUC on the validation split)
(
    metrics.roc_auc_score(y_train_sm, svm_train_preds),
    metrics.roc_auc_score(y_test, svm_test_preds),
)

(0.8434419068446175, 0.6841357715835176)

# XGBoost

In [69]:
from xgboost import XGBClassifier

In [70]:
# Gradient-boosted classifier trained on the SMOTE-resampled training data.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=10,
    learning_rate=0.08,
    min_child_weight=10,
)
clf = XGBClassifier(**xgb_params)
clf.fit(X_train_sm, y_train_sm)

# Second predict_proba column for the resampled training data and the
# validation split.
y_pred = clf.predict_proba(X_train_sm)[:, 1]
y_pred2 = clf.predict_proba(X_test)[:, 1]

# (training AUC, validation AUC)
(
    metrics.roc_auc_score(y_train_sm, y_pred),
    metrics.roc_auc_score(y_test, y_pred2),
)



(0.9715169353608301, 0.6933841212453798)

# Ensembling the methods

In [84]:
# Score the test set with each of the four fitted models; for the sklearn-style
# models the second predict_proba column is used.
y_log = logitfit.predict(test)
y_lda = lda.predict_proba(test)[:, 1]
y_svm = svm.predict_proba(test)[:, 1]
y_xgb = clf.predict_proba(test)[:, 1]

# Unweighted mean of the four scores.
y_final = (y_log + y_lda + y_svm + y_xgb) / 4

In [106]:
# Pair each test ID with its averaged score. np.asarray(...) attaches the
# scores by position, so a length mismatch raises instead of label alignment
# silently producing NaN rows.
df = IDs.assign(RESPONSE=np.asarray(y_final))

# Write next to the notebook rather than to an absolute, user-specific
# directory, so the cell runs on any machine.
path = 'submission.csv'
df.to_csv(path, index=False)

In [86]:
# Display the ID / RESPONSE frame assembled in the previous cell.
df

Unnamed: 0,ID,RESPONSE
0,3001,0.354092
1,3002,0.705087
2,3003,0.927445
3,3004,0.215141
4,3005,0.197134
...,...,...
995,3996,0.207064
996,3997,0.574652
997,3998,0.135220
998,3999,0.211643


In [87]:
# Hard 0/1 labels from each model. The logit model only returns scores, so
# they are thresholded at 0.5; the other models use their own predict().
y_log_scores = logitfit.predict(test)
y_log_csv = np.array(y_log_scores > 0.5, dtype=float)
y_lda_csv = lda.predict(test)
y_svm_csv = svm.predict(test)
y_xgb_csv = clf.predict(test)

In [93]:
# Majority vote across the four models. The comparison is strict, so a 2-2
# split (vote share exactly 0.5) is labelled 0.0.
vote_share = (y_log_csv + y_lda_csv + y_svm_csv + y_xgb_csv) / 4
y_final_csv = np.array(vote_share > 0.5, dtype=float)

In [103]:
# Pair each test ID with its majority-vote label. assign() attaches the array
# by position; the previous concat(axis=1) aligned on index labels, which
# yields NaN rows whenever IDs does not carry a default 0..n-1 index.
df_csv = IDs.assign(RESPONSE=y_final_csv)
df_csv

Unnamed: 0,ID,RESPONSE
0,3001,0.0
1,3002,1.0
2,3003,1.0
3,3004,0.0
4,3005,0.0
...,...,...
995,3996,0.0
996,3997,0.0
997,3998,0.0
998,3999,0.0


In [105]:
# Write the majority-vote labels next to the notebook rather than to an
# absolute, user-specific directory, so the cell runs on any machine.
path2 = 'response.csv'
df_csv.to_csv(path2, index=False)