In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [3]:
df = pd.read_csv("data/Train_data.csv")
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,Class
0,Male,Yes,0,Graduate,No,6000,2250.0,265.0,360.0,,Semiurban,N
1,Male,Yes,0,Graduate,No,2958,2900.0,131.0,360.0,1.0,Semiurban,Y
2,Male,Yes,2,Graduate,No,6250,1695.0,210.0,360.0,1.0,Semiurban,Y
3,Male,Yes,0,Graduate,No,2083,3150.0,128.0,360.0,1.0,Semiurban,Y
4,Male,No,0,Graduate,No,4166,0.0,98.0,360.0,0.0,Semiurban,N


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491 entries, 0 to 490
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X1      479 non-null    object 
 1   X2      490 non-null    object 
 2   X3      480 non-null    object 
 3   X4      491 non-null    object 
 4   X5      467 non-null    object 
 5   X6      491 non-null    int64  
 6   X7      491 non-null    float64
 7   X8      475 non-null    float64
 8   X9      479 non-null    float64
 9   X10     451 non-null    float64
 10  X11     491 non-null    object 
 11  Class   491 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 46.2+ KB


# Data Preprocessing

In [5]:
df1 = df.copy()
df1.isna().sum()

X1       12
X2        1
X3       11
X4        0
X5       24
X6        0
X7        0
X8       16
X9       12
X10      40
X11       0
Class     0
dtype: int64

### Data Imputation

- Categorical Variables

In [6]:
df1['X1'].fillna(df['X1'].mode()[0],inplace=True)
df1['X2'].fillna(df['X2'].mode()[0],inplace=True)
df1['X3'].fillna(df['X3'].mode()[0],inplace=True)
df1['X5'].fillna(df['X5'].mode()[0],inplace=True)
df1['X9'].fillna(df['X9'].mode()[0],inplace=True)
df1['X10'].fillna(df['X10'].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['X1'].fillna(df['X1'].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['X2'].fillna(df['X2'].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

- Numerical Variables 

In [7]:
df1['X8'].fillna(df['X8'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['X8'].fillna(df['X8'].mean(),inplace=True)


In [8]:
df1.isna().sum()

X1       0
X2       0
X3       0
X4       0
X5       0
X6       0
X7       0
X8       0
X9       0
X10      0
X11      0
Class    0
dtype: int64

In [9]:
df2 = df1.copy()
df2 = pd.get_dummies(df2)

df2 = df2.drop(['X1_Female', 'X2_No', 'X4_Not Graduate', 
              'X5_No', 'Class_N'], axis=1)
new = {'X1_Male': 'X1', 'X2_Yes': 'X2', 
       'X4_Graduate': 'X4', 'X5_Yes': 'X5',
       'Class_Y': 'Class'}
df2.rename(columns=new, inplace=True)
df2

Unnamed: 0,X6,X7,X8,X9,X10,X1,X2,X3_0,X3_1,X3_2,X3_3+,X4,X5,X11_Rural,X11_Semiurban,X11_Urban,Class
0,6000,2250.0,265.000000,360.0,1.0,True,True,True,False,False,False,True,False,False,True,False,False
1,2958,2900.0,131.000000,360.0,1.0,True,True,True,False,False,False,True,False,False,True,False,True
2,6250,1695.0,210.000000,360.0,1.0,True,True,False,False,True,False,True,False,False,True,False,True
3,2083,3150.0,128.000000,360.0,1.0,True,True,True,False,False,False,True,False,False,True,False,True
4,4166,0.0,98.000000,360.0,0.0,True,False,True,False,False,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,1875,1875.0,97.000000,360.0,1.0,True,True,False,False,True,False,False,True,False,True,False,True
487,11417,1126.0,225.000000,360.0,1.0,True,True,False,False,True,False,True,False,False,False,True,True
488,3237,0.0,30.000000,360.0,1.0,False,False,True,False,False,False,True,False,False,False,True,True
489,10047,0.0,148.722105,240.0,1.0,False,True,True,False,False,False,True,False,False,True,False,True


In [10]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491 entries, 0 to 490
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   X6             491 non-null    int64  
 1   X7             491 non-null    float64
 2   X8             491 non-null    float64
 3   X9             491 non-null    float64
 4   X10            491 non-null    float64
 5   X1             491 non-null    bool   
 6   X2             491 non-null    bool   
 7   X3_0           491 non-null    bool   
 8   X3_1           491 non-null    bool   
 9   X3_2           491 non-null    bool   
 10  X3_3+          491 non-null    bool   
 11  X4             491 non-null    bool   
 12  X5             491 non-null    bool   
 13  X11_Rural      491 non-null    bool   
 14  X11_Semiurban  491 non-null    bool   
 15  X11_Urban      491 non-null    bool   
 16  Class          491 non-null    bool   
dtypes: bool(12), float64(4), int64(1)
memory usage: 25.1 K

### Remove Outliers

In [11]:
df2.isna().sum()

X6               0
X7               0
X8               0
X9               0
X10              0
X1               0
X2               0
X3_0             0
X3_1             0
X3_2             0
X3_3+            0
X4               0
X5               0
X11_Rural        0
X11_Semiurban    0
X11_Urban        0
Class            0
dtype: int64

In [12]:
df3 = df2.copy()
df3 = df3.map(lambda x: int(x) if isinstance(x, bool) else x)
df3

Unnamed: 0,X6,X7,X8,X9,X10,X1,X2,X3_0,X3_1,X3_2,X3_3+,X4,X5,X11_Rural,X11_Semiurban,X11_Urban,Class
0,6000,2250.0,265.000000,360.0,1.0,1,1,1,0,0,0,1,0,0,1,0,0
1,2958,2900.0,131.000000,360.0,1.0,1,1,1,0,0,0,1,0,0,1,0,1
2,6250,1695.0,210.000000,360.0,1.0,1,1,0,0,1,0,1,0,0,1,0,1
3,2083,3150.0,128.000000,360.0,1.0,1,1,1,0,0,0,1,0,0,1,0,1
4,4166,0.0,98.000000,360.0,0.0,1,0,1,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,1875,1875.0,97.000000,360.0,1.0,1,1,0,0,1,0,0,1,0,1,0,1
487,11417,1126.0,225.000000,360.0,1.0,1,1,0,0,1,0,1,0,0,0,1,1
488,3237,0.0,30.000000,360.0,1.0,0,0,1,0,0,0,1,0,0,0,1,1
489,10047,0.0,148.722105,240.0,1.0,0,1,1,0,0,0,1,0,0,1,0,1


In [13]:
# Remove outliers
'''Q1 = df3.quantile(0.25)
Q3 = df3.quantile(0.75)
IQR = Q3 - Q1

df4 = df3[~((df3 < (Q1 - 1.5 * IQR)) |(df3 > (Q3 + 1.5 * IQR))).any(axis=1)]'''
df4 = df3.copy()

In [14]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491 entries, 0 to 490
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   X6             491 non-null    int64  
 1   X7             491 non-null    float64
 2   X8             491 non-null    float64
 3   X9             491 non-null    float64
 4   X10            491 non-null    float64
 5   X1             491 non-null    int64  
 6   X2             491 non-null    int64  
 7   X3_0           491 non-null    int64  
 8   X3_1           491 non-null    int64  
 9   X3_2           491 non-null    int64  
 10  X3_3+          491 non-null    int64  
 11  X4             491 non-null    int64  
 12  X5             491 non-null    int64  
 13  X11_Rural      491 non-null    int64  
 14  X11_Semiurban  491 non-null    int64  
 15  X11_Urban      491 non-null    int64  
 16  Class          491 non-null    int64  
dtypes: float64(4), int64(13)
memory usage: 65.3 KB


In [15]:
X = df4.drop(['Class'], axis=1)
y = df4['Class']

### Data Normalization

In [16]:
X = MinMaxScaler().fit_transform(X)
X

array([[0.07235622, 0.05399957, 0.36310395, ..., 0.        , 1.        ,
        0.        ],
       [0.03473098, 0.06959944, 0.16691069, ..., 0.        , 1.        ,
        0.        ],
       [0.07544836, 0.04067967, 0.28257687, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.03818182, 0.        , 0.01903367, ..., 0.        , 0.        ,
        1.        ],
       [0.12241187, 0.        , 0.19285813, ..., 0.        , 1.        ,
        0.        ],
       [0.16697588, 0.        , 0.19285813, ..., 0.        , 0.        ,
        1.        ]])

### Splitting Data Set

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Model

In [18]:
RFclassifier = RandomForestClassifier(n_estimators = 1000, random_state = 42, max_leaf_nodes=10)
RFclassifier.fit(X_train, y_train)

In [19]:
'''param_grid = {
    "n_estimators": [50, 100, 200, 500, 1000],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [None, 1, 2, 5],
    "min_samples_leaf": [1, 2, 5, 10]
}
RFclassifier = GridSearchCV(RandomForestClassifier(random_state=246), param_grid=param_grid, scoring="f1", verbose=3, cv=5, n_jobs=8)
RFclassifier.fit(X_train, y_train)'''

'param_grid = {\n    "n_estimators": [50, 100, 200, 500, 1000],\n    "criterion": ["gini", "entropy", "log_loss"],\n    "max_depth": [None, 1, 2, 5],\n    "min_samples_leaf": [1, 2, 5, 10]\n}\nRFclassifier = GridSearchCV(RandomForestClassifier(random_state=246), param_grid=param_grid, scoring="f1", verbose=3, cv=5, n_jobs=8)\nRFclassifier.fit(X_train, y_train)'

In [20]:
'''y_predict = RFclassifier.predict(X_test)
print(RFclassifier.best_estimator_)
print(RFclassifier.best_score_)
print(RFclassifier.best_params_)
print(classification_report(y_test, y_predict))'''

'y_predict = RFclassifier.predict(X_test)\nprint(RFclassifier.best_estimator_)\nprint(RFclassifier.best_score_)\nprint(RFclassifier.best_params_)\nprint(classification_report(y_test, y_predict))'

In [21]:
print(RFclassifier.score(X_test, y_test))

0.797979797979798


# Using model to predict class for Test_samsung_noclass.csv

In [22]:
pickle.dump(RFclassifier, open("models/classifier.pkl", 'wb'))

In [23]:
RFclassifier = pickle.load(open("models/classifier.pkl", 'rb'))

In [24]:
loan_data = pd.read_csv('data/loan_data_set.csv')
loan_data = loan_data.drop('Loan_ID', axis=1)

In [25]:
loan_data1 = loan_data.sample(frac=0.2, random_state=42)
y_test_loan_data = loan_data1['Loan_Status']
temp_df = loan_data1.drop('Loan_Status', axis=1)
temp_df

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11
350,Male,Yes,0,Graduate,No,9083,0.0,228.0,360.0,1.0,Semiurban
377,Male,Yes,0,Graduate,No,4310,0.0,130.0,360.0,,Semiurban
163,Male,Yes,2,Graduate,No,4167,1447.0,158.0,360.0,1.0,Rural
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural
132,Male,No,0,Graduate,No,2718,0.0,70.0,360.0,1.0,Semiurban
...,...,...,...,...,...,...,...,...,...,...,...
231,Male,Yes,0,Graduate,,3716,0.0,42.0,180.0,1.0,Rural
312,Female,No,0,Graduate,No,2507,0.0,56.0,360.0,1.0,Rural
248,Male,Yes,1,Graduate,No,2882,1843.0,123.0,480.0,1.0,Semiurban
11,Male,Yes,2,Graduate,,2500,1840.0,109.0,360.0,1.0,Urban


In [26]:
temp_df['X1'].fillna(temp_df['X1'].mode()[0],inplace=True)
temp_df['X2'].fillna(temp_df['X2'].mode()[0],inplace=True)
temp_df['X3'].fillna(temp_df['X3'].mode()[0],inplace=True)
temp_df['X5'].fillna(temp_df['X5'].mode()[0],inplace=True)
temp_df['X9'].fillna(temp_df['X9'].mode()[0],inplace=True)
temp_df['X10'].fillna(temp_df['X10'].mode()[0],inplace=True)
temp_df['X8'].fillna(temp_df['X8'].mean(),inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  temp_df['X1'].fillna(temp_df['X1'].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  temp_df['X2'].fillna(temp_df['X2'].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

In [27]:
temp_df = pd.get_dummies(temp_df)
temp_df = temp_df.drop(['X1_Female', 'X2_No', 'X4_Not Graduate', 
              'X5_No'], axis=1)
new = {'X1_Male': 'X1', 'X2_Yes': 'X2', 
       'X4_Graduate': 'X4', 'X5_Yes': 'X5',
       }
temp_df.rename(columns=new, inplace=True)
temp_df = temp_df.map(lambda x: int(x) if isinstance(x, bool) else x)
temp_df

Unnamed: 0,X6,X7,X8,X9,X10,X1,X2,X3_0,X3_1,X3_2,X3_3+,X4,X5,X11_Rural,X11_Semiurban,X11_Urban
350,9083,0.0,228.0,360.0,1.0,1,1,1,0,0,0,1,0,0,1,0
377,4310,0.0,130.0,360.0,1.0,1,1,1,0,0,0,1,0,0,1,0
163,4167,1447.0,158.0,360.0,1.0,1,1,0,0,1,0,1,0,1,0,0
609,2900,0.0,71.0,360.0,1.0,0,0,1,0,0,0,1,0,1,0,0
132,2718,0.0,70.0,360.0,1.0,1,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,3716,0.0,42.0,180.0,1.0,1,1,1,0,0,0,1,0,1,0,0
312,2507,0.0,56.0,360.0,1.0,0,0,1,0,0,0,1,0,1,0,0
248,2882,1843.0,123.0,480.0,1.0,1,1,0,1,0,0,1,0,0,1,0
11,2500,1840.0,109.0,360.0,1.0,1,1,0,0,1,0,1,0,0,0,1


In [28]:
X_temp = MinMaxScaler().fit_transform(temp_df)
X_temp

array([[0.13419402, 0.        , 0.37055838, ..., 0.        , 1.        ,
        0.        ],
       [0.05802467, 0.        , 0.20473773, ..., 0.        , 1.        ,
        0.        ],
       [0.05574262, 0.18670968, 0.25211506, ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.0352361 , 0.23780645, 0.1928934 , ..., 0.        , 1.        ,
        0.        ],
       [0.02914   , 0.23741935, 0.16920474, ..., 0.        , 0.        ,
        1.        ],
       [1.        , 0.        , 0.81387479, ..., 0.        , 0.        ,
        1.        ]])

In [29]:
y_temp_pre = RFclassifier.predict(X_temp)

In [30]:
y_test_loan_data

350    Y
377    Y
163    Y
609    Y
132    Y
      ..
231    Y
312    Y
248    Y
11     Y
333    Y
Name: Loan_Status, Length: 123, dtype: object

In [31]:
s = 0
s1 = 0
print("Dự đoán, thực tế")
for i, j in zip(y_temp_pre, y_test_loan_data):
    print(i, j, end='\n')
    if ((i==1 and j=="Y") or (i==0 and j=="N")): 
        s1+=1
    s += 1
print(s1, s, s1/s)

Dự đoán, thực tế
1 Y
1 Y
1 Y
1 Y
1 Y
1 Y
1 Y
1 Y
1 N
0 N
0 N
1 Y
1 Y
1 Y
1 Y
1 Y
1 Y
1 Y
1 Y
1 Y
1 Y
0 Y
1 Y
1 Y
1 Y
0 Y
1 Y
1 N
1 Y
1 Y
1 Y
1 Y
1 Y
1 Y
1 N
0 N
1 N
1 Y
1 Y
1 N
1 Y
1 Y
1 N
1 Y
1 Y
1 Y
0 N
0 N
0 N
0 N
1 N
1 N
0 N
1 N
1 Y
1 N
0 N
1 Y
0 N
1 Y
0 Y
1 N
0 N
1 Y
1 Y
1 Y
1 Y
1 Y
1 N
1 N
0 N
1 Y
1 N
1 N
0 N
1 Y
0 Y
0 N
0 N
1 Y
1 N
1 Y
1 N
1 N
1 Y
1 Y
1 Y
0 N
1 Y
1 N
0 N
0 N
0 N
1 Y
1 Y
1 Y
1 Y
1 Y
0 N
1 Y
1 Y
1 Y
1 Y
0 Y
0 Y
0 N
1 Y
0 N
1 Y
1 Y
1 Y
0 N
1 Y
1 Y
1 Y
1 Y
1 Y
1 N
1 Y
1 Y
1 Y
1 Y
1 Y
97 123 0.7886178861788617
