# Train test split to detect overfitting

## Load data

In [2]:
import pandas as pd

# Relative location of the prepared feature table.
path = '../../../data/default_credit_card/output/simplified_features.csv'

# Read the CSV into a DataFrame; the bare name on the last line renders a preview.
df = pd.read_csv(filepath_or_buffer=path)
df

Unnamed: 0,Gender,Age,Married,YearsEmployed,Income,Approved
0,1,30,1,1.25,0,1
1,0,58,1,3.04,560,1
...,...,...,...,...,...,...
688,1,17,1,0.04,750,0
689,1,35,1,8.29,0,0


## Feature selection

In [3]:
# Name of the label column; every remaining column is kept as a predictor.
target = 'Approved'

# Separate the predictors (X) from the label (y).
X = df.drop(columns=target)
y = df[target]

## Modelling with full data

In [4]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on every available row (no data is held out here).
# A fixed random_state is required for a deterministic fit: scikit-learn
# permutes the features at each split, so an unseeded tree can differ
# from one run to the next even on identical data.
model = DecisionTreeClassifier(random_state=42)
model.fit(X, y)

In [5]:
# Accuracy measured on the very rows the tree was fitted on, so this
# number alone cannot reveal overfitting.
model.score(X, y)

0.9927536231884058

## Modelling with split data

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed seed keeps the
# partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Preview the training predictors; the row labels are shuffled because
# the split draws rows at random.
X_train

Unnamed: 0,Gender,Age,Married,YearsEmployed,Income
178,0,18,1,1.210,540
265,1,17,0,0.040,722
...,...,...,...,...,...
435,1,19,0,0.000,1
102,1,18,1,0.375,38


In [8]:
# Preview the held-out predictors that the model will not see during fitting.
X_test

Unnamed: 0,Gender,Age,Married,YearsEmployed,Income
286,0,28,1,0.00,105
511,0,46,1,0.00,960
...,...,...,...,...,...
674,0,37,1,0.21,246
559,0,22,1,2.29,2384


In [9]:
# Row counts of the two partitions; together they should account for
# every row of the original feature table.
X_train.shape[0] + X_test.shape[0]

690

In [10]:
# Show the full predictor table again for comparison with the two partitions.
X

Unnamed: 0,Gender,Age,Married,YearsEmployed,Income
0,1,30,1,1.25,0
1,0,58,1,3.04,560
...,...,...,...,...,...
688,1,17,1,0.04,750
689,1,35,1,8.29,0


In [11]:
# Refit a fresh tree using only the training partition so the held-out
# rows stay unseen until evaluation.
# A fixed random_state is required for a deterministic fit: scikit-learn
# permutes the features at each split, so without a seed the reported
# test accuracy can change between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [12]:
# Accuracy on the rows used for fitting (the optimistic, in-sample figure).
model.score(X_train, y_train)

0.9958592132505176

In [12]:
# Accuracy on the held-out rows; a large drop relative to the training
# accuracy indicates that the tree has overfitted.
model.score(X_test, y_test)

0.6376811594202898

![Decision tree fitted on the training split](src/tree.png)

In [13]:
from pathlib import Path

from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# The tree is fitted with default settings (no depth limit), so the
# canvas is enlarged to keep the individual nodes legible.
scale = 5
fig, ax = plt.subplots(figsize=(20*scale, 10*scale))

# Draw onto the explicit axes, colouring nodes by majority class and
# labelling splits with the predictor column names.
plot_tree(model, filled=True, feature_names=X.columns.tolist(), fontsize=8, ax=ax)

# savefig does not create missing folders, so make sure the target
# directory exists before writing the image.
output_path = Path('src/tree.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_path)
plt.show()