# Decision tree

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from utils import filepaths

In [2]:
# Read the CSV located at the project-configured path into a DataFrame.
labeled_csv_path = filepaths.PATH_TO_LABELED_CSV
df = pd.read_csv(labeled_csv_path)

# Last expression of the cell: show the column labels that were read.
df.columns

Index(['Unnamed: 0', 'coil', 'furnace Number', 'analyse', 'Hardness_1',
       'Hardness_2', 'Width', 'Temperature before finishing mill',
       'Temperature after finishing mill', 'Thickness', 'Thickness profile',
       'c', 'mn', 'si', 'nb', 'p', 's', 'al', 'ma', 'b', 'n', 'ti', 'cr', 'va',
       'mo', 'Constriction', 'Max separation', 'Number of separation points'],
      dtype='object')

In [3]:
# Convert the column to a numeric dtype; with errors="coerce", values that
# cannot be parsed become NaN instead of raising.
thickness_profile_raw = df["Thickness profile"]
df["Thickness profile"] = pd.to_numeric(thickness_profile_raw, errors="coerce")

In [4]:
# Correlation of every numeric/boolean column with the "Constriction" target.
#
# The frame still holds at least one non-numeric column at this point
# (presumably "analyse", which is absent from the recorded output). Older
# pandas silently skipped such columns, but pandas >= 2.0 defaults to
# numeric_only=False and raises on them. Selecting the numeric and boolean
# columns explicitly gives the same result on every pandas version.
numeric_df = df.select_dtypes(include=["number", "bool"])
numeric_df.corr()["Constriction"]

Unnamed: 0                          -0.012139
coil                                 0.000505
furnace Number                       0.012055
Hardness_1                           0.214507
Hardness_2                           0.165182
Width                               -0.061124
Temperature before finishing mill   -0.003294
Temperature after finishing mill     0.029210
Thickness                           -0.179142
Thickness profile                   -0.015433
c                                    0.241048
mn                                   0.328656
si                                   0.302491
nb                                  -0.128100
p                                   -0.033705
s                                   -0.195306
al                                   0.180140
ma                                   0.181562
b                                    0.244905
n                                    0.088081
ti                                   0.002523
cr                                

In [5]:
# Columns left out of the modelling frame. Everything that remains is used
# later either as a feature or as the "Constriction" target.
excluded_columns = [
    "Unnamed: 0",
    "coil",
    "Max separation",
    "Number of separation points",
    "furnace Number",
    "Temperature before finishing mill",
    "Thickness profile",
    "ti",
    "analyse",
    "Temperature after finishing mill",
]
df_predict = df.drop(columns=excluded_columns)

# Preview the first rows of the reduced frame.
df_predict.head()

Unnamed: 0,Hardness_1,Hardness_2,Width,Thickness,c,mn,si,nb,p,s,al,ma,b,n,cr,va,mo,Constriction
0,10003,101,1302.1,4.36,355,2162,49,0,133,143,304,291,1,34,302,0,25,False
1,10123,101,1282.3,4.37,551,1985,101,0,118,90,395,384,1,33,189,25,7,False
2,10040,102,1297.4,4.43,457,1895,60,0,108,115,476,463,1,20,288,0,40,False
3,10243,102,1295.2,4.44,697,2008,69,0,139,98,306,296,1,21,253,0,9,False
4,10012,100,1293.3,3.95,477,1936,52,0,112,121,340,329,1,28,297,0,23,False


In [6]:
# Pairwise correlations of the retained columns; show only the column that
# relates each of them to the "Constriction" target.
correlation_matrix = df_predict.corr()
correlation_matrix["Constriction"]

Hardness_1      0.214507
Hardness_2      0.165182
Width          -0.061124
Thickness      -0.179142
c               0.241048
mn              0.328656
si              0.302491
nb             -0.128100
p              -0.033705
s              -0.195306
al              0.180140
ma              0.181562
b               0.244905
n               0.088081
cr              0.298300
va              0.232972
mo              0.209941
Constriction    1.000000
Name: Constriction, dtype: float64

In [7]:
# Inspect the dtype of each retained column before modelling.
column_dtypes = df_predict.dtypes
column_dtypes

Hardness_1        int64
Hardness_2        int64
Width           float64
Thickness       float64
c                 int64
mn                int64
si                int64
nb                int64
p                 int64
s                 int64
al                int64
ma                int64
b                 int64
n                 int64
cr                int64
va                int64
mo                int64
Constriction       bool
dtype: object

In [8]:
# Count the missing values in each column of the modelling frame.
missing_per_column = df_predict.isna().sum()
print(missing_per_column)

Hardness_1      0
Hardness_2      0
Width           0
Thickness       0
c               0
mn              0
si              0
nb              0
p               0
s               0
al              0
ma              0
b               0
n               0
cr              0
va              0
mo              0
Constriction    0
dtype: int64


In [9]:
# Discard every row that contains at least one missing value.
df_predict = df_predict.dropna()

In [10]:
# Re-count missing values per column after the rows with NaNs were dropped.
remaining_missing = df_predict.isna().sum()
print(remaining_missing)

Hardness_1      0
Hardness_2      0
Width           0
Thickness       0
c               0
mn              0
si              0
nb              0
p               0
s               0
al              0
ma              0
b               0
n               0
cr              0
va              0
mo              0
Constriction    0
dtype: int64


In [11]:
# Separate the target column from the feature columns.
target_column = "Constriction"
X = df_predict.drop(columns=[target_column])
y = df_predict[target_column]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [13]:
# Fit a decision tree on the training split.
# class_weight="balanced" re-weights the classes inversely to their frequency.
# random_state is fixed (same seed as the train/test split) because
# scikit-learn randomly permutes the features at every split, even with the
# default splitter="best". Without a seed, ties between equally good splits
# can be resolved differently, so the fitted tree and the scores reported
# below could change from run to run.
clf = tree.DecisionTreeClassifier(class_weight="balanced", random_state=42)
clf = clf.fit(X_train, y_train)

In [14]:
# Mean accuracy of the fitted tree on the data it was trained on.
train_accuracy = clf.score(X_train, y_train)
train_accuracy

0.999846303601619

In [15]:
# Mean accuracy of the fitted tree on the held-out split.
test_accuracy = clf.score(X_test, y_test)
test_accuracy

0.8339011415934493

In [16]:
from sklearn.metrics import confusion_matrix

# Predicted labels for the held-out split.
y_pred = clf.predict(X_test)

# NOTE(review): this is a bare expression in the middle of the cell, so under
# the default notebook display setting its result is computed but not shown.
confusion_matrix(y_true=y_test, y_pred=y_pred)

# Actual-vs-predicted contingency table, including row/column totals
# (margins=True adds the "All" row and column).
pd.crosstab(
    y_test,
    y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,12685,1378,14063
True,1401,1267,2668
All,14086,2645,16731
