# Decision tree

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from utils import filepaths

In [2]:
# Load the labeled dataset from the CSV path configured in utils.filepaths.
df = pd.read_csv(filepaths.PATH_TO_LABELED_CSV)
# List the available columns so the feature selection in the next cell can be checked against them.
df.columns

Index(['Unnamed: 0', 'coil', 'furnace Number', 'analyse', 'Hardness_1',
       'Hardness_2', 'Width', 'Temperature before finishing mill',
       'Temperature after finishing mill', 'Thickness', 'Thickness profile',
       'c', 'mn', 'si', 'nb', 'p', 's', 'al', 'ma', 'b', 'n', 'ti', 'cr', 'va',
       'mo', 'Constriction?', 'Max constriction', 'Number of constrictions'],
      dtype='object')

In [53]:
# Remove the columns that are not used for modelling; what remains is the
# feature set plus the "Constriction?" target (see the preview below).
df_predict = df.drop(
    columns=[
        "Unnamed: 0",
        "coil",
        "Max constriction",
        "Number of constrictions",
        "furnace Number",
        "Width",
        "Temperature before finishing mill",
        "Temperature after finishing mill",
        "Thickness profile",
        "p",
        "n",
        "ti",
        "analyse",
    ]
)
df_predict.head()

Unnamed: 0,Hardness_1,Hardness_2,Thickness,c,mn,si,nb,s,al,ma,b,cr,va,mo,Constriction?
0,10003,101,4.36,355,2162,49,0,143,304,291,1,302,0,25,False
1,10123,101,4.37,551,1985,101,0,90,395,384,1,189,25,7,False
2,10040,102,4.43,457,1895,60,0,115,476,463,1,288,0,40,False
3,10243,102,4.44,697,2008,69,0,98,306,296,1,253,0,9,False
4,10012,100,3.95,477,1936,52,0,121,340,329,1,297,0,23,False


In [4]:
# Coerce every feature column to a numeric dtype; values that cannot be parsed
# become NaN (errors="coerce") so they can be detected and dropped afterwards.
#
# The columns are derived from df_predict itself rather than from a hard-coded
# list: a fixed list that names columns already dropped above (e.g. "Width",
# "p", "ti") makes the list selection raise KeyError on a fresh top-to-bottom run.
feature_columns = [name for name in df_predict.columns if name != "Constriction?"]

for column in feature_columns:
    df_predict[column] = pd.to_numeric(df_predict[column], errors="coerce")

In [54]:
# Show the dtype of each remaining column to check that the features are numeric.
df_predict.dtypes

Hardness_1         int64
Hardness_2         int64
Thickness        float64
c                  int64
mn                 int64
si                 int64
nb                 int64
s                  int64
al                 int64
ma                 int64
b                  int64
cr                 int64
va                 int64
mo                 int64
Constriction?       bool
dtype: object

In [55]:
# Number of missing values per column. As the last expression of the cell the
# Series is rendered directly, so no print() wrapper is needed.
df_predict.isna().sum()

Hardness_1       0
Hardness_2       0
Thickness        0
c                0
mn               0
si               0
nb               0
s                0
al               0
ma               0
b                0
cr               0
va               0
mo               0
Constriction?    0
dtype: int64


In [6]:
# Discard every row that still contains a missing value. Rebinding the name
# (instead of inplace=True) keeps the step explicit and safe to re-run.
df_predict = df_predict.dropna()

In [7]:
# Re-check the per-column missing-value counts; every count should now be 0.
# As the last expression of the cell the Series is rendered without print().
df_predict.isna().sum()

furnace Number                       0
analyse                              0
Hardness_1                           0
Hardness_2                           0
Width                                0
Temperature before finishing mill    0
Temperature after finishing mill     0
Thickness                            0
Thickness profile                    0
c                                    0
mn                                   0
si                                   0
nb                                   0
p                                    0
s                                    0
al                                   0
ma                                   0
b                                    0
n                                    0
ti                                   0
cr                                   0
va                                   0
mo                                   0
Constriction?                        0
dtype: int64


In [56]:
# Target: the "Constriction?" flag. Predictors: every other remaining column.
y = df_predict["Constriction?"]
X = df_predict.drop(columns="Constriction?")

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [57]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted transformation is then applied to both splits.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [63]:
# Fit a decision tree on the training split.
# - class_weight="balanced" reweights the classes to compensate for the much
#   rarer positive ("Constriction?" == True) class.
# - random_state pins the estimator's internal randomness so that the fitted
#   tree, and therefore the scores below, are reproducible across runs.
clf = tree.DecisionTreeClassifier(class_weight="balanced", random_state=42)
clf = clf.fit(X_train, y_train)

In [9]:
# Tree plotting is switched off here; uncomment the call below to draw the fitted tree.
# NOTE(review): presumably disabled because the unpruned tree is too large to render usefully — confirm.
# tree.plot_tree(clf)

In [64]:
# Accuracy on the training split, i.e. on the same rows the tree was fitted on.
clf.score(X_train, y_train)

0.9933398227368205

In [65]:
# Accuracy on the held-out test split; a large gap to the training accuracy points to overfitting.
clf.score(X_test, y_test)

0.8550594704440858

In [66]:
from sklearn.metrics import confusion_matrix

# Predicted labels for the held-out test split.
y_pred = clf.predict(X_test)

# Raw confusion matrix (rows = actual, columns = predicted). Only the last
# expression of a cell is rendered, so it is printed explicitly here instead
# of being computed and silently discarded.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

# Labeled version of the same counts, with row/column totals in the "All" margins.
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,13053,1232,14285
True,1193,1253,2446
All,14246,2485,16731


In [67]:
# Pairwise correlation matrix of all columns; only the column describing each
# feature's correlation with the "Constriction?" target is displayed.
correlation_matrix = df_predict.corr()
correlation_matrix.loc[:, "Constriction?"]

Hardness_1       0.220486
Hardness_2       0.172304
Thickness       -0.186062
c                0.236152
mn               0.345045
si               0.302259
nb              -0.121030
s               -0.203288
al               0.142911
ma               0.144692
b                0.256514
cr               0.319859
va               0.249968
mo               0.256112
Constriction?    1.000000
Name: Constriction?, dtype: float64