In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
# Load the coil data set from a path relative to the current working directory.
# Forward slashes are used as separators because they are accepted on Windows
# as well as on POSIX systems, whereas backslash separators only resolve on
# Windows.
df_predict = pd.read_csv("../../Documents/Arcellor_Mittal_project/CoilData_test.csv")
df_predict

Unnamed: 0.1,Unnamed: 0,coil,furnace Number,analyse,Hardness_1,Hardness_2,Width,Temperature before finishing mill,Temperature after finishing mill,Thickness,...,ma,b,n,ti,cr,va,mo,Constriction,Max separation,Number of separation points
0,0,396378,1,K371,10003,101,1302.1,1147,921,4.36,...,291,1,34,6,302,0,25,False,,
1,1,396376,3,K371,10123,101,1282.3,1150,920,4.37,...,384,1,33,12,189,25,7,False,,
2,2,396377,4,K321,10040,102,1297.4,1183,933,4.43,...,463,1,20,11,288,0,40,False,,
3,3,396379,3,K371,10243,102,1295.2,1165,910,4.44,...,296,1,21,9,253,0,9,False,,
4,4,396380,4,K321,10012,100,1293.3,1192,909,3.95,...,329,1,28,8,297,0,23,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54940,56893,15618,1,K371,9948,99,1257.3,1183,913,3.27,...,339,1,26,7,199,0,30,False,,
54941,56895,15620,1,K371,9951,100,1256.6,1180,922,2.68,...,379,1,24,7,194,0,30,False,,
54942,56896,15621,3,K371,9885,99,1257.2,1182,919,2.68,...,324,1,26,8,248,0,36,False,,
54943,56897,15622,1,K371,9885,99,1257.1,1194,923,2.68,...,324,1,26,8,248,0,36,False,,


In [3]:
# Columns removed from df_predict before modelling (the frame is modified in place).
columns_to_drop = [
    'analyse',
    'Unnamed: 0',
    'coil',
    'Max separation',
    'Thickness profile',
    'Hardness_2',
    'Number of separation points',
    'furnace Number',
    'b',
    'nb',
]
df_predict.drop(columns=columns_to_drop, inplace=True)

In [4]:
# Discard every row that still contains at least one missing value.
# The frame is modified in place, so df_predict itself shrinks.
df_predict.dropna(how='any', axis='index', inplace=True)

In [5]:
# Columns in which a value of 0 disqualifies the whole row.
required_nonzero = [
    'Hardness_1',
    'Width',
    'Thickness',
    'Temperature before finishing mill',
    'Temperature after finishing mill',
]

# Keep a row only when every listed column is non-zero. Each column is tested
# on its own instead of comparing the product of all columns with 0: a product
# can underflow to 0 or turn into NaN (inf * 0), which would give the wrong
# answer, and the per-column test states the intent directly.
all_nonzero = df_predict[required_nonzero].ne(0).all(axis=1)
df_predict = df_predict.loc[all_nonzero]

In [6]:
# Separate the target column from the remaining (feature) columns.
target_column = "Constriction"
y = df_predict[target_column]
X = df_predict.drop(columns=[target_column])

# Hold out 30 % of the rows for evaluation; the fixed seed makes the split
# reproducible.
# NOTE(review): the split is not stratified. If the target is as imbalanced as
# the evaluation output suggests, consider passing stratify=y -- this would
# change the reported numbers, so confirm before switching.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Standardise the features
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Learn the scaling parameters from the training split only, then apply the
# same fitted transformation to both splits. X_train and X_test are rebound
# to the transformed results.
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [8]:
# Rebalance the training split by randomly under-sampling
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

# Only the training split is resampled; the held-out split is left untouched.
rus = RandomUnderSampler(sampling_strategy=0.5, random_state=0)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Show how many samples of each class remain after resampling.
class_counts = Counter(y_resampled)
print(sorted(class_counts.items()))

[(False, 2080), (True, 1040)]


In [9]:
# confusion_matrix is not called in this cell but stays imported so that any
# other cell relying on the name keeps working; classification_report is used
# by the reporting cell that follows.
from sklearn.metrics import confusion_matrix, classification_report

# Fit a random forest on the under-sampled training data. The seed is fixed so
# that repeated runs build the same forest.
clf = RandomForestClassifier(criterion='gini', n_estimators=300, random_state=30, max_depth=11)
clf.fit(X_resampled, y_resampled)

# Predict on the held-out split, which was scaled but not resampled.
y_pred = clf.predict(X_test)

# Labelled confusion table with row/column totals. It is the last expression
# of the cell, so it is what the notebook displays; a bare confusion_matrix(...)
# call placed before it would be computed and silently thrown away.
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,13656,2345,16001
True,144,339,483
All,13800,2684,16484


In [10]:
# Score the classifier on the resampled training data and on the held-out
# split, then print the per-class classification report for the held-out
# predictions.
train_score = clf.score(X_resampled, y_resampled)
test_score = clf.score(X_test, y_test)

print(train_score)
print(test_score)
print(classification_report(y_test, y_pred))

0.9762820512820513
0.8490050958505218
              precision    recall  f1-score   support

       False       0.99      0.85      0.92     16001
        True       0.13      0.70      0.21       483

    accuracy                           0.85     16484
   macro avg       0.56      0.78      0.57     16484
weighted avg       0.96      0.85      0.90     16484



In [11]:
# Feature importance

# One row per feature column of X, ordered from the largest to the smallest
# importance reported by the fitted forest.
feature_importances = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=X.columns,
).sort_values(by='importance', ascending=False)
feature_importances

Unnamed: 0,importance
Thickness,0.159197
c,0.122834
mn,0.093187
si,0.080905
Width,0.073501
Hardness_1,0.061532
n,0.053669
s,0.048282
ti,0.046109
Temperature after finishing mill,0.041058
