In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the coil measurements and the companion file whose column headers are
# matched against the 'coil' column below.
data = pd.read_csv('../data/CoilData.csv')
coils = pd.read_csv('./output.csv')

# read_csv yields the headers of output.csv as text; parse them to ints so
# they can be compared with the values in data['coil'].
coil_list = [int(column) for column in coils.columns]

# Binary target: 1 when the row's coil id is one of the headers of output.csv,
# otherwise 0. Series.isin performs a hashed, vectorised membership test
# instead of scanning the Python list once per row.
data['contracted'] = data['coil'].isin(coil_list).astype(int)
df = data  # alias, not a copy: `data` and `df` refer to the same frame

'''
Check how many times it happens that the coils are broken right after each other



'''
# df = pd.get_dummies(data, columns=['contracted'])

'\nCheck how many times it happens that the coils are broken right after each other\n\n\n\n'

In [3]:
# Disabled exploratory plot: annotated seaborn heatmap of df.corr() with the
# upper triangle masked out. Kept commented out; nothing in this cell runs.
# plt.figure(figsize=(20, 10))
# corr = df.corr()
# mask = np.zeros_like(corr)
# mask[np.triu_indices_from(mask)] = True
# heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True)
# heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);
# plt.show()

In [4]:
# Planning note kept as a bare string literal; being the cell's last
# expression, its repr is echoed as the cell output.
# NOTE(review): the following cell drops the 'analyse' column and no
# per-'analyse' model is fitted in the cells visible here — confirm whether
# this is still planned.
'''
Fit a model for every type of metal --- 'analyse' column ---'''

"\nFit a model for every type of metal --- 'analyse' column ---"

## Data selection and partitioning

In [5]:
# Remove the coil identifier and the 'analyse' column from the modelling frame.
# The result is derived from `data` (the frame `df` was aliased to when it was
# loaded) rather than from `df` itself, so re-running this cell rebuilds the
# same frame instead of raising KeyError on already-dropped columns.
df = data.drop(columns=['coil', 'analyse'])

In [6]:
# Strip the literal '*******' placeholder from 'Thickness profile'
# (presumably a marker for an unavailable reading — confirm with the data owner).
# The vectorised string accessor with regex=False treats the asterisks as plain
# text and leaves missing values as NaN, whereas a per-element
# `lambda x: x.replace(...)` raises AttributeError on them.
df['Thickness profile'] = df['Thickness profile'].str.replace('*******', '', regex=False)

# Exact-match blank strings become NaN, incomplete rows are dropped, and every
# remaining column is cast to float.
df = df.replace('', np.nan).dropna().astype(float)
df.head().T

Unnamed: 0,0,1,2,3,4
furnace Number,1.0,3.0,4.0,3.0,4.0
Hardness_1,10003.0,10123.0,10040.0,10243.0,10012.0
Hardness_2,101.0,101.0,102.0,102.0,100.0
Width,1302.1,1282.3,1297.4,1295.2,1293.3
Temperature before finishing mill,1147.0,1150.0,1183.0,1165.0,1192.0
Temperature after finishing mill,921.0,920.0,933.0,910.0,909.0
Thickness,4.36,4.37,4.43,4.44,3.95
Thickness profile,31.0,35.0,25.0,28.0,26.0
c,355.0,551.0,457.0,697.0,477.0
mn,2162.0,1985.0,1895.0,2008.0,1936.0


In [7]:
# Split at random rather than by position: output.csv is ordered from the
# highest absolute difference to the lowest, so a positional split would not
# be representative.
# Features and target are selected by name, so the feature matrix can never
# include 'contracted' even if the column order changes.
X = df.drop(columns='contracted')
y = df['contracted']
# NOTE(review): the split is not stratified; consider stratify=y if the
# 'contracted' classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

'''
Balanced dataset for more realistic scores ,even 0 and 1 in train and test sets'''



## Baseline modeling

In [8]:
# Baseline: standardise the features, then fit a logistic regression with
# scikit-learn's default settings.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression())
])

# Train model
pipe.fit(X_train, y_train)

# Store and print the held-out accuracy: a bare expression in the middle of a
# cell is evaluated but never displayed, so the value would otherwise be lost.
test_accuracy = pipe.score(X_test, y_test)
print('test accuracy:  {:.2f}'.format(test_accuracy))

# Make predictions: keep the second predict_proba column, i.e. the
# probability assigned to the second entry of the classifier's classes_.
y_train_pred_proba = pipe.predict_proba(X_train)[:, 1]
y_test_pred_proba = pipe.predict_proba(X_test)[:, 1]

In [9]:
# ROC AUC of the predicted probabilities on the train and test partitions
train_auc = roc_auc_score(y_train, y_train_pred_proba)
test_auc = roc_auc_score(y_test, y_test_pred_proba)

print('train roc auc: {:.2f}'.format(train_auc))
print('test roc auc:   {:.2f}'.format(test_auc))

train roc auc: 0.85
test roc auc:   0.84


In [10]:
# Convert probabilities to hard labels at a fixed cut-off, then report F1
DECISION_THRESHOLD = 0.5
y_train_pred = y_train_pred_proba > DECISION_THRESHOLD
y_test_pred = y_test_pred_proba > DECISION_THRESHOLD

train_f1 = f1_score(y_train, y_train_pred)
test_f1 = f1_score(y_test, y_test_pred)

print('train f1:      {:.2f}'.format(train_f1))
print('test f1:        {:.2f}'.format(test_f1))

train f1:      0.19
test f1:        0.18
