In [14]:
import pandas as pd
import numpy as np
import eda
import visualization as viz
import feature_engineering as fe
import process_data as pr
import modeling as md

from sklearn.model_selection import StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
import mutual_info as mi

import matplotlib.pyplot as plt
import seaborn as sns

# Tag embedded in the file name of every artifact this notebook writes
# (the MI result CSV and the submission CSV).
model_number = 'catboost_ultimate_combo_v3'

# Lift pandas' display truncation so frames are rendered with all columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [15]:
# Read the train and test CSVs from the working directory.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Peek at three random training rows.
df_train.sample(n=3)

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
531,531,A,195.03,material_7,material_8,9,5,1,4,5,17.378,12.36,18.371,15.146,11.651,18.824,8.962,16.327,22.469,12.079,17.934,16.161,12.096,16.081,703.683,0
20073,20073,D,141.48,material_7,material_5,6,6,13,8,5,16.649,10.788,16.835,18.468,12.169,19.11,9.947,,18.659,11.657,15.992,,15.322,15.734,769.919,0
26309,26309,E,174.23,material_7,material_6,6,9,13,6,7,16.859,12.619,17.04,17.666,11.393,19.889,10.567,15.761,19.349,9.664,14.789,,16.021,12.679,804.912,0


In [16]:
# Columns excluded from modeling: the row id and the product code.
dropped_columns = ['id', 'product_code']

df_train_model = df_train.drop(columns=dropped_columns)
df_test_model = df_test.drop(columns=dropped_columns)

# Separate the 'failure' label from the predictors.
y_train = df_train_model['failure']
X_train = df_train_model.drop(columns='failure')

# The test frame is used as-is (copied so later edits do not touch df_test_model).
X_test = df_test_model.copy()

In [17]:
# The four attribute_* columns are treated as categorical ...
cat_features = [
    'attribute_0',
    'attribute_1',
    'attribute_2',
    'attribute_3',
]
# ... and every remaining predictor is treated as continuous.
continous_features = [col for col in X_train.columns if col not in cat_features]

In [18]:
# Mean-impute missing values in the continuous columns. The imputer is fitted on
# the training data only and then reused for the test data, so test rows are
# filled with the training means.
si = SimpleImputer(strategy='mean')

# The imputer output is wrapped back into a DataFrame with the original column
# names AND the original row index. pd.concat(axis=1) aligns rows on index
# labels, so letting the rebuilt frame fall back to a fresh 0..n-1 index would
# silently misalign (or NaN-pad) rows if the input index were ever not the default.
X_train_transformed = pd.DataFrame(
    si.fit_transform(X_train[continous_features]),
    columns=continous_features,
    index=X_train.index,
)
X_train = pd.concat([X_train[cat_features], X_train_transformed], axis=1)

X_test_transformed = pd.DataFrame(
    si.transform(X_test[continous_features]),
    columns=continous_features,
    index=X_test.index,
)
X_test = pd.concat([X_test[cat_features], X_test_transformed], axis=1)

X_train.head()

Unnamed: 0,attribute_0,attribute_1,attribute_2,attribute_3,loading,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,material_7,material_8,9,5,80.1,7.0,8.0,4.0,18.04,12.518,15.748,19.292,11.739,20.155,10.672,15.859,17.594,15.193,15.029,16.048444,13.034,14.684,764.1
1,material_7,material_8,9,5,84.89,14.0,3.0,3.0,18.213,11.54,17.717,17.893,12.748,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057
2,material_7,material_8,9,5,82.43,12.0,1.0,5.0,18.057,11.652,16.738,18.24,12.718,18.288,12.715,15.607,19.172085,13.798,16.711,18.631,14.094,17.946,663.376
3,material_7,material_8,9,5,101.07,13.0,2.0,6.0,17.295,11.188,18.576,18.339,12.583,19.06,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282
4,material_7,material_8,9,5,188.06,9.0,2.0,8.0,19.346,12.95,16.99,15.746,11.306,18.093,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885


In [19]:
# Name of the label column.
target = 'failure'

# Stack the train and test predictors row-wise and look at two random rows.
combined_df = pd.concat([X_train, X_test], axis=0)
combined_df.sample(n=2)

Unnamed: 0,attribute_0,attribute_1,attribute_2,attribute_3,loading,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
4857,material_5,material_6,6,4,142.04,12.0,6.0,10.0,17.206,12.664,17.308,17.644,11.328,19.742,12.522,16.117711,16.258,13.433,14.15,16.048444,14.486,18.885,701.269059
15369,material_7,material_8,5,8,84.32,0.0,9.0,3.0,19.149,11.15,17.875,18.159,12.537,18.98,12.098,15.659,18.638,11.184,14.24,17.58,16.193,13.8,799.323


In [20]:
def _add_loading_interactions(frame):
    """Add the two loading-based interaction columns to `frame` in place.

    Columns written:
      * 'loading_measurement_6'        -- loading multiplied by measurement_6
      * 'loading_measurement_11_ratio' -- loading divided by measurement_11
    """
    loading = frame['loading']
    measurement_11 = frame['measurement_11']
    # Exact zeros in the denominator are swapped for a tiny constant so the
    # division never hits zero.
    safe_denominator = np.where(measurement_11 == 0, 1e-6, measurement_11)

    frame['loading_measurement_6'] = loading * frame['measurement_6']
    frame['loading_measurement_11_ratio'] = loading / safe_denominator


# Apply the identical feature engineering to both splits.
_add_loading_interactions(X_train)
_add_loading_interactions(X_test)

In [21]:
# Names of the integer- and float-typed columns (string-typed columns are left out).
numeric_dtypes = ['int', 'float']
numerical_features = X_train.select_dtypes(include=numeric_dtypes).columns

In [22]:
# Spot-check five random test rows, including the two engineered loading_* columns.
X_test.sample(5)

Unnamed: 0,attribute_0,attribute_1,attribute_2,attribute_3,loading,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,loading_measurement_6,loading_measurement_11_ratio
307,material_5,material_6,6,4,134.68,8.0,8.0,2.0,17.802,14.123,15.718,20.47,10.312,18.434,10.571,14.771,19.172085,10.861,16.967,15.603,14.808,14.185,1009.326,2756.8996,7.024797
3070,material_5,material_6,6,4,127.826233,14.0,19.0,7.0,17.202,10.465,16.618,17.173,11.721,18.851,12.038,15.478,17.409,11.306,14.268,17.548,15.05,16.02,526.412,2195.159891,7.342537
13234,material_7,material_7,7,9,173.68,6.0,5.0,6.0,15.261,12.32,16.275,17.824,11.341,18.13,12.279,16.192,18.809,9.422,17.186,16.658,18.618,13.997,690.245,3095.67232,9.233877
19370,material_7,material_5,9,5,161.88,5.0,8.0,8.0,17.816,12.262,16.812,17.376,11.034,17.915,12.461,16.61,18.981,11.254,19.131,19.174,16.052,16.460727,701.269059,2812.82688,8.528529
15746,material_7,material_5,9,5,121.98,3.0,11.0,4.0,18.058,12.627,17.517,17.35,11.59,19.58,11.802,16.249,20.192,10.953,17.769,16.048444,14.57,18.907,772.709,2116.353,6.041006


In [23]:
# Sanity check: (rows, columns) of the training predictors after feature engineering.
X_train.shape

(26570, 25)

In [None]:
# Plot CatBoost feature importances for the full training frame.
# NOTE(review): presumably this helper also writes 'feature_importance.csv',
# which a later cell reads back -- confirm in visualization.py.
viz.plot_catboost_feature_importance(
    X_train,
    y_train,
    cat_features=cat_features,
    figsize=(32, 32),
)

In [None]:
# Create a blank DataFrame to store the results
result_df = pd.DataFrame({'Feature': X_train.columns})

In [None]:
# Calculate the mutual information scores 5 times
for i in range(5):
    mi_scores = fe.make_mi_scores_classification(X_train[numerical_features], y_train)
    result_df[f'Run_{i + 1}'] = mi_scores['MI Score']
    print(result_df.head(5))

In [None]:
# Average every column after 'Feature' row-wise, rank the features by that
# average (highest first), and persist the table next to the notebook.
result_df = (
    result_df
    .assign(Average=result_df.iloc[:, 1:].mean(axis=1))
    .sort_values(by='Average', ascending=False)
)
result_df.to_csv(f'mi_result{model_number}.csv', index=False)
result_df

In [None]:
# Horizontal bar chart of the averaged MI score per feature.
fig, ax = plt.subplots(figsize=(32, 32))
sns.barplot(data=result_df, x='Average', y='Feature', ax=ax)
ax.set_title('Average MI Score')
plt.show()

In [None]:
# Combine two feature rankings: the feature-importance table read back from
# disk and the averaged mutual-information scores held in result_df.
fi_df = pd.read_csv('feature_importance.csv')

# Inner-join on the feature name. Both frames carry an 'Average' column, so
# pandas suffixes them: 'Average_x' comes from fi_df, 'Average_y' from result_df.
merged_df = fi_df.merge(result_df, on='Feature')

# Keep only the two averages and add their row-wise mean as the combined score.
# .assign() builds a new frame, so the new column is never written into a
# column-subset of the merge result (which triggers SettingWithCopyWarning).
score_columns = ['Average_x', 'Average_y']
merged_df = (
    merged_df[['Feature'] + score_columns]
    .assign(
        FI_MI=lambda frame: frame[score_columns].mean(axis=1, skipna=True, numeric_only=True)
    )
    .sort_values(by='FI_MI', ascending=False)
)
merged_df

In [None]:
# Extra columns to exclude from modeling; currently none.
# An earlier experiment listed: attribute_2, measurement_2, measurement_7,
# measurement_12, measurement_16 (with model tag 'catboost_simpleimputer_domains').
more_dropped_columns = list()

In [None]:
# Names of the 25 top-ranked features in the combined FI/MI table.
top_ranked = merged_df['Feature'].head(25)
feature_list = top_ranked.tolist()
feature_list

In [24]:
# CatBoost classifier with a fixed seed and training logs silenced.
# No cat_features are declared, so every input column must be numeric.
model = CatBoostClassifier(
    verbose=False,
    random_state=5,
)

In [None]:
# Recursive feature elimination with 5-fold cross-validation, scored by ROC AUC.
rfecv = RFECV(estimator=model, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)

# Fit on X_train, the only training frame this notebook builds and the one the
# cross-validation and final fit are run on. (No `X_train_domain` frame exists
# in this notebook, so referencing it raises NameError on a fresh kernel.)
# NOTE(review): X_train still stores attribute_0/attribute_1 as strings; if
# feature_list contains them they need encoding (or removal) before this fit
# can succeed -- confirm.
rfecv.fit(X_train[feature_list], y_train)

In [None]:
# Table of RFECV rankings, best first. Rank 1 marks the features RFECV kept;
# higher ranks were eliminated progressively earlier.
# rfecv.ranking_ is ordered like the columns RFECV was fitted on, i.e. like
# feature_list, so the names are taken from feature_list directly instead of
# from a separate (and here undefined) `X_test_domain` frame.
rfecv_features = (
    pd.DataFrame({'Feature': feature_list, 'Ranking': rfecv.ranking_})
    .sort_values(by='Ranking')
)

rfecv_features

In [None]:
# Positions (within the fitted column order) of the features RFECV kept.
selected_features = np.flatnonzero(rfecv.support_).tolist()

In [None]:
# Names of the rank-1 features. selected_features holds positions in the order
# RFECV was fitted on (feature_list), so the names are looked up in feature_list
# itself rather than through a separate (and here undefined) `X_train_domain` frame.
rfecv_rank_1 = [feature_list[position] for position in selected_features]
rfecv_rank_1

In [25]:
# Hard-coded feature subset used for the final model; this overrides any list
# computed from RFECV above.
# NOTE(review): presumably copied from an earlier RFECV run -- confirm.
rfecv_rank_1 = ['loading_measurement_11_ratio', 'loading_measurement_6', 'loading']

In [None]:
# Shuffled, stratified 5-fold cross-validation with a fixed seed.
num_folds = 5
cv = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=5)

# ROC AUC of the model on the selected feature subset, one score per fold.
scores = cross_val_score(
    model,
    X_train[rfecv_rank_1],
    y_train,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)

# Per-fold scores followed by their mean and standard deviation.
list(scores), scores.mean(), scores.std()

In [None]:
# Train on the complete training set, restricted to the selected features.
selected_train = X_train[rfecv_rank_1]
model.fit(selected_train, y_train)

# Score the test set: keep the second predict_proba column (class index 1).
selected_test = X_test[rfecv_rank_1]
predictions = model.predict_proba(selected_test)[:, 1]

# Pair each test id with its predicted probability and write the submission file.
submission_df = pd.DataFrame({'id': df_test['id'], 'failure': predictions})
submission_df.to_csv(f'submission_{model_number}.csv', index=False)