In [1]:
import pandas as pd
import numpy as np
import eda
import visualization as viz
import feature_engineering as fe
import process_data as pr
import modeling as md

from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import seaborn as sns

model_number = 'catboost_hyperparameters_gridsearch'

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_train.sample(3)

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
19759,19759,D,120.55,material_7,material_5,6,6,3,10,13,19.595,12.961,17.221,19.054,12.229,20.33,10.197,16.732,19.195,12.632,14.427,17.671,13.832,16.995,995.879,0
613,613,A,157.34,material_7,material_8,9,5,4,3,5,18.329,9.545,17.183,18.108,10.17,18.182,11.116,16.256,17.797,,13.793,13.394,13.531,16.026,582.908,0
13711,13711,C,103.78,material_7,material_8,5,8,9,10,4,17.963,12.246,17.32,17.025,11.262,17.439,11.293,15.885,15.762,9.09,16.222,17.976,13.385,15.012,538.756,0


In [3]:
# Drop the id columns
dropped_columns = ['id', 'product_code']

df_train_model = df_train.drop(dropped_columns, axis=1)
df_test_model = df_test.drop(dropped_columns, axis=1)

# Assign train and test for X and y
X_train = df_train_model.drop('failure', axis=1)
y_train = df_train_model['failure']

X_test = df_test_model.copy()

In [4]:
cat_features = ['attribute_0', 'attribute_1', 'attribute_2', 'attribute_3']
continous_features = [cat for cat in X_train.columns if cat not in cat_features]

In [5]:
si = SimpleImputer(strategy='mean')
# Apply preprocessing to the training data
X_train_transformed = si.fit_transform(X_train[continous_features])
X_train_transformed = pd.DataFrame(X_train_transformed, columns=X_train[continous_features].columns)
X_train = pd.concat([X_train[cat_features], X_train_transformed], axis=1)

X_test_transformed = si.transform(X_test[continous_features])
X_test_transformed = pd.DataFrame(X_test_transformed, columns=X_test[continous_features].columns)
X_test = pd.concat([X_test[cat_features], X_test_transformed], axis=1)

X_train.head()

Unnamed: 0,attribute_0,attribute_1,attribute_2,attribute_3,loading,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,material_7,material_8,9,5,80.1,7.0,8.0,4.0,18.04,12.518,15.748,19.292,11.739,20.155,10.672,15.859,17.594,15.193,15.029,16.048444,13.034,14.684,764.1
1,material_7,material_8,9,5,84.89,14.0,3.0,3.0,18.213,11.54,17.717,17.893,12.748,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057
2,material_7,material_8,9,5,82.43,12.0,1.0,5.0,18.057,11.652,16.738,18.24,12.718,18.288,12.715,15.607,19.172085,13.798,16.711,18.631,14.094,17.946,663.376
3,material_7,material_8,9,5,101.07,13.0,2.0,6.0,17.295,11.188,18.576,18.339,12.583,19.06,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282
4,material_7,material_8,9,5,188.06,9.0,2.0,8.0,19.346,12.95,16.99,15.746,11.306,18.093,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885


In [6]:
target = 'failure'

combined_df = pd.concat([X_train, X_test])
combined_df.sample(2)

Unnamed: 0,attribute_0,attribute_1,attribute_2,attribute_3,loading,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
341,material_5,material_6,6,4,107.41,8.0,11.0,7.0,16.939,12.286,17.313,17.36,11.464,19.024714,11.172,16.071,20.3,11.347,15.961,17.574,12.875,17.252,726.473
9450,material_5,material_6,9,7,113.33,13.0,11.0,2.0,17.716,12.486,17.121,16.388,12.209,18.661,10.849,15.846,18.299,11.378,17.24,18.178,13.666,16.460727,701.269059


In [7]:
X_train['loading_measurement_6'] = X_train['loading'] * X_train['measurement_6']
X_train['loading_measurement_11_ratio'] = X_train['loading'] / np.where(X_train['measurement_11'] == 0, 1e-6, X_train['measurement_11'])

X_test['loading_measurement_6'] = X_test['loading'] * X_test['measurement_6']
X_test['loading_measurement_11_ratio'] = X_test['loading'] / np.where(X_test['measurement_11'] == 0, 1e-6, X_test['measurement_11'])

In [8]:
numerical_features = X_train.select_dtypes(include=['int', 'float']).columns

In [9]:
X_test.sample(5)

Unnamed: 0,attribute_0,attribute_1,attribute_2,attribute_3,loading,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,loading_measurement_6,loading_measurement_11_ratio
7965,material_5,material_6,9,7,131.0,9.0,13.0,5.0,16.644,11.354,17.927,17.264,12.79,20.643,11.443,16.417,17.152,10.823,15.973,16.048444,13.568,16.577,781.216,2261.584,7.637593
651,material_5,material_6,6,4,144.39,15.0,13.0,9.0,19.427,11.325,19.152,16.737,10.407,19.333,12.314,18.281,16.109,11.085,15.652904,15.009,11.957,17.299,589.616,2416.65543,8.963312
15234,material_7,material_7,7,9,106.54,15.0,5.0,4.0,15.337,12.458,18.648,15.849,10.872,18.701,10.717,19.274,15.894,11.899,13.968,17.849,14.995554,17.704,749.06,1688.55246,6.703158
9048,material_5,material_6,9,7,108.36,6.0,4.0,2.0,17.27,10.796,19.251,18.044,10.832,17.803,12.371,16.117711,19.75,12.516,15.517,14.598,14.658,16.063,693.272,1955.24784,5.486582
19075,material_7,material_5,9,5,121.22,1.0,12.0,11.0,17.679,12.311,17.708,18.64,11.131,18.397,10.338,16.199,20.453,9.07,15.699,16.048444,14.36,17.002,619.78,2259.5408,5.926759


In [10]:
X_train.shape

(26570, 25)

In [None]:
# viz.plot_catboost_feature_importance(X_train, y_train, cat_features=cat_features, figsize=(32, 32))

In [None]:
# # Create a blank DataFrame to store the results
# result_df = pd.DataFrame({'Feature': X_train.columns})

In [None]:
# # Calculate the mutual information scores 5 times
# for i in range(5):
#     mi_scores = fe.make_mi_scores_classification(X_train[numerical_features], y_train)
#     result_df[f'Run_{i + 1}'] = mi_scores['MI Score']
#     print(result_df.head(5))

In [None]:
# result_df['Average'] = result_df.iloc[:, 1:].mean(axis=1)
# result_df = result_df.sort_values(by='Average', ascending=False)
# result_df.to_csv(f'mi_result{model_number}.csv', index=False)
# result_df

In [None]:
# plt.figure(figsize=(32, 32))
# sns.barplot(
#     x=result_df['Average'],
#     y=result_df['Feature'],
# )

# plt.title('Average MI Score')
# plt.show()

In [None]:
# fi_df = pd.read_csv('feature_importance.csv')

# # Merge the DataFrames on col_a
# merged_df = fi_df.merge(result_df, on='Feature')

# # Print the merged DataFrame
# merged_df = merged_df[['Feature', 'Average_x', 'Average_y']]

# merged_df['FI_MI'] = merged_df.iloc[:, 1:].mean(axis=1, skipna=True, numeric_only=True)
# merged_df = merged_df.sort_values(by='FI_MI', ascending=False)
# merged_df

In [None]:
# # more_dropped_columns = ['attribute_2', 'measurement_2', 'measurement_7', 'measurement_12', 'measurement_16']
# more_dropped_columns = []

# # model_number = 'catboost_simpleimputer_domains'

In [None]:
# feature_list = merged_df.iloc[:25, :]['Feature'].tolist()
# feature_list

In [None]:
# def get_score(hyperparameter, features):
#     # For different metrics, tuning the hyperparameter and get a range for that parameter that is recorded
#     model = CatBoostClassifier(random_state=hyperparameter, verbose=False)
#     num_folds = 5
#     cv = StratifiedKFold(n_splits=num_folds, random_state=5, shuffle=True)
#     tuning_scores = cross_val_score(model, X_train[features], y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
#     return tuning_scores.mean()

In [None]:
# # Initialize CatBoost model
# # model = CatBoostClassifier(random_state=5, cat_features=['attribute_0', 'attribute_1','attribute_2', 'attribute_3'], verbose=False)
# model = CatBoostClassifier(random_state=5, verbose=False)

In [None]:
# rfecv = RFECV(estimator=model, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)

# rfecv.fit(X_train_domain[feature_list], y_train)

In [None]:
# # Get the RFECV ranking of all the features to select which ones to use
# # Rank 1 are the features best suited for modelling but rank 2 features aren't bad
# # And so on
# rfecv_features = pd.DataFrame({'Feature': list(X_test_domain[feature_list].columns),
#                                'Ranking': rfecv.ranking_})

# rfecv_features.sort_values(by='Ranking', inplace=True)

# rfecv_features

In [None]:
# # Get the index of Rank 1 features
# selected_features = [i for i, selected in enumerate(rfecv.support_) if selected]

In [None]:
# # Get a list of the rank 1 features
# rfecv_rank_1 = list(X_train_domain[feature_list].columns[selected_features])
# rfecv_rank_1

In [22]:
rfecv_rank_1 = ['loading_measurement_11_ratio', 
                'loading_measurement_6',
                'loading']

param_grid ={
    'learning_rate': [0.004, 0.005, 0.006],
    'depth': [6, 7, 8],
    'l2_leaf_reg': [7, 8, 9],
    'num_trees': [4, 5, 10],
    'subsample': [0.1, 0.2, 0.3],
    'random_state': [0, 5, 42],
}

model = CatBoostClassifier(verbose=False)

In [23]:
num_folds = 5
cv = StratifiedKFold(n_splits=num_folds, random_state=5, shuffle=True)

grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)

In [24]:
grid_search.fit(X_train[rfecv_rank_1], y_train)

In [18]:
# Get the best hyperparameters and corresponding score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and score
print("Best Hyperparameters:", best_params)
print("Best ROC AUC Score:", best_score)

Best Hyperparameters: {'depth': 8, 'l2_leaf_reg': 7, 'learning_rate': 0.004, 'num_trees': 10, 'random_state': 5, 'subsample': 0.3}
Best ROC AUC Score: 0.5880559337180867


In [None]:
# # Initialize an empty dictionary to store the results
# results = {}

# # Iterate through the learning rates and calculate scores
# for param in tuning_params:
#     score = get_score(param, rfecv_rank_1)
#     results[param] = score

# plt.figure(figsize=(12, 12))
# plt.plot(list(results.keys()), list(results.values()))
# plt.show()

In [19]:
model = CatBoostClassifier(random_state=5, learning_rate=0.004, depth=8, l2_leaf_reg=7, num_trees=10, subsample=0.3, verbose=False)

In [20]:
num_folds = 5
cv = StratifiedKFold(n_splits=num_folds, random_state=5, shuffle=True)

scores = cross_val_score(model, X_train[rfecv_rank_1], y_train, cv=cv, n_jobs=-1, scoring='roc_auc')

list(scores), scores.mean(), scores.std()

([0.5766123476543774,
  0.5935976074045246,
  0.584271836240884,
  0.5909229640095433,
  0.5948749132811045],
 0.5880559337180867,
 0.0067926953109121)

In [21]:
# Fit the model on the entire training data
# model.fit(X_train.drop(dropped_columns_low_fi, axis=1), y_train)
model.fit(X_train[rfecv_rank_1], y_train)

# Predict on the X_test data
# predictions = model.predict_proba(X_test.drop(dropped_columns_low_fi, axis=1))[:, 1]
predictions = model.predict_proba(X_test[rfecv_rank_1])[:, 1]

# Create a DataFrame for submission
submission_df = pd.DataFrame(
    {'id': df_test['id'],
    'failure': predictions})

# submission_df.head(50)
# Save to CSV for submission
submission_df.to_csv(f'submission_{model_number}.csv', index=False)