In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !pip install langdetect
# !pip install deep_translator
!pip install bayesian-optimization

In [None]:
#import required packages
#basics
import pandas as pd 
import numpy as np

#misc
import gc
import time
import warnings
from joblib import dump
import logging
import subprocess
from copy import deepcopy
from tqdm import tqdm

#stats
import scipy
import scipy.stats as ss

#viz
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns
from wordcloud import WordCloud ,STOPWORDS
from PIL import Image
import matplotlib_venn as venn

#nlp
import string
import re    #for regex


import spacy
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
# Tweet tokenizer does not split at apostophes which is what we want
from nltk.tokenize import word_tokenize, TweetTokenizer
# Make sure to download the needed resources if you haven't already.
# Download and unzip wordnet
try:
    nltk.data.find('wordnet.zip')
except LookupError:
    # nltk.data.find signals a missing resource with LookupError; catching only that
    # keeps unrelated failures (e.g. an interrupted kernel) from being swallowed.
    nltk.download('wordnet', download_dir='/kaggle/working/')
    # Unzip the downloaded archive in place, then make the directory visible to NLTK.
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet


#FeatureEngineering
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import log_loss, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
# from langdetect import detect
# from deep_translator import GoogleTranslator

#Modelling
import xgboost as xgb
from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization

Initialising functions that we'll use.

In [None]:
def read_data():
    """Load the competition train and test CSV files.

    Missing values in either file are replaced with the string "unknown".

    Returns:
        tuple: ``(train, test)`` as pandas DataFrames.
    """
    train_path = '../input/jigsaw-toxic-comment-classification/train.csv'
    test_path = '../input/jigsaw-toxic-comment-classification/test.csv'
    train, test = (pd.read_csv(path).fillna("unknown") for path in (train_path, test_path))
    return train, test

def feature_engineering(data):
    """Add hand-crafted text statistics to every DataFrame in ``data``, in place.

    Each DataFrame must have a string column ``comment_text``. The following columns
    are appended, in this order: ``total_length``, ``capitals``, ``caps_vs_length``,
    ``num_exclamation_marks``, ``num_question_marks``, ``num_punctuation``,
    ``num_symbols``, ``num_words``, ``num_unique_words``, ``words_vs_unique`` and
    ``num_smilies``.

    The two ratio columns are NaN where their denominator is zero (an empty comment,
    or a comment with no whitespace-separated words); callers are expected to impute.

    Parameters:
        data (list): DataFrames to augment; each one is modified in place.

    Returns:
        list: The same ``data`` object that was passed in.
    """
    for element in data:
        text = element['comment_text']

        element['total_length'] = text.apply(len)
        element['capitals'] = text.apply(lambda comment: sum(1 for c in comment if c.isupper()))
        # Vectorised division: an empty comment gives 0/0 -> NaN here, whereas a
        # row-wise Python float division would raise ZeroDivisionError.
        element['caps_vs_length'] = (
            element['capitals'].astype(float) / element['total_length'].astype(float)
        )
        element['num_exclamation_marks'] = text.apply(lambda comment: comment.count('!'))
        element['num_question_marks'] = text.apply(lambda comment: comment.count('?'))
        element['num_punctuation'] = text.apply(lambda comment: sum(comment.count(w) for w in '.,;:'))
        element['num_symbols'] = text.apply(lambda comment: sum(comment.count(w) for w in '*&$%'))
        element['num_words'] = text.apply(lambda comment: len(comment.split()))
        element['num_unique_words'] = text.apply(lambda comment: len(set(comment.split())))
        element['words_vs_unique'] = element['num_unique_words'] / element['num_words']
        element['num_smilies'] = text.apply(
            lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)'))
        )

    return data

def scale_features(data, col):
    """Min-max scale the columns ``col`` of every DataFrame in ``data``, in place.

    The scaler is fitted once, on the first DataFrame in ``data`` (the training set
    at this notebook's call site), and that same fitted scaler is then applied to
    every DataFrame. Fitting a separate scaler per DataFrame would map the same raw
    value to different scaled values in train and test. Values in later DataFrames
    that fall outside the range seen in the first one are therefore scaled outside
    ``[0, 1]``.

    Parameters:
        data (list): DataFrames to scale; the first one is the fitting reference.
        col (list): Names of the numeric columns to scale.

    Returns:
        list: The same ``data`` object that was passed in.
    """
    if len(data) == 0:
        return data

    scaler = MinMaxScaler().fit(data[0][col])

    for element in data:
        element[col] = scaler.transform(element[col])

    return data


# Shared NLP helpers, created once here and reused by clean_tweet() on every comment.
stop_words_set = set(stopwords.words("english"))  # used for membership tests in clean_tweet
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()

# Pre-compiled patterns applied, in this order, by clean_tweet().
url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)  # http…/www…/https… up to the next whitespace
punctuation_pattern = re.compile(r'[^\w\s]')  # any character that is neither a word character nor whitespace
special_characters_pattern = re.compile(r'[^\x00-\x7F]+')  # runs of non-ASCII characters
extra_spaces_pattern = re.compile(r'\s+')  # runs of whitespace

def clean_tweet(tweet):
    """
    Normalise a comment into a space-separated string of lemmatised tokens.

    Steps, in order: lower-case; strip URLs; strip every character that is neither a
    word character nor whitespace; strip non-ASCII characters; collapse whitespace;
    tokenise; drop English stopwords; lemmatise each remaining token as a verb.

    Parameters:
        tweet (str): The raw text to clean.

    Returns:
        str: The cleaned text.
    """
    text = url_pattern.sub('', tweet.lower())
    text = punctuation_pattern.sub('', text)
    # Non-ASCII characters only; digits are kept.
    text = special_characters_pattern.sub('', text)
    text = extra_spaces_pattern.sub(' ', text).strip()

    kept_tokens = []
    for token in tokenizer.tokenize(text):
        if token in stop_words_set:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token, "v"))

    return " ".join(kept_tokens)

  

def build_tfidf(train_text, max_features=None):
    """Fit a word uni-/bi-gram TF-IDF vectorizer on ``train_text``.

    Parameters:
        train_text (iterable of str): Documents to learn the vocabulary and IDF from.
        max_features (int, optional): Cap on the vocabulary size; ``None`` (the
            default) keeps every term that passes the ``min_df``/``max_df`` filters.

    Returns:
        TfidfVectorizer: The fitted vectorizer.
    """
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9,
        strip_accents='unicode',
        # Real booleans: scikit-learn's parameter validation rejects the integer 1 here.
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        max_features=max_features,
    )
    return tfidf_vectorizer.fit(train_text)

def build_model(train_X, train_y, test_X, test_y=None, seed_val=42, num_rounds=500):
    """Train a binary-logistic XGBoost booster with the native ``xgb.train`` API.

    Parameters:
        train_X: Training feature matrix accepted by ``xgb.DMatrix``.
        train_y: Binary training labels.
        test_X: Validation feature matrix; only used when ``test_y`` is given.
        test_y: Validation labels. When provided, training is monitored on
            ``(train, test)`` and stops early after 20 rounds without improvement.
        seed_val (int): Random seed passed to XGBoost.
        num_rounds (int): Maximum number of boosting rounds.

    Returns:
        xgb.Booster: The trained booster.
    """
    param = {
        'objective': 'binary:logistic',
        'eta': 0.1,
        'max_depth': 6,
        'verbosity': 0,  # replaces the removed `silent` flag
        'eval_metric': 'auc',
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }

    xgtrain = xgb.DMatrix(train_X, label=train_y)
    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        return xgb.train(param, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)

    # Without labels there is nothing to evaluate against, so just train to num_rounds.
    return xgb.train(param, xgtrain, num_rounds)



Read the data into memory. Before splitting into training and validation sets, we will do some feature engineering to create new features. Translating non-English comments into English was explored, but that step is currently disabled (commented out in `feature_engineering`). The following features are engineered:

 ['total_length', 'capitals', 'caps_vs_length', 'num_exclamation_marks', 'num_question_marks', 'num_punctuation', 'num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique', 'num_smilies'] 
 
For these features, we'll also normalise them to ensure there are no wide-ranging values that'll degrade the model's performance.

In [None]:
train, test = read_data()
# train_mes, valid_mes, train_l, valid_l = split_data(train)

# Feature Engineering
train_features, test_features = feature_engineering([train,test])

# Impute NaNs left by the ratio features (produced where a denominator is zero).
# Each frame is imputed with its own column mean, or 0 if the whole column is NaN.
for df in [train_features, test_features]:
    cols_with_nan = df.columns[df.isna().any()].tolist()

    for nan_col in cols_with_nan:
        # Assign the result back instead of calling fillna(inplace=True) on df[nan_col]:
        # the chained in-place form is not guaranteed to modify df itself.
        if pd.notna(df[nan_col]).any(): # Check if there is any non-NaN value in the column
            df[nan_col] = df[nan_col].fillna(df[nan_col].mean())
        else:
            df[nan_col] = df[nan_col].fillna(0)



In [None]:
train.head()

In [None]:
test_features.isna().any()

We can now see that the new features we've created have been added to the DataFrame. Since we don't need the `id` column for prediction, we will drop it from both the `train_features` and `test_features` DataFrames.

In [None]:
# Drop the `id` column from both frames. errors='ignore' keeps this cell re-runnable
# once `id` has already been removed.
train_features, test_features = (
    train_features.drop(columns='id', errors='ignore'),
    test_features.drop(columns='id', errors='ignore'),
)

In [None]:
train_features.head()

We extract the target labels and training features to be used in training our model. As there are numeric features, we need to scale them to avoid wide-ranges. We can use sklearn's MinMaxScaler to achieve this.

In [None]:
# Columns are selected by position: positions 1-6 become the target labels and
# everything from position 7 onwards becomes the list of engineered feature columns.
# NOTE(review): this depends on the CSV column order after `id` is dropped —
# presumably position 0 is `comment_text`; confirm against the printed output.
labels = train_features.columns[1:7].tolist()
col = train_features.columns[7:].tolist()
print(f'training features: {col} \ntarget labels: {labels}')

In [None]:
# Preview of min-max scaling on a copy, so `train_features` itself is left unscaled here.
dummy = train_features.copy()  # DataFrame.copy() is a deep copy by default
scaler = MinMaxScaler()
# fit_transform fits and transforms in one pass; a separate fit() beforehand is redundant.
dummy[col] = scaler.fit_transform(dummy[col])
scaler_model = scaler  # same fitted object; name kept for any later reference

We can now view that they're normalised to a min-max range.

In [None]:
dummy.head()

We'll use our predefined `scale_features` function to scale the engineered features of the training data (from which the validation set is split later) and of the test set. The reason behind this is to avoid training-serving skew in the feature vector.

In [None]:
train_features, test_features = scale_features([train_features,test_features], col)

Now, let's transform the comments into TF-IDF vectors. Both the training and test sets must be vectorised with the same fitted vectorizer to avoid training-serving skew in the features. When using a TF-IDF vectorizer, it is better to fit the vectorizer on the training set only and then transform the training and test sets separately. If we fit the vectorizer on the whole corpus, we might introduce data leakage and obtain overly optimistic performance measures, because the IDF part of the training set's TF-IDF features would already include information from the test set. Remember that the training set is used for learning (learning is achieved through `fit()`), while the test set is used to evaluate whether the trained model generalises well to new, unseen data points.

Therefore, it is recommended to fit the TF-IDF vectorizer on the training set only and then call `transform` on the test set.


In [None]:
# Register tqdm's `progress_apply` on pandas objects so the cleaning pass shows a progress bar.
tqdm.pandas()
# The cleaned text goes into a new column; the raw `comment_text` column is kept as-is.
train_features['clean_comment_text'] = train_features['comment_text'].progress_apply(clean_tweet)
test_features['clean_comment_text'] = test_features['comment_text'].progress_apply(clean_tweet)


In [None]:

# TF-IDF Vectorization
# Fit on the training text only, then reuse the fitted vectorizer for both sets.
# The flags are real booleans: scikit-learn's parameter validation rejects the integer 1.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True, max_features=10000).fit(train_features['clean_comment_text'])
tfidf_matrix = tfidf_vectorizer.transform(train_features['clean_comment_text'])
tfidf_matrix_test = tfidf_vectorizer.transform(test_features['clean_comment_text'])

# # Split data into training and validation sets
# train_features, valid_features, train_labels, valid_labels = train_test_split(
#     scipy.sparse.hstack([tfidf_matrix, train_features[col].values]),
#     train_features[labels],
#     test_size=0.2,
#     random_state=42
# )

Now, let's combine the TF-IDF matrices with the existing features to form the training and test sets.

In [None]:
# Append the engineered feature columns to the right of the sparse TF-IDF matrices,
# so every row holds both the text features and the hand-crafted statistics.
X_train = scipy.sparse.hstack([tfidf_matrix, train_features[col].values])
X_test = scipy.sparse.hstack([tfidf_matrix_test, test_features[col].values])

# Both matrices should report the same number of columns.
print(f"Shape of X_train: {X_train.shape} \n"
     f"Shape of X_test: {X_test.shape}")

# Hyperparameter Tuning
This section is optional

As the shapes printed above show, the combined feature matrix is high-dimensional (up to 10,000 TF-IDF terms plus the engineered features). If we were to run a Bayesian optimisation over all of these features to find the optimal hyperparameters, each iteration would be slow and the search would take a very, very long time. We could technically do that for an optimal result; however, because this Kaggle notebook would time out, I am trading off features for training time. Thus, we need to apply a feature engineering technique called dimensionality reduction. We can achieve this with either:

1. PCA
2. T-SNE

In this case we are going to use a PCA-style reduction for dimensionality reduction (implemented below with `TruncatedSVD`, which accepts the sparse feature matrix). To find the number of principal components that balances dimensionality against explained variance ratio, we're going to use a `Scree Plot`. We can determine the number of components either through the `Elbow method` or by choosing a sensible number of principal components up to `n=20`.

In [None]:
# PCA-style reduction for the sparse matrix (TruncatedSVD).
# random_state is fixed so the components, and hence the plot, are reproducible.
pca = TruncatedSVD(n_components=20, random_state=42)
X = pca.fit_transform(X_train)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

# The x-axis is derived from the fitted result so it always matches n_components.
plt.plot(range(1, len(cumulative_variance_ratio) + 1), cumulative_variance_ratio, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Scree Plot')
plt.grid(True)
plt.show()


We'll pick `n=20` to strike a balance between dimensionality and variance.

In [None]:
# Fit the reduction on the training matrix only; random_state makes it reproducible.
pca = TruncatedSVD(n_components=20, random_state=42)
X_train_transformed = pca.fit_transform(X_train)

In [None]:
# Project the test set with the components learned from the training set.
# Calling fit_transform here would refit the SVD on the test data, so the test
# features would live in a different space from the one the models are trained on.
X_test_transformed = pca.transform(X_test)

In [None]:
# Split data into training and validation sets
# 80/20 split of the SVD-reduced features; the targets are all label columns at once.
# random_state fixes the shuffle so the split is repeatable.
train_features_transformed, valid_features_transformed, train_labels, valid_labels = train_test_split(
    X_train_transformed,
    train_features[labels],
    test_size=0.2,
    random_state=42
)

Run this block to perform hyperparameter tuning and find the best-performing parameters for our XGBoost model.

In [None]:
from bayes_opt import UtilityFunction
# Bayesian Optimization function for xgboost

def xgboost_hyper_param(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, gamma, label_index):
    """Objective for the Bayesian optimiser: mean 3-fold ROC AUC for one label.

    ``n_estimators`` and ``max_depth`` arrive as floats from the optimiser and are
    truncated to integers. The classifier is cross-validated on the module-level
    ``train_features_transformed`` against the column of ``train_labels`` selected by
    ``labels[label_index]``.

    Returns:
        float: Mean of the three cross-validated ROC AUC scores.
    """
    target = train_labels[labels[label_index]]

    model_kwargs = dict(
        objective='binary:logistic',
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        eval_metric='auc',
        n_jobs=-1,  # use all available cores
    )
    fold_scores = cross_val_score(
        XGBClassifier(**model_kwargs),
        train_features_transformed,
        target,
        cv=3,
        scoring='roc_auc',
        n_jobs=-1,
    )
    return np.mean(fold_scores)
 
# Bayesian Optimization
# Search ranges for each hyperparameter. `n_estimators` and `max_depth` are sampled as
# floats and truncated to integers inside xgboost_hyper_param.
pbounds = {
    'learning_rate': (0.001, 1.0),
    'n_estimators': (100, 1000),
    'max_depth': (5, 30),
    'subsample': (0.7,1.0),
    'colsample_bytree': (0.7, 1.0),
    'gamma': (0, 5)}

# Initialize UtilityFunction with appropriate parameters
# ('ucb' acquisition with kappa=3)
utility_function = UtilityFunction(kind='ucb', kappa=3)

# Dictionary to store the optimal parameters for each label
optimal_params_per_label = {}

# One independent optimisation run per label. The lambda forwards the sampled
# hyperparameters and pins `label_index` to the current loop index; it is only called
# by maximize() within the same iteration, so `i` is the current label's index.
# NOTE(review): every label uses random_state=1 with only 2 initial points and
# 5 iterations, so the searches are short and start identically for each label.
for i, label in enumerate(labels):
    print(f"Optimizing for label: {label}")
    optimizer = BayesianOptimization(
        f=lambda learning_rate, n_estimators, max_depth, subsample, colsample_bytree, gamma: 
        xgboost_hyper_param(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, gamma, label_index=i), 
        pbounds=pbounds, 
        random_state=1)
    
    optimizer.maximize(init_points=2, n_iter=5, acquisition_function=utility_function)
    
    # Store the optimal parameters found for the current label
    # (the two integer-valued hyperparameters are truncated to int, matching how the
    # objective function used them)
    optimal_params = optimizer.max['params']
    optimal_params['max_depth'] = int(optimal_params['max_depth'])
    optimal_params['n_estimators'] = int(optimal_params['n_estimators'])
    
    optimal_params_per_label[label] = optimal_params

# Save optimal parameters to a file or print them
print("Optimal parameters per label:")
for label, params in optimal_params_per_label.items():
    print(f"{label}: {params}")

# Optionally, you can save the optimal parameters to a file for future reference
# (written to optimal_params.json in the current working directory)
import json
with open('optimal_params.json', 'w') as f:
    json.dump(optimal_params_per_label, f)




In [None]:
# Writes the tuned parameters to optimal_params.json, overwriting any existing file.
# NOTE(review): this repeats the dump at the end of the tuning cell and relies on
# `json` and `optimal_params_per_label` from that cell, so it fails if tuning was skipped.
with open('optimal_params.json', 'w') as f:
    json.dump(optimal_params_per_label, f)

If we have a saved version of the optimal parameters, we'll load and parse the `.json` and use them. To save time here, I'm using a set of hard-coded hyperparameters that I've obtained from a previous session.

In [None]:
optimal_params_per_label = {
	"toxic": {
		"colsample_bytree": 0.8852800630620977,
		"gamma": 3.498336235897155,
		"learning_rate": 0.08419519215274332,
		"max_depth": 10,
		"n_estimators": 231,
		"subsample": 0.8043446711262454
	},
	"severe_toxic": {
		"colsample_bytree": 0.8852800630620977,
		"gamma": 3.498336235897155,
		"learning_rate": 0.08419519215274332,
		"max_depth": 10,
		"n_estimators": 231,
		"subsample": 0.8043446711262454
	},
	"obscene": {
		"colsample_bytree": 0.8852800630620977,
		"gamma": 3.498336235897155,
		"learning_rate": 0.08419519215274332,
		"max_depth": 10,
		"n_estimators": 231,
		"subsample": 0.8043446711262454
	},
	"threat": {
		"colsample_bytree": 0.9310026672723213,
		"gamma": 1.0984383363279045,
		"learning_rate": 0.01693249517530996,
		"max_depth": 16,
		"n_estimators": 478,
		"subsample": 0.8958329525959229
	},
	"insult": {
		"colsample_bytree": 0.8852800630620977,
		"gamma": 3.498336235897155,
		"learning_rate": 0.08419519215274332,
		"max_depth": 10,
		"n_estimators": 231,
		"subsample": 0.8043446711262454
	},
    "identity_hate": {
        "colsample_bytree": 0.9,
        "gamma": 3.498336235897155,
        "learning_rate": 0.1,
        "max_depth": 10,
        "n_estimators": 500,
        "subsample": 0.8043446711262454
    }
}

### Train test split

We split the training data into training and validation sets. Since it is a multilabel problem, although we really should be using a stratified split, we are not going to do it here, as the size of the dataset is quite large and it will be practically unfeasible, given this case of a multilabel problem.

In [None]:
# 80/20 split of the full (non-reduced) feature matrix; the targets are all label columns.
# Same test_size and random_state as the split of the SVD-reduced features.
x_train, x_valid, y_train, y_valid = train_test_split(
    X_train,
    train_features[labels],
    test_size=0.2,
    random_state=42
)

In [None]:
# Train one binary XGBoost classifier per label.
# NOTE: the tuned settings in `optimal_params_per_label` are currently NOT used (see the
# commented-out line in the loop); every label shares the fixed `param` below.

param = {}
param['objective'] = 'binary:logistic'
param['learning_rate'] = 0.1
param['max_depth'] = 6
param['verbosity'] = 0  # replaces the removed `silent` flag
param['min_child_weight'] = 1
param['subsample'] = 0.7
param['colsample_bytree'] = 0.7
param['n_estimators'] = 500
param['random_state'] = 42  # row/column subsampling is random; fix it for repeatable runs
# Evaluation metric and early stopping are estimator settings: recent XGBoost releases
# no longer accept them as arguments to fit().
param['eval_metric'] = 'auc'
param['early_stopping_rounds'] = 25



models = {}  # Initialize the dictionary to store your models
for label in labels:
#     params = optimal_params_per_label[label]
    model = XGBClassifier(**param)
    
    # Prepare the evaluation sets with only one label column at a time
    train_eval_set = (x_train, y_train[label])
    valid_eval_set = (x_valid, y_valid[label])
    
    # The eval_set parameter should be a list of (X, y) pairs.
    # Early stopping monitors the last entry, i.e. the validation set.
    eval_set = [train_eval_set, valid_eval_set]
    
    model.fit(
        X=x_train, 
        y=y_train[label], 
        eval_set=eval_set,
        verbose=True
    )
    
    models[label] = model





### Model Evaluation

In [None]:
from prettytable import PrettyTable

def print_metrics(models, features, labels, label_names):
    """Print a classification report per label, then a summary table of metrics.

    Parameters:
        models (dict): Fitted classifier per label name; each must provide
            ``predict`` and ``predict_proba``.
        features: Feature matrix to evaluate on.
        labels: Ground-truth targets, indexable by label name.
        label_names (iterable of str): Labels to evaluate, in table order.

    Returns:
        None. Everything is written to standard output.
    """
    table = PrettyTable()
    table.field_names = ["Label", "Log Loss", "AUC", "Precision", "Recall", "F1 Score"]

    for label in label_names:
        model = models[label]
        y_true = labels[label]
        # Probabilities feed the ranking/probabilistic metrics, hard labels the rest.
        y_pred_proba = model.predict_proba(features)[:, 1]
        y_pred = model.predict(features)

        scores = (
            log_loss(y_true, y_pred_proba),
            roc_auc_score(y_true, y_pred_proba),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred),
        )
        report = classification_report(y_true, y_pred)

        table.add_row([label] + [f"{score:.4f}" for score in scores])
        print(f"{label} \n{report}")

    print(table)

# Call the function with appropriate arguments
print_metrics(models, x_valid, y_valid, labels)

# for label in labels:
#     y_pred = models[label].predict_proba(valid_features_transformed)[:, 1]
#     loss = log_loss(valid_labels[label], y_pred)
#     print(f"Log Loss for {label}: {loss}")


It looks like the models produce many false negatives, which shows up as a low Recall score. This is largely due to how we are approaching the problem: each label is trained as an independent binary classifier on heavily imbalanced data, so the classifier tends to predict the majority (negative) class and performs poorly on `Precision` and `Recall`. One option to explore is the `scale_pos_weight` parameter of XGBClassifier, which controls the balance of positive and negative weights in a binary classification problem.

### Model Prediction

In [None]:
# Write positive-class probabilities rather than hard labels: predict() returns 0/1
# classes, which discards the ranking information that the AUC metric used in this
# notebook depends on.
preds = np.zeros((test.shape[0], len(labels)))
for i, label in enumerate(labels):
    print('predict '+ label)
    preds[:,i] = models[label].predict_proba(X_test)[:, 1]
    
# Row ids are taken from the sample submission; predictions are assumed to be in the
# same row order as test.csv.
subm = pd.read_csv('../input/jigsaw-toxic-comment-classification/sample_submission.csv')    
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns = labels)], axis=1)
submission.to_csv('xgb.csv', index=False)

# Further improvements

At submission, our models scored 0.74 on the private and public leaderboards. How can we improve?

1. We have observed that some of the classes have low recall, namely `severe_toxic`, `threat`, and `identity_hate`. All three share a common issue: a lack of data for the positive case. We need more DATA. But how? In NLP, SMOTE is problematic for a few reasons. SMOTE works in feature space, which means its output is not synthetic text but a synthetic point in feature space that does not necessarily correspond to any real document. Moreover, SMOTE relies on KNN, while the feature spaces of NLP problems are extremely high-dimensional, and KNN easily fails in such spaces. We can instead engineer an oversampling method similar to SMOTE. The workaround is:

* Ignore the majority class. Get the length distribution of all documents in the minority class so that we generate new samples according to the true document length (number of words/phrases). We assume we want to triple the size of the class (so producing k=2 synthetic documents per original document).
* Generate a sequence of n random integers according to that distribution. It's used for determining the length of new synthetic document.
* For each document: choose one integer from the random length sequence and m random documents whose length is close to that integer. Put the tokens of all m documents in a set and randomly choose n tokens, k times. These are your k new documents.
* OR: For sentences in minority labels, do a permutation of each and every word with its TOP_N similar words

2. Check for Data Drift / Training-serving Distribution skews

### Save each model

In [None]:
from joblib import dump

# Persist each per-label classifier to its own joblib file in the working directory.
for label in labels:
    dump(models[label], f'xgboost_model_{label}.joblib')


In [None]:
# # Model Training and Prediction
# col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# preds = np.zeros((test.shape[0], len(col)))

# for i, j in enumerate(col):
#     print(f'Fitting label {j}')
#     model = build_model(comments_train, train_l[j], comments_valid, valid_l[j])
#     preds[:, i] = model.predict(xgb.DMatrix(comments_test), ntree_limit=model.best_ntree_limit)

#     # Optional: Save the model for future use
#     dump(model, f'xgb_model_{j}.joblib')

# # Create Submission
# # ... (Your existing code to create submission)

Mainblock for a script.

In [None]:
# def main():
#     train, test = read_data()
#     train_mes, valid_mes, train_l, valid_l = split_data(train)
    
#     # Feature Engineering
#     train_mes, valid_mes, test = feature_engineering([train_mes, valid_mes, test])
    
#     # TF-IDF Transformation
#     tfidf_vectorizer = build_tfidf(train['comment_text'])
#     comments_train = tfidf_vectorizer.transform(train_mes)
#     comments_valid = tfidf_vectorizer.transform(valid_mes)
#     comments_test = tfidf_vectorizer.transform(test['comment_text'])
    
#     # Concatenate Features
#     # ... (Your existing code to concatenate features)
    
#     # Model Training and Prediction
#     col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
#     preds = np.zeros((test.shape[0], len(col)))
    
#     for i, j in enumerate(col):
#         print(f'Fitting label {j}')
#         model = build_model(comments_train, train_l[j], comments_valid, valid_l[j])
#         preds[:, i] = model.predict(xgb.DMatrix(comments_test), ntree_limit=model.best_ntree_limit)
        
#         # Optional: Save the model for future use
#         dump(model, f'xgb_model_{j}.joblib')
    
#     # Create Submission
#     # ... (Your existing code to create submission)

# if __name__ == "__main__":
#     main()
