In [5]:

# Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from lightgbm import LGBMClassifier
from optuna.samplers import TPESampler
import optuna
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('Train Dataset .csv')
test = pd.read_csv('Test Dataset.csv')

In [3]:
train.rename(columns={'Id':'id',
                      'Age': 'age',
                      'Sex': 'sex'
             }, inplace= True)

In [4]:
train

Unnamed: 0,id,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,16167,33,0,1,158,205,1,0,154,0,1.5,1,4,1,1
1,11275,53,1,2,198,154,0,1,104,0,0.8,2,1,0,0
2,13251,37,1,2,101,202,1,0,155,0,2.1,1,3,1,1
3,19921,75,0,0,113,306,1,2,88,1,4.9,0,2,2,1
4,11293,35,1,2,139,419,1,1,166,1,0.9,2,4,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7298,19401,30,1,2,107,177,1,2,119,0,2.7,1,0,0,0
7299,10446,42,1,2,96,551,1,2,76,0,1.9,2,3,2,1
7300,13219,51,1,0,151,165,1,0,190,1,0.9,0,0,2,1
7301,15349,29,0,0,195,287,1,2,161,1,3.4,1,1,0,1


In [6]:
# Plot the distribuition of a column
def plot_distribution(df, column):
    # Calculate value counts
    value_counts = df[column].value_counts()

    # Create a figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Bar plot on the first subplot
    sns.barplot(x=value_counts.index, y=value_counts.values, palette="viridis", ax=ax1)
    ax1.set_xlabel(column, fontsize=12)
    ax1.set_ylabel('Count', fontsize=12)
    ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right', fontsize=10)

    # Add data labels above each bar
    for index, value in enumerate(value_counts):
        ax1.text(index, value, str(value), ha='center', va='bottom', fontsize=10)

    # Pie plot on the second subplot
    ax2.pie(value_counts, labels=value_counts.index, autopct='%1.1f%%', colors=sns.color_palette("viridis", len(value_counts)))
    ax2.axis('equal')

    # Main title for the figure
    fig.suptitle(f'Comparison of {column} Distribution in Train Dataset', fontsize=18)
    
    # Adjust layout and display the figure
    plt.tight_layout()
    plt.show()

In [7]:
# Plot a histogram of a column with the amount of column for each obesity result
def plot_histograms_and_density(dataframe, column):
    fig, ax = plt.subplots(figsize=(16, 4))
    fig = sns.histplot(data=dataframe, x=column, hue="target", bins=50, kde=True)
    plt.ylim(0,500)
    plt.show()

In [8]:
# plot the histograms of the continuous columns
for column in continuous_vars:
    plot_histograms_and_density(train, column)

NameError: name 'continuous_vars' is not defined

In [10]:
# Get the data set ready
X = train.drop(columns=['target', 'id'])
y = train['target']
test = test.drop(columns=['id'])

In [11]:
# Split the training data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (5842, 13)
X_test shape: (1461, 13)
y_train shape: (5842,)
y_test shape: (1461,)


In [21]:
#The tuning process

# Define the objective function for Optuna optimization
def objective(trial, X_train, y_train, X_test, y_test):
    # Define parameters to be optimized for the LGBMClassifier
    param = {
        #"objective": "classification",
        "metric": "accuracy",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "random_state": 42,
        #"num_class": 2,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05),
        "n_estimators": trial.suggest_int("n_estimators", 400, 600),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.005, 0.015),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.02, 0.06),
        "max_depth": trial.suggest_int("max_depth", 6, 14),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.9),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
    }

    # Create an instance of LGBMClassifier with the suggested parameters
    lgbm_classifier = LGBMClassifier(**param)
    
    # Fit the classifier on the training data
    lgbm_classifier.fit(X_train, y_train)

    # Evaluate the classifier on the test data
    score = lgbm_classifier.score(X_test, y_test)

    return score



In [22]:
# Set up the sampler for Optuna optimization
sampler = optuna.samplers.TPESampler(seed=42)  # Using Tree-structured Parzen Estimator sampler for optimization

# Create a study object for Optuna optimization
study = optuna.create_study(direction="maximize", sampler=sampler)

# Run the optimization process
study.optimize(lambda trial: objective(trial, X_train, y_train, X_test, y_test), n_trials=100, show_progress_bar=True)

# Get the best parameters after optimization
best_params = study.best_params

print('='*50)
print(best_params)

[I 2024-09-27 18:33:54,035] A new study created in memory with name: no-name-73c3f3fd-83fb-4f07-9e60-bb5b07cf4854


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-09-27 18:33:55,403] Trial 0 finished with value: 0.8172484599589322 and parameters: {'learning_rate': 0.0249816047538945, 'n_estimators': 591, 'lambda_l1': 0.012319939418114049, 'lambda_l2': 0.04394633936788146, 'max_depth': 7, 'colsample_bytree': 0.3935967122017216, 'subsample': 0.8116167224336399, 'min_child_samples': 45}. Best is trial 0 with value: 0.8172484599589322.
[I 2024-09-27 18:33:56,757] Trial 1 finished with value: 0.8268309377138946 and parameters: {'learning_rate': 0.034044600469728355, 'n_estimators': 542, 'lambda_l1': 0.005205844942958024, 'lambda_l2': 0.05879639408647977, 'max_depth': 13, 'colsample_bytree': 0.4274034664069657, 'subsample': 0.8363649934414201, 'min_child_samples': 17}. Best is trial 1 with value: 0.8268309377138946.
[I 2024-09-27 18:33:57,795] Trial 2 finished with value: 0.8281998631074606 and parameters: {'learning_rate': 0.02216968971838151, 'n_estimators': 505, 'lambda_l1': 0.009319450186421156, 'lambda_l2': 0.03164916560792168, 'max_depth

[I 2024-09-27 18:34:25,475] Trial 22 finished with value: 0.8295687885010267 and parameters: {'learning_rate': 0.022778186493961516, 'n_estimators': 429, 'lambda_l1': 0.010868333177537772, 'lambda_l2': 0.054760775634552075, 'max_depth': 7, 'colsample_bytree': 0.42854440557199364, 'subsample': 0.9748303281684237, 'min_child_samples': 33}. Best is trial 4 with value: 0.8302532511978097.
[I 2024-09-27 18:34:26,768] Trial 23 finished with value: 0.8193018480492813 and parameters: {'learning_rate': 0.031246890512640613, 'n_estimators': 429, 'lambda_l1': 0.01085586827095706, 'lambda_l2': 0.05392920479638577, 'max_depth': 7, 'colsample_bytree': 0.44121991384822334, 'subsample': 0.9756120174035174, 'min_child_samples': 33}. Best is trial 4 with value: 0.8302532511978097.
[I 2024-09-27 18:34:27,715] Trial 24 finished with value: 0.8234086242299795 and parameters: {'learning_rate': 0.03600339380341649, 'n_estimators': 415, 'lambda_l1': 0.013498285580974639, 'lambda_l2': 0.05989879313923893, 'max

[I 2024-09-27 18:34:54,369] Trial 44 finished with value: 0.8227241615331964 and parameters: {'learning_rate': 0.03936648124771728, 'n_estimators': 485, 'lambda_l1': 0.008507777983690795, 'lambda_l2': 0.0427554228972516, 'max_depth': 8, 'colsample_bytree': 0.32237005368219995, 'subsample': 0.9401758693208837, 'min_child_samples': 23}. Best is trial 4 with value: 0.8302532511978097.
[I 2024-09-27 18:34:55,645] Trial 45 finished with value: 0.8288843258042436 and parameters: {'learning_rate': 0.0224491807680692, 'n_estimators': 437, 'lambda_l1': 0.009576412194321463, 'lambda_l2': 0.04928575259933143, 'max_depth': 8, 'colsample_bytree': 0.3698884737919286, 'subsample': 0.9131033293117431, 'min_child_samples': 15}. Best is trial 4 with value: 0.8302532511978097.
[I 2024-09-27 18:34:56,862] Trial 46 finished with value: 0.8220396988364134 and parameters: {'learning_rate': 0.035951899196353776, 'n_estimators': 474, 'lambda_l1': 0.012002798167680169, 'lambda_l2': 0.054309689754902746, 'max_de

[I 2024-09-27 18:35:22,754] Trial 66 finished with value: 0.8281998631074606 and parameters: {'learning_rate': 0.019334144517315223, 'n_estimators': 415, 'lambda_l1': 0.011087299101290273, 'lambda_l2': 0.039309465533875224, 'max_depth': 11, 'colsample_bytree': 0.4063776711575963, 'subsample': 0.9439027394672187, 'min_child_samples': 25}. Best is trial 4 with value: 0.8302532511978097.
[I 2024-09-27 18:35:23,952] Trial 67 finished with value: 0.8165639972621492 and parameters: {'learning_rate': 0.010038290086972995, 'n_estimators': 425, 'lambda_l1': 0.010695125059396822, 'lambda_l2': 0.045949530013574234, 'max_depth': 9, 'colsample_bytree': 0.3512103501680228, 'subsample': 0.913521336540196, 'min_child_samples': 32}. Best is trial 4 with value: 0.8302532511978097.
[I 2024-09-27 18:35:25,367] Trial 68 finished with value: 0.8145106091718002 and parameters: {'learning_rate': 0.01559229326044484, 'n_estimators': 583, 'lambda_l1': 0.010227047365796656, 'lambda_l2': 0.05375992104009446, 'max

[I 2024-09-27 18:35:51,565] Trial 88 finished with value: 0.8261464750171116 and parameters: {'learning_rate': 0.021940589144849004, 'n_estimators': 557, 'lambda_l1': 0.012197158120029302, 'lambda_l2': 0.057564125290149516, 'max_depth': 7, 'colsample_bytree': 0.4037059270716613, 'subsample': 0.9307284540360904, 'min_child_samples': 28}. Best is trial 70 with value: 0.8309377138945927.
[I 2024-09-27 18:35:53,054] Trial 89 finished with value: 0.8281998631074606 and parameters: {'learning_rate': 0.015577435767533352, 'n_estimators': 587, 'lambda_l1': 0.009386739240212456, 'lambda_l2': 0.03427990630625381, 'max_depth': 9, 'colsample_bytree': 0.37416728870385096, 'subsample': 0.8943953836270606, 'min_child_samples': 24}. Best is trial 70 with value: 0.8309377138945927.
[I 2024-09-27 18:35:54,244] Trial 90 finished with value: 0.8220396988364134 and parameters: {'learning_rate': 0.013010922010471587, 'n_estimators': 439, 'lambda_l1': 0.005000734126731096, 'lambda_l2': 0.029147449343912807, 

In [23]:
best_params

{'learning_rate': 0.01821703116658526,
 'n_estimators': 452,
 'lambda_l1': 0.009825957071267285,
 'lambda_l2': 0.04668177988264573,
 'max_depth': 8,
 'colsample_bytree': 0.42073204143753434,
 'subsample': 0.9420476585675465,
 'min_child_samples': 22}

In [24]:
lgbm_classifier = LGBMClassifier(**best_params)
lgbm_classifier.fit(X_train, y_train)
y_pred = lgbm_classifier.predict(X_test)
accuracy_score(y_test, y_pred) 

[LightGBM] [Info] Number of positive: 4740, number of negative: 1102
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 624
[LightGBM] [Info] Number of data points in the train set: 5842, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.811366 -> initscore=1.458910
[LightGBM] [Info] Start training from score 1.458910




0.8254620123203286

In [25]:
confusion_matrix(y_test, y_pred)

array([[  98,  162],
       [  93, 1108]], dtype=int64)

In [26]:
pred = lgbm_classifier.predict(test)



In [27]:
df_sub = pd.read_csv('Sample Submission.csv')
df_sub['target'] = pred
df_sub.to_csv('submission2.csv',index=False)