# Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
from lightgbm import LGBMClassifier, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import RandomizedSearchCV
import scipy
import warnings

/kaggle/input/playground-series-s3e26/sample_submission.csv
/kaggle/input/playground-series-s3e26/train.csv
/kaggle/input/playground-series-s3e26/test.csv


# Importing Data

In [2]:
train_data = pd.read_csv('/kaggle/input/playground-series-s3e26/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s3e26/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s3e26/sample_submission.csv')

# Familiarising with the Data

In [3]:
train_data.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D
1,1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,3.0,C
4,4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C


In [4]:
train_data.shape

(7905, 20)

So there are 7905 training examples and each example has 20 attributes.
We will look for null values in our data.

In [5]:
print(train_data.isnull().sum())

id               0
N_Days           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
Status           0
dtype: int64


Great! <br>
There are no missing values in our data set.

# Feature Scaling
We will scale our numeric data.<br>
First, let's find the numeric data in our dataset.

In [6]:
print(train_data.select_dtypes(include='number').columns)

Index(['id', 'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
       'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
       'Stage'],
      dtype='object')


The attributes which are numeric in our dataset are
* N_Days
* Age
* Bilirubin
* Cholestrol
* Albumin
* Copper
* Alk_Phos
* SGOT
* Triglicerides
* Platelets
* Prothrombin
* Stage

We will be scaling these attributes using the MinMaxScaler.

In [7]:
numeric_cols = ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
       'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
       'Stage']
scaler = MinMaxScaler()
train_data[numeric_cols] = scaler.fit_transform(train_data[numeric_cols])
test_data[numeric_cols] = scaler.fit_transform(test_data[numeric_cols])
train_data.head()
test_data.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,7905,0.798906,D-penicillamine,0.531493,F,N,Y,N,N,0.032491,0.257402,0.526119,0.104452,0.099238,0.291367,0.100885,0.734531,0.258065,0.333333
1,7906,0.510517,D-penicillamine,0.282228,F,N,N,N,N,0.028881,0.326284,0.843284,0.15411,0.071316,0.291367,0.215929,0.329341,0.16129,0.333333
2,7907,0.002103,Placebo,0.186385,F,N,Y,N,Y,0.061372,0.018731,0.373134,0.071918,0.049509,0.100719,0.120354,0.301397,0.645161,1.0
3,7908,0.481489,D-penicillamine,0.572748,F,N,N,N,N,0.01083,0.104532,0.705224,0.061644,0.019523,0.230216,0.040708,0.41517,0.258065,0.333333
4,7909,0.33109,D-penicillamine,0.645916,F,N,Y,N,N,0.039711,0.094864,0.376866,0.200342,0.060486,0.228939,0.164602,0.317365,0.129032,0.0


Our data is nicely normalized now.

# Model Building
We are going to use LightGBM for this model. We will make a gradient boosting machine to detect liver cirrhosis among the patients. <br>
First we will take a look at the categorical data in our training set.

In [8]:
print(train_data.select_dtypes(include = 'category').columns)

Index([], dtype='object')


So the attributes which have categorical data are
- Drug
- Sex
- Ascites
- Hepatomegaly
- Spiders
- Edema 
- Status

In [9]:
categorical_cols1 = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Status']
categorical_cols2 = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']

Converting the categorical data to numeric using Label Encoder.

In [10]:
le = LabelEncoder()
for col in categorical_cols1:
    train_data[col] = le.fit_transform(train_data[col])
    
for col in categorical_cols2:
    test_data[col] = le.fit_transform(test_data[col])

We will now make categorical columns for the categorical data.

In [11]:
train_data[categorical_cols1] = train_data[categorical_cols1].astype('category')
test_data[categorical_cols2] = test_data[categorical_cols2].astype('category')

We will be making 2 dataframes X(training set) and y(target values). The X and y dataframes are then split into training sets and dev sets respectively.

In [12]:
X = train_data.drop('Status', axis=1)
y = train_data['Status']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

Now we will do some **feature engineering** to make our data even better.

In [13]:
poly = PolynomialFeatures(degree=2, include_bias=True)
X_train_poly = poly.fit_transform(X_train)
X_valid_poly = poly.fit_transform(X_valid)
test_data_poly = poly.fit_transform(test_data)

Follwing that we are going to be making the LightGBM Classifier, which we will use to train the classifier.

In [14]:
clf = LGBMClassifier(
    objective='multiclass',
    num_class=len(train_data['Status'].unique()),
    metric='multi_logloss',
    boosting_type='gbdt',
    colsample_bytree=0.024440530059814475,
    lambda_l1=0.851568757271025,
    lambda_l2=0.6986338677174764,
    learning_rate=0.07322492408954809,
    max_depth=7,
    min_child_samples=8,
    min_child_weight=12.618916687176323,
    min_gain_to_split=0.022185549105773583,
    n_estimators=156,
    num_leaves=23,
    subsample=0.555311369173475,
    random_state=42
)

After this, we will fit the data into the training sets.

In [15]:
clf.fit(
    X_train_poly, y_train,
)



After fitting, we predict for the validation set.

In [16]:
y_pred_proba = clf.predict_proba(X_valid_poly)

We will now generate a log loss score over our validation set to see how our model is performing.

In [17]:
log_loss_score = log_loss(y_valid, clf.predict_proba(X_valid_poly))
print(f"Log Loss: {log_loss_score}")

Log Loss: 0.4384276763381177


# Hyperparameter Tuning
We will tune our hyperparameters using GridSearchCV.<br>
First we will make a list of initial parameters.

In [18]:
initial_params = {
    'colsample_bytree':0.13126831193274197,
    'lambda_l1':0.9052730145788052,
    'lambda_l2':0.7535355957473608,
    'learning_rate':0.09633124078144252,
    'max_depth':9,
    'min_child_samples':5,
    'min_child_weight':12.505529390593734,
    'min_gain_to_split':0.253732206404071,
    'n_estimators':96,
    'num_leaves':24,
    'subsample':0.44919946959336043,
}

Next, we will make a dictionary which will contain the range of values which RandomizedSearchCV will use.

In [19]:
param_dist = {
    'max_depth': scipy.stats.randint(low=initial_params['max_depth'] - 2, high=initial_params['max_depth'] + 2),
    'min_child_samples': scipy.stats.randint(low=initial_params['min_child_samples'] - 2, high=initial_params['min_child_samples'] + 2),
    'min_child_weight': scipy.stats.uniform(loc=initial_params['min_child_weight'] - 2, scale=4),
    'n_estimators': scipy.stats.randint(low=80, high=200),
    'lambda_l1': scipy.stats.uniform(loc=max(0, initial_params['lambda_l1'] - 0.2), scale=0.4),
    'lambda_l2': scipy.stats.uniform(loc=max(0, initial_params['lambda_l2'] - 0.2), scale=0.4),
    'min_gain_to_split': scipy.stats.uniform(loc=initial_params['min_gain_to_split'] - 0.2, scale=0.4),  # Adjusted range for min_gain_to_split
    'num_leaves': scipy.stats.randint(low=initial_params['num_leaves'] - 5, high=initial_params['num_leaves'] + 5),
    'learning_rate': scipy.stats.uniform(loc=initial_params['learning_rate'] - 0.02, scale=0.04),
    'colsample_bytree': scipy.stats.uniform(loc=initial_params['colsample_bytree'] - 0.1, scale=0.2),
    'subsample': scipy.stats.uniform(loc=initial_params['subsample'] - 0.1, scale=0.2),
}

We will then use the RandomizedSearchCV to find the better hyperparameters.

In [20]:
%%time
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=5,
    scoring='neg_log_loss',
    cv=10,
    n_jobs=-1
)
random_search.fit(X_train_poly, y_train)

CPU times: user 3min 20s, sys: 1min 14s, total: 4min 34s
Wall time: 2min 34s


In [21]:
print("The best hyperparameters are")
for key, value in random_search.best_params_.items():
    print(f"    '{key}':{value},")

The best hyperparameters are
    'colsample_bytree':0.039428481081836456,
    'lambda_l1':0.8601081651545851,
    'lambda_l2':0.8822533541647981,
    'learning_rate':0.08579379178727162,
    'max_depth':7,
    'min_child_samples':5,
    'min_child_weight':11.33502865866598,
    'min_gain_to_split':0.3431799022456703,
    'n_estimators':139,
    'num_leaves':21,
    'subsample':0.357765077818825,


We will create a function which formats our best parameters dictionary into a list so that we can plug it directly into the GBMClassfier.

In [22]:
def dict_to_list(best_hyperparameters):
    formatted_list = [f"    {key}={value}," if not isinstance(value, int) else f"    {key}={value}," for key, value in best_hyperparameters.items()]
    return '\n'.join(formatted_list)

par_list = dict_to_list(random_search.best_params_)
print(par_list)

    colsample_bytree=0.039428481081836456,
    lambda_l1=0.8601081651545851,
    lambda_l2=0.8822533541647981,
    learning_rate=0.08579379178727162,
    max_depth=7,
    min_child_samples=5,
    min_child_weight=11.33502865866598,
    min_gain_to_split=0.3431799022456703,
    n_estimators=139,
    num_leaves=21,
    subsample=0.357765077818825,


The hyperparameters recieved are then plugged into the LightGBM model.

# Predictions

Now we will predict the data using the data in the test set test set.

In [23]:
ans = clf.predict_proba(test_data_poly)

# Submission

In [24]:
test_id = test_data['id']
submission_df = pd.DataFrame({
    'id': test_id,
    'Status_C': ans[:, 0],
    'Status_CL': ans[:, 1],
    'Status_D': ans[:, 2]
})
submission_df.to_csv('submission.csv', index=False)
print('Your submission was saved.')

Your submission was saved.


Finally the data is saved into the submission file.