In [2]:
pip install flaml

Collecting flaml
  Downloading FLAML-1.2.4-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.5/260.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: flaml
Successfully installed flaml-1.2.4
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from pycaret.classification import *
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
from scipy.stats import randint as sp_randint
from flaml import AutoML

In [2]:
# Load the data
train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)

# Handle missing values in test set
for col in ['EC3', 'EC4', 'EC5', 'EC6']:
    if col not in test.columns:
        test[col] = 'missing'

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train.drop(['EC1', 'EC2'], axis=1), train[['EC1', 'EC2']], test_size=0.2, random_state=42)

# Define the models
catboost = CatBoostClassifier()
lightgbm = LGBMClassifier()
automl = AutoML()

In [3]:
# Train the models for EC1
catboost.fit(X_train, y_train['EC1'])
lightgbm.fit(X_train, y_train['EC1'])

# Get the prediction probabilities for EC1
catboost_probs = catboost.predict_proba(X_val)[:, 1]
lightgbm_probs = lightgbm.predict_proba(X_val)[:, 1]

# Average the probabilities
avg_probs_EC1 = (catboost_probs + lightgbm_probs) / 2

Learning rate set to 0.02963
0:	learn: 0.6828198	total: 71.7ms	remaining: 1m 11s
1:	learn: 0.6576028	total: 83.3ms	remaining: 41.6s
2:	learn: 0.6356844	total: 90.8ms	remaining: 30.2s
3:	learn: 0.6277486	total: 102ms	remaining: 25.5s
4:	learn: 0.6192501	total: 114ms	remaining: 22.7s
5:	learn: 0.6089390	total: 133ms	remaining: 22.1s
6:	learn: 0.5993850	total: 147ms	remaining: 20.9s
7:	learn: 0.5933277	total: 160ms	remaining: 19.8s
8:	learn: 0.5866152	total: 168ms	remaining: 18.5s
9:	learn: 0.5793957	total: 177ms	remaining: 17.5s
10:	learn: 0.5718327	total: 187ms	remaining: 16.8s
11:	learn: 0.5663910	total: 197ms	remaining: 16.2s
12:	learn: 0.5617731	total: 204ms	remaining: 15.5s
13:	learn: 0.5489639	total: 211ms	remaining: 14.9s
14:	learn: 0.5446798	total: 219ms	remaining: 14.4s
15:	learn: 0.5406541	total: 227ms	remaining: 14s
16:	learn: 0.5369813	total: 235ms	remaining: 13.6s
17:	learn: 0.5341707	total: 248ms	remaining: 13.5s
18:	learn: 0.5298519	total: 259ms	remaining: 13.4s
19:	learn:

In [4]:
avg_probs_EC1

array([0.8183708 , 0.24381802, 0.99064874, ..., 0.82539311, 0.21480521,
       0.98031165])

In [5]:
# Repeat the process for EC2
catboost.fit(X_train, y_train['EC2'])
lightgbm.fit(X_train, y_train['EC2'])

catboost_probs = catboost.predict_proba(X_val)[:, 1]
lightgbm_probs = lightgbm.predict_proba(X_val)[:, 1]

avg_probs_EC2 = (catboost_probs + lightgbm_probs) / 2

Learning rate set to 0.02963
0:	learn: 0.6794644	total: 29.9ms	remaining: 29.8s
1:	learn: 0.6477709	total: 37.6ms	remaining: 18.8s
2:	learn: 0.6374580	total: 46.8ms	remaining: 15.5s
3:	learn: 0.6257815	total: 51.8ms	remaining: 12.9s
4:	learn: 0.6157643	total: 57.8ms	remaining: 11.5s
5:	learn: 0.6054344	total: 63.5ms	remaining: 10.5s
6:	learn: 0.5819814	total: 70.2ms	remaining: 9.95s
7:	learn: 0.5736398	total: 77ms	remaining: 9.55s
8:	learn: 0.5673488	total: 82.5ms	remaining: 9.08s
9:	learn: 0.5599370	total: 87.9ms	remaining: 8.7s
10:	learn: 0.5537082	total: 93.1ms	remaining: 8.37s
11:	learn: 0.5485737	total: 102ms	remaining: 8.39s
12:	learn: 0.5433404	total: 110ms	remaining: 8.36s
13:	learn: 0.5270719	total: 118ms	remaining: 8.28s
14:	learn: 0.5208839	total: 126ms	remaining: 8.29s
15:	learn: 0.5160075	total: 135ms	remaining: 8.31s
16:	learn: 0.5126798	total: 145ms	remaining: 8.41s
17:	learn: 0.5089540	total: 157ms	remaining: 8.54s
18:	learn: 0.5054509	total: 165ms	remaining: 8.5s
19:	l

In [6]:
# Combine the probabilities for EC1 and EC2
avg_probs = np.vstack((avg_probs_EC1, avg_probs_EC2)).T

# Create a submission dataframe
submission = pd.DataFrame(avg_probs, columns=['EC1', 'EC2'])
submission['id'] = test.index
submission = submission[['id', 'EC1', 'EC2']]

# Save the submission dataframe to a csv file
submission1.to_csv('submission.csv', index=False)

ValueError: Length of values (9893) does not match length of index (2968)