<a href="https://colab.research.google.com/github/jackit940/netflix_appetency/blob/main/netflix_appetency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1 id="import" style="color:white;background:#0076a8;padding:8px;border-radius:8px">Import libraries</h1>

<h1 id="Netflix" style="color:white;background:#0076a8;padding:8px;border-radius:8px"> Netflix Appetency - Identify consumer willing to subscribe </h1>

<center><img src="https://play-lh.googleusercontent.com/0rgPYj0GwZ6txpYZrzoMdhwzqg7vY6C9B-Ol7jlaz-Ox2rgpD4Tr82ZgDqkirrEohbGm"></center>

<h2>Goal:</h2>
    
Classify consumers according to their appetite to subscribe to Netflix.

<h2>Metric:</h2>

The metric used is AUC

<h2>Data:</h2>

* train.csv - the training set. it consists of an id column, the customers features, and a target column: target.
* test.csv - the test set. it consists of everything in train.csv except target.
* sample_submission.csv - a sample submission file in the correct format target=1 means that the customer subscribes to Netflix

For reasons of confidentiality, the data is anonymized and augmented.


In [212]:
#!pip install kaggle
#!pip install catboost
#!pip install optuna
#from catboost import CatBoostClassifier
#import optuna

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, MinMaxScaler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, KFold,StratifiedKFold,cross_val_predict, train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import KNNImputer
import re
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy.stats import chi2_contingency

import warnings
warnings.filterwarnings('ignore')
sns.set()

In [2]:
from google.colab import drive
ROOT = '/content/gdrive'
drive.mount(ROOT)

Mounted at /content/gdrive


In [None]:
#files.upload()
#!ls -1ha kaggle.json
#!mkdir -p ~/.kaggle
#!cp kaggle.json ~/.kaggle
#!chmod 600 ~/.kaggle/kaggle.json
#!kaggle competitions download -c netflix-appetency
#!unzip /content/netflix-appetency.zip
#!mv test.csv "./drive/MyDrive/Colab Notebooks/test.csv"
#!mv train.csv "./drive/MyDrive/Colab Notebooks/train.csv"
#!mv sample_submission.csv "./drive/MyDrive/Colab Notebooks/sample_submission.csv"

Downloading netflix-appetency.zip to /content
 69% 23.0M/33.2M [00:00<00:00, 118MB/s] 
100% 33.2M/33.2M [00:00<00:00, 113MB/s]
Archive:  /content/netflix-appetency.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [202]:
train = pd.read_csv("./gdrive/MyDrive/Colab Notebooks/train.csv")
test = pd.read_csv("./gdrive/MyDrive/Colab Notebooks/test.csv")
submission = pd.read_csv("./gdrive/MyDrive/Colab Notebooks/sample_submission.csv")

<h1 id="missings" style="color:white;background:#0076a8;padding:8px;border-radius:8px"> EDA </h1>

## 1. size and shape

train.shape, test.shape

## 2. Null vlaues

null_value = train.iloc[:,2:].isnull().sum()/train.shape[0]

null_value.describe(percentiles=[0.1,.2,.3,.4,.5,.6,.7,.8,.9,0.95,1.0])

## 3. Categorical and Numeraic features

In [203]:
Categorical_Features=train.select_dtypes(include=['object']).columns
Numeric_Features=train.select_dtypes(exclude=['object']).columns[2:]
len(Categorical_Features), len(Numeric_Features)

(92, 415)

train[Categorical_Features].head(1).values

single_cnt_value = pd.Series()
for i in Categorical_Features:
    b = train[i].value_counts()
    c = sum(b==1)
    single_cnt_value[i] = c

single_cnt_value.sort_values(ascending=False).values

max_value = train[Numeric_Features].max(axis=0)
max_value.describe(percentiles=list(np.arange(0,1.1,0.1)))

## 4. Outlier

max_value.sort_values(na_position ='last', ascending=False)[:80].values

## 5. Data skewness

skewness = pd.Series()
for i in train.columns[2:]:
    skewness[i] = train[i].value_counts().max()/train.shape[0]

skewness.hist()

skewness.describe(percentiles=[.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0])

<h1 id="missings" style="color:white;background:#0076a8;padding:8px;border-radius:8px"> Feature engineering </h1>

# Define necessary function

In [204]:
feature_list_type = ['Categorical_Features','Numeric_Features']

def update_feature():
  "as features deleted during preprocessing, updates list of feature types and inform what type and how many deleted"
  for i in feature_list_type:
    temp = globals()[i]
    globals()[i] = [j for j in temp if j in train.columns]
    print (i, "===  before : {}, after : {}".format(len(temp),len(globals()[i])))

def train_test(func, *args):
  "apply function to and transform both train and test set"
  print ("before : ",train.shape, test.shape)
  a = func(train,*args)
  b = func(test,*args)
  print ("After : ",train.shape, test.shape)
  return a,b


# 1. Time Series Feature transformed to Time Series Categorical features

In [205]:
def finding_date_format_features(df, features):
  "to find any features indicative of date format"
  match_str = '(\d{2,4}(-|\/|\\|\.| )\d{2}(-|\/|\\|\.|)\d{2,4})+'
  temp = df[features].astype(str).apply(lambda x : x.str.match(match_str).any())
  return Categorical_Features[temp]

def date_split(df, features):
  """transform date feature in objevtive format to date-format and split date-features into year month week day respectively
  and delete orifinal feature"""
  for col in features:
    df[col] = pd.to_datetime(df[col])
    df[col+"_year"]=pd.DatetimeIndex(df[col]).year
    df[col+"_month"]=pd.DatetimeIndex(df[col]).month
    df[col+"_week"]=pd.DatetimeIndex(df[col]).week
    df[col+"_day"]=pd.DatetimeIndex(df[col]).day
    df.drop(col,axis=1, inplace = True)
  return df

mask = train[Categorical_Features].astype(str).apply(lambda x : x.str.match('(\d{2,4}(-|\/|\\|\.| )\d{2}(-|\/|\\|\.| )\d{2,4})+').any())
datetime_Features=train[Categorical_Features].loc[:,mask]
train[datetime_Features.columns]= train[datetime_Features.columns].apply(pd.to_datetime,dayfirst=False)
test[datetime_Features.columns]= test[datetime_Features.columns].apply(pd.to_datetime,dayfirst=False)
datetime_tmp=train.select_dtypes(include=['datetime'])
for col in datetime_tmp.columns:
    train[col+"_year"]=pd.DatetimeIndex(train[col]).year
    train[col+"_month"]=pd.DatetimeIndex(train[col]).month
    train[col+"_week"]=pd.DatetimeIndex(train[col]).week
    train[col+"_day"]=pd.DatetimeIndex(train[col]).day
    test[col+"_year"]=pd.DatetimeIndex(test[col]).year
    test[col+"_month"]=pd.DatetimeIndex(test[col]).month
    test[col+"_week"]=pd.DatetimeIndex(test[col]).week
    test[col+"_day"]=pd.DatetimeIndex(test[col]).day
train.drop(list(datetime_tmp.columns),axis=1,inplace=True)
test.drop(list(datetime_tmp.columns),axis=1,inplace=True)

In [206]:
datetime_Features = finding_date_format_features(train, Categorical_Features)
train, test = train_test(date_split, datetime_Features)

rex = re.compile(".*year|.*month|.*week|.*day")
Categorical_Features_TimeSeries = list(filter(rex.findall, train.columns))
Categorical_Features = [i for i in Categorical_Features if i not in Categorical_Features_TimeSeries]
feature_list_type.append('Categorical_Features_TimeSeries')
update_feature()

before :  (70000, 509) (30000, 508)
After :  (70000, 539) (30000, 538)
Categorical_Features ===  before : 92, after : 82
Numeric_Features ===  before : 415, after : 415
Categorical_Features_TimeSeries ===  before : 40, after : 40


# 2. Categorical Feature to numeric form

train[Categorical_Features].head(1).values

In [207]:
def removing_string_in_categorical_features(df, features):
  "remove character 'c' in categorical features to transform it in numeraic type"
  df[features] = df[features].apply(lambda x : x.str.replace("C",'')).astype(int)
  return df

train, test = train_test(removing_string_in_categorical_features, Categorical_Features)

before :  (70000, 539) (30000, 538)
After :  (70000, 539) (30000, 538)


train[Categorical_Features].head(1).values

# 3. Numeric feature to Caregorical feature

In [214]:
nunique_by_feature = pd.Series()
for i in Numeric_Features:
    nunique_by_feature[i] = train[i].nunique()

nunique_by_feature = nunique_by_feature.sort_values()

In [215]:
nunique_limit = 100

nunique_by_feature.describe(percentiles=[0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0])

In [216]:
Categorical_Features_in_NumericFeatureList = nunique_by_feature[nunique_by_feature<nunique_limit].index.tolist()
Categorical_Features += Categorical_Features_in_NumericFeatureList
Numeric_Features = [i for i in Numeric_Features if i not in Categorical_Features_in_NumericFeatureList]

update_feature()

Categorical_Features ===  before : 373, after : 373
Numeric_Features ===  before : 124, after : 124
Categorical_Features_TimeSeries ===  before : 40, after : 40


# 4. Non necessary feature deletion

### Single value / All null value feature - Mandatory

In [218]:
nunique_cnt = pd.Series()
for i in train.columns[2:]:
    nunique_cnt[i] = train[i].nunique()

Single_Null_Count_Feature = nunique_cnt[nunique_cnt<2].index

train.drop(Single_Null_Count_Feature, axis =1, inplace=True)
test.drop(Single_Null_Count_Feature, axis =1, inplace=True)

print (len(Single_Null_Count_Feature),' features are delted')
update_feature()

63  features are delted
Categorical_Features ===  before : 373, after : 310
Numeric_Features ===  before : 124, after : 124
Categorical_Features_TimeSeries ===  before : 40, after : 40


### Null value portion - cutoff optional

null_cnt = train.isnull().sum()/train.shape[0]
null_cnt.describe(percentiles=[0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0])

In [220]:
null_portion_cut_off = 0.2

Feature_With_High_Null_Portion = null_cnt[null_cnt>null_portion_cut_off].index
train.drop(Feature_With_High_Null_Portion, axis=1, inplace = True)
test.drop(Feature_With_High_Null_Portion, axis=1, inplace = True)

print (sum(null_cnt>null_portion_cut_off)," feature are deleted due to null value portion over 20%")
update_feature()

##### Few features can be maintained and used in impuation stage

### Feature dominated by single value out of Numeric Features

single_val_cnt = pd.Series()
for i in train[Numeric_Features]:
    a = train[i].value_counts().max()
    single_val_cnt[i] = a/train.shape[0]

single_val_cnt.describe(percentiles=[0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0])

single_val_cnt_cut_off = 0.7
print (sum(single_val_cnt>single_val_cnt_cut_off), " features are deleted dut to high dominance by single value")

drop_list_single_val_cnt = single_val_cnt[single_val_cnt > single_val_cnt_cut_off].index

train.drop(drop_list_single_val_cnt, axis=1, inplace = True)
test.drop(drop_list_single_val_cnt, axis=1, inplace = True)
update_feature()

##### Features dominated by signle value can be considered as sparse matrix

### High Cardinalty - cutoff optional for Categorical features

high_car = pd.Series()
for i in Categorical_Features+Categorical_Features_TimeSeries:
    b = train[i].value_counts()
    high_car[i] = sum(b[b==1])

high_car.sort_values(ascending=False)

##### This can be processed as NAN and imputed in the imputation stage or processed as single categoru

### Chi-Square Test of independence for categorical features

In [224]:
train_idx = train[['id','target']]
train.drop(['id','target'], axis=1, inplace = True)
test_idx = test[['id']]
test.drop(['id'], axis=1, inplace = True)

In [225]:
def chi2_check(df, features, alpha = 0.05):
  fail_to_meet_chi2 = pd.Series()
  for i in features:
    ch_score = chi2_contingency(pd.crosstab(train_idx['target'], train[i]))[1]
    if ch_score >= alpha:
      fail_to_meet_chi2[i] = ch_score
  
  return fail_to_meet_chi2

In [228]:
high_ch2_score = chi2_check(train, Categorical_Features+Categorical_Features_TimeSeries, 0.05)
Features_not_met_ch2 = high_ch2_score.index

train.drop(Features_not_met_ch2, axis=1, inplace= True)
test.drop(Features_not_met_ch2, axis=1, inplace= True)
print (len(Features_not_met_ch2), " features are deleted dut to low ch2 score")

update_feature()

# 5. Outlier

# 5.5 Scaling

In [20]:
scaler = MinMaxScaler()
scaler.fit(train)
train_scaled = scaler.transform(train)
test_scaled = scaler.transform(test)

In [21]:
train = pd.DataFrame(data=train_scaled, columns = train.columns)
test = pd.DataFrame(data=test_scaled, columns = train.columns)

# 6. Missing values
* mode and mean
* knn
* MICE

### Null value count

In [222]:
Null_val_cnt = train.isnull().sum()/train.shape[0]
Null_val_cnt.sort_values(ascending=False)

feature_200_month    0.184514
feature_200_year     0.184514
feature_200_day      0.184514
feature_200_week     0.184514
feature_206          0.027671
                       ...   
feature_217          0.000000
feature_216          0.000000
feature_215          0.000000
feature_214          0.000000
id                   0.000000
Length: 438, dtype: float64

### Imputation -mean and mode

In [None]:
for col in Numeric_Features:
    train[col].fillna(train[col].median(),inplace=True)
    test[col].fillna(test[col].median(),inplace=True)

In [None]:
for col in Categorical_Features+Categorical_Features_TimeSeries:
    train[col].fillna(train[col].mode()[0],inplace=True)
    test[col].fillna(test[col].mode()[0],inplace=True)

### Imputation - MICE

In [None]:
train['type'] = 'train'
test['type'] = 'test'
df = pd.concat([train,test])
col_type = df[['type']]
df.drop(['type'],axis=1, inplace = True)

In [None]:
lr_impute = LinearRegression()
imp = IterativeImputer(estimator = lr_impute)
%%time
imp.fit(df)
%%time
transformed = imp.transform(df)

df_mice = pd.DataFrame(transformed, index = df.index, columns = df.columns)
#df_mice.to_csv('./gdrive/MyDrive/Colab Notebooks/netflix_mice_intermediate.csv')
#df_mice = pd.read_csv('./gdrive/MyDrive/Colab Notebooks/netflix_mice_intermediate.csv', index = 'Unnamed: 0')
#df_mice.set_index("Unnamed: 0", inplace=True)
#df_mice.index.name = None
#train['type'] = 'train'
#test['type'] = 'test'
#col_type = pd.concat([train[['type']],test[['type']]])

df_mice['type'] = col_type
train = df_mice[df_mice.type=='train']
test = df_mice[df_mice.type=='test']
#Null_val_cnt = train.isnull().sum()/train.shape[0]
#Null_val_cnt.sort_values(ascending=False)
train.drop(['type'], axis=1, inplace=True)
test.drop(['type'], axis=1, inplace=True)

In [73]:
min_cols = list(train.columns)
min_cols.remove('type')

In [77]:
test = df[df.type=='test'][min_cols]
train = df[df.type=='train'][min_cols]

#### KNN
train['type'] = 'train'
test['type'] = 'test'
df = pd.concat([train,test])
idx_target_type = df[['id','target','type']]
df.drop(['id','target','type'],axis=1, inplace = True)


imputer = KNNImputer(n_neighbors=10)
df_imputer = imputer.fit_transform(df)

df2 = pd.DataFrame(df_imputer, columns = df.columns)
idx_target_type.reset_index(drop=True, inplace=True)
df3 = pd.concat([idx_target_type,df2], axis=1)
df3.to_csv('/kaggle/working/netflix2.csv')
df3 = pd.read_csv('/kaggle/input/lms-jw/netflix2.csv')
df3.drop(['Unnamed: 0'], axis=1, inplace=True)
train_c = df3[df3['type']=='train']
test_c = df3[df3['type']=='test']
train_c.drop(['type'], axis=1, inplace = True)
train_c.reset_index(drop=True, inplace=True)
test_c.drop(['type'], axis=1,inplace=True)
test_c.reset_index(drop=True, inplace=True)
train_c['target'] = train2['target'].astype(int)
test_c.drop(['target'], axis=1, inplace = True)

## Imputation - mean and mode by group

# 7. Correlations

### Correlation between numeric features and the target

In [24]:
corr_df=train.iloc[:,2:].corrwith(train_idx['target']).abs().sort_values(ascending=False)

plt.figure(figsize=(10,10))
print (sum(corr_df>0.08))
sns.barplot(x=corr_df[corr_df>0.08],y=corr_df[corr_df>0.08].index).set_title('Correlation between numeric features and target')
plt.xlabel('Correlation')

### Distribution 

plt.figure(figsize=(10,5))
plt.xlabel('Correlation with target')
plt.ylabel('The number of numeric features')
sns.histplot(corr_df).set_title('Distribution of numeric features in terms of correlation with the target',size=15)
plt.show()

### Features with low correlation

corr_df.describe(percentiles=[0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0])

In [25]:
corr_cut_off = 0.1

low_corr=corr_df[corr_df<corr_cut_off].index
train.drop(low_corr, axis=1, inplace = True)
test.drop(low_corr, axis=1, inplace = True)
print (len(low_corr), "features are deleted due to low correlation with target")
update_feature()

337 features are deleted due to low correlation with target
Categorical_Features ===  before : 242, after : 28
Numeric_Features ===  before : 110, after : 2
Categorical_Features_TimeSeries ===  before : 18, after : 3


## dealing with multicolinearity

In [26]:
corr_thresh = 0.9

In [27]:
def get_highly_correlated(df,threshold=0.5):
    corr_df = df.corr() # get correlations
    correlated_features = np.where(np.abs(corr_df) > threshold) # select ones above the abs threshold
    correlated_features = [(corr_df.iloc[x,y], x, y) for x, y in zip(*correlated_features) if x != y and x < y] # avoid duplication
    s_corr_list = sorted(correlated_features, key=lambda x: -abs(x[0])) # sort by correlation value
    if s_corr_list == []:
        print("There are no highly correlated features with correlation above", threshold)
    else:
        col1,col2,col3=list(),list(),list()
        for v, i, j in s_corr_list:
            col1.append(corr_df.index[i])
            col2.append(corr_df.columns[j])
            col3.append(v)
        col_list=[col1,col2,col3]
        return pd.DataFrame(col_list, index=['F1', 'F2','Correlation']).T.sort_values(by='Correlation',ascending=False)

F_corr_df= get_highly_correlated(train,threshold=corr_thresh)

high_multicolinearity = list(set(F_corr_df.F1.tolist()))

train.drop(high_multicolinearity, axis=1, inplace=True)
test.drop(high_multicolinearity, axis=1, inplace=True)
print (len(F_corr_df), ' feature are deleted due to multicolinearity')

12  feature are deleted due to multicolinearity


In [28]:
update_feature()

Categorical_Features ===  before : 28, after : 19
Numeric_Features ===  before : 2, after : 1
Categorical_Features_TimeSeries ===  before : 3, after : 3


<h1 id="encode" style="color:white;background:#0076a8;padding:8px;border-radius:8px">  Modeling </h1>

## train test split

In [78]:
X=train
y=train_idx['target']

X_train,X_valid,y_train,y_valid = train_test_split(X, y, test_size=0.2,random_state=18)

### Model tuning

In [79]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=18)
def getScore(clf):
    score = cross_val_score(clf,X_train, y_train, cv=cv, n_jobs = 1, scoring = 'roc_auc')
    return score

#Returns validation score
def get_val_score(model, train_x, train_y):
    val_proba = model.predict_proba(X_valid)[:,1]
    train_proba = model.predict_proba(train_x)[:,1]
    print ("train :",roc_auc_score(train_y,train_proba))
    print ("test :",roc_auc_score(y_valid,val_proba))
    return train_proba, val_proba

In [80]:
RF = RandomForestClassifier(n_estimators = 400, min_samples_split=60)
RF.fit(X_train,y_train)
RF_train_proba, RF_valid_proba = get_val_score(RF,X_train,y_train)

train : 0.8439919260236501
test : 0.7653749373127026


In [81]:
LGBM = LGBMClassifier()
LGBM.fit(X_train,y_train,eval_set=[(X_valid, y_valid)],early_stopping_rounds=200,verbose=100,eval_metric='auc')
LG_train_proba, LG_valid_proba = get_val_score(LGBM,X_train,y_train)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.768211	valid_0's binary_logloss: 0.508239
Did not meet early stopping. Best iteration is:
[39]	valid_0's auc: 0.769653	valid_0's binary_logloss: 0.507354
train : 0.781307586549382
test : 0.769652767414837


In [82]:
CBM = CatBoostClassifier()
CBM.fit(X_train,y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=200,verbose=100)
CB_train_proba, CB_valid_proba = get_val_score(CBM,X_train,y_train)

Learning rate set to 0.085647
0:	learn: 0.6623437	test: 0.6624980	best: 0.6624980 (0)	total: 65.9ms	remaining: 1m 5s
100:	learn: 0.5001179	test: 0.5070823	best: 0.5070521 (99)	total: 1.84s	remaining: 16.4s
200:	learn: 0.4927946	test: 0.5073126	best: 0.5069127 (125)	total: 3.54s	remaining: 14.1s
300:	learn: 0.4858906	test: 0.5079878	best: 0.5069127 (125)	total: 5.24s	remaining: 12.2s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.5069126764
bestIteration = 125

Shrink model to first 126 iterations.
train : 0.7808908724775756
test : 0.7700381994985016


In [83]:
XGB = XGBClassifier()
XGB.fit(X_train,y_train,eval_set=[(X_valid, y_valid)],early_stopping_rounds=200,verbose=100,eval_metric='auc')
XG_train_proba, XG_valid_proba = get_val_score(XGB,X_train,y_train)

[0]	validation_0-auc:0.740378
Will train until validation_0-auc hasn't improved in 200 rounds.
[99]	validation_0-auc:0.77006
train : 0.7729709848756116
test : 0.7700944896336617


In [None]:
#imputation mean and mode
1st : 76  with minumum feature
2nd : 76.8 78.3 78.2 75.9 with all features and mode&mean
3rd : 77.0 78.2 78.3 77.7 with high null value feature deleted and mode&mean
4th : 77.0 78.4 78.2 77.8 with high null value feature deleted and mode&mean and scaled
5th : 75.9 76.3 76.4 76.3 with minimum features and mice 
6th : 76.9 78.3 78.3 78.0 with all features and mice
7th : 76.5 76.9 77.0 77.0 with min feature and mice and high null value deleted

## Tuning

In [None]:
#LGBM

def objective(trial, X, y):
    param_grid = {
        
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_int("n_estimators", 3000,20000,step=20),
        "num_boost_round": trial.suggest_int("num_boost_round", 500,10000,step=20),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.02, 0.5),
        "num_leaves": trial.suggest_int("num_leaves", 500, 2000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1.0, 12.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1.0, 12.0),
        #"reg_alpha": trial.suggest_loguniform("reg_alpha", 1.0, 15.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 0.1, 10.0),
        #"reg_lambda": trial.suggest_loguniform("reg_lambda", 1.0, 15.0),
        #"min_child_sample": trial.suggest_int("min_child_sample", 40, 120, step=2),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 30, 120, step=2),
        #'scale_pos_weight': trial.suggest_float("scale_pos_weight", 0.1, 0.95, step=0.1),
        'subsample': trial.suggest_loguniform('subsample', 0.3, 0.9),
        #'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 0.9, step=0.2),
        #"gamma": trial.suggest_float("gamma", 0.2, 0.95, step=0.1),
        #"min_child_weight": trial.suggest_float("min_child_weight", 1, 10, step=1),
       
        "bagging_fraction": trial.suggest_float('bagging_fraction', 0.3, 0.8, step=0.2),
        "bagging_freq": trial.suggest_int("bagging_freq", 0,10),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95, step=0.2),
        'eval_metric': trial.suggest_categorical('eval_metric',["AUC"])
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=18)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = LGBMClassifier( **param_grid, random_state=42)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="auc",
            early_stopping_rounds=200,
            verbose=200,

        )
        preds = model.predict_proba(X_test)
        cv_scores[idx] = roc_auc_score(y_test, preds[:,1])

    return np.mean(cv_scores)


#Start tuning
study = optuna.create_study(direction="maximize", study_name="LGBM Classifier")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=20)

#Show best parameters
print(f"\tBest auc: {study.best_value:.5f}")
print(f"\tBest params:")
for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

In [None]:
study.best_params

#XGB

def objective(trial, X, y):
    param_grid = {
        
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_int("n_estimators", 5000,20000,step=10),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.02, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 10, 2000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1.0, 15.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1.0, 15.0),
        #"reg_alpha": trial.suggest_loguniform("reg_alpha", 1.0, 15.0),
        #"reg_lambda": trial.suggest_loguniform("reg_lambda", 1.0, 15.0),
        "min_child_sample": trial.suggest_int("min_child_sample", 50, 150, step=2),
        #'scale_pos_weight': trial.suggest_float("scale_pos_weight", 0.1, 0.95, step=0.1),
        'subsample': trial.suggest_loguniform('subsample', 0.4, 1.0),
        #'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        #"gamma": trial.suggest_float("gamma", 0.2, 0.95, step=0.1),
        #"min_child_weight": trial.suggest_float("min_child_weight", 1, 10, step=1),
       
        "bagging_fraction": trial.suggest_float('bagging_fraction', 0.3, 0.8, step=0.1)
        "bagging_freq": trial.suggest_int("bagging_freq", 0,10, step=0.2),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95, step=0.1),
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=18)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = XGBClassifier( **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="auc",
            early_stopping_rounds=200,
            verbose=200,

        )
        preds = model.predict_proba(X_test)
        cv_scores[idx] = roc_auc_score(y_test, preds[:,1])

    return np.mean(cv_scores)


#Start tuning
study = optuna.create_study(direction="maximize", study_name="LGBM Classifier")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=20)

#Show best parameters
print(f"\tBest auc: {study.best_value:.5f}")
print(f"\tBest params:")
for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

## Models with tuned params

In [None]:
# my result
LGBM_params = {'n_estimators': 11840,
 'num_boost_round': 4780,
 'learning_rate': 0.020752194769577355,
 'num_leaves': 1240,
 'max_depth': 5,
 'lambda_l1': 1.144507966666114,
 'lambda_l2': 9.092937992824332,
 'l2_leaf_reg': 5.4086034155921965,
 'min_data_in_leaf': 56,
 'subsample': 0.3470688170169624,
 'colsample_bylevel': 0.3,
 'bagging_fraction': 0.7,
 'bagging_freq': 5,
 'feature_fraction': 0.4,
 'eval_metric': 'AUC'}

CATBOOST_params = {'num_boost_round': 9480,
 'learning_rate': 0.03491087345019091,
 'max_depth': 4,
 'l2_leaf_reg': 1.617381591916611,
 'min_data_in_leaf': 74,
 'subsample': 0.4719790044645917,
 'colsample_bylevel': 0.9,
 'random_state': 42,
 'eval_metric': 'AUC'}

In [None]:
LGBM = LGBMClassifier(**LGBM_params)

## Validation score function

In [29]:
#Returns validation score
def get_val_score2(model):
    val_pred = model.predict_proba(X_valid)
    return roc_auc_score(y_valid,val_pred[:,1])

## LGBM Classifier

In [30]:
LGBM.fit(X_train,y_train,
         eval_set=[(X_valid, y_valid)],
         early_stopping_rounds=200,verbose=100,eval_metric='auc')
#Get score
print("="*20)
print("LGBM Validation AUC : ", get_val_score2(LGBM))

NameError: ignored

In [None]:
LGBM_pred=LGBM.predict_proba(test)

In [None]:
test_predictions = LGBM_pred[:,1]

### Create submission file

In [None]:
sample['target']=test_predictions
sample.to_csv('submission.csv',index=False)
sample.head(10)

### Credits


https://towardsdatascience.com/categorical-feature-selection-via-chi-square-fc558b09de43

https://towardsdatascience.com/kagglers-guide-to-lightgbm-hyperparameter-tuning-with-optuna-in-2021-ed048d9838b5

https://towardsdatascience.com/understanding-feature-engineering-part-1-continuous-numeric-data-da4e47099a7b
