# Training Model with Target 1

In [None]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression as LR
import lightgbm as LGBM

#### Random Forest Classifier

In [None]:
clf = RFC(random_state=42)
params = {
    'max_depth': [2, 3, 5, 8, 10, 12, 20],
    'min_samples_leaf': [5, 10, 20, 50, 80],
    'criterion': ["gini", "entropy"]
}
clf= RandomizedSearchCV(clf, params,verbose=10,n_jobs=-1)
clf.fit(x_train_smote1, y_train_smote1)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
print(clf.best_estimator_)

RandomForestClassifier(max_depth=20, min_samples_leaf=5, random_state=42)


In [None]:
print(clf.best_score_)

0.6992893517644496


In [None]:
clf = RFC(criterion='entropy', max_depth=20,
                            min_samples_leaf=5, random_state=42)

clf.fit(x_train_smote1,y_train_smote1)
clf = CalibratedClassifierCV(clf, method="sigmoid")
clf.fit(x_train_smote1,y_train_smote1)

predict_y =clf.predict(x_train_smote1)
train_RFC_target1 = f1_score(y_train_smote1, predict_y, average='macro')
print ('The train f1_macro is:',f1_score(y_train_smote1, predict_y,average='macro'))

predict_y = clf.predict(x_val_smote1)
val_RFC_target1 = f1_score(y_val_smote1, predict_y, average='macro')
print('The cross validation f1_macro is:',f1_score(y_val_smote1, predict_y,average='macro'))

The train f1_macro is: 0.885287817946931
The cross validation f1_macro is: 0.6903847502158951


####Decision Tree classifier

In [None]:
clf = DecisionTreeClassifier(random_state=42)
params = {
    'max_depth': [2, 3, 5, 8, 10, 12, 20],
    'min_samples_leaf': [5, 10, 20, 50, 80],
    'criterion': ["gini", "entropy"]
}
clf= GridSearchCV(clf, params, verbose=10, n_jobs=-1)
clf.fit(x_train_smote1, y_train_smote1)

Fitting 5 folds for each of 70 candidates, totalling 350 fits


In [None]:
print(clf.best_estimator_)

DecisionTreeClassifier(max_depth=12, min_samples_leaf=5, random_state=42)


In [None]:
print(clf.best_score_)

0.6348534871562654


In [None]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=12,
                            min_samples_leaf=20, random_state=42)
clf.fit(x_train_smote1,y_train_smote1)
clf = CalibratedClassifierCV(clf, method="sigmoid")
clf.fit(x_train_smote1,y_train_smote1)
predict_y =clf.predict(x_train_smote1)

train_DT_target1 = f1_score(y_train_smote1, predict_y, average='macro')
print ('The train f1_macro is:',f1_score(y_train_smote1, predict_y,average='macro'))

predict_y = clf.predict(x_val_smote1)
val_DT_target1 = f1_score(y_val_smote1, predict_y, average='macro')
print('The cross validation f1_macro is:',f1_score(y_val_smote1, predict_y,average='macro'))

The train f1_macro is: 0.7555122895818899
The cross validation f1_macro is: 0.6468112964305858


####LGBM

In [None]:
clf = LGBM.LGBMClassifier(random_state=42)     # to balance the data setting is_unbalance to true

# setting parameters
params = {'max_depth':[4, 5, 6, 8, 10, 12], # max_depth shallow because it was overfitting the model very badly
          'num_leaves': [40, 60, 80, 100, 120],
          'min_samples_leaf': [10, 15, 20, 25, 30, 40],
          'learning_rate': [0.02, 0.05, 0.1, 0.2],
          'n_estimators': [50, 100, 150, 200, 300]} # keeping learning_rate low to penalize it more and to avoid overfit

# we use GridSearchCv method
# we take CV=3 to to get better results
clf= RandomizedSearchCV(clf, params, cv=10, scoring= 'f1_macro', return_train_score=True, n_jobs = -1)
clf.fit(x_train_smote1, y_train_smote1)        # we will fit vectorized x_train_tfidf to train and cross validating our data

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6309
[LightGBM] [Info] Number of data points in the train set: 23667, number of used features: 30
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [None]:
# printing best parameters
clf.best_params_

{'num_leaves': 60,
 'n_estimators': 300,
 'min_samples_leaf': 40,
 'max_depth': 12,
 'learning_rate': 0.05}

In [None]:
# now we have got best hyperparameter from train and cross validation data
clf = LGBM.LGBMClassifier(learning_rate= 0.05,
                          max_depth= 12,
                          min_samples_leaf = 80,
                          num_leaves= 250,
                          random_state=42)
# we will again train our train data with best hyperparameter
clf.fit(x_train_smote1,y_train_smote1)
clf = CalibratedClassifierCV(clf, method="sigmoid")
clf.fit(x_train_smote1,y_train_smote1)

predict_y = clf.predict(x_train_smote1)
train_LGBM_target1 = f1_score(y_train_smote1, predict_y, average='macro')
print ('The train f1_macro is:', f1_score(y_train_smote1, predict_y, average='macro'))

predict_y = clf.predict(x_val_smote1)
val_LGBM_target1 = f1_score(y_val_smote1, predict_y, average='macro')
print('The cross validation f1_macro is:',f1_score(y_val_smote1, predict_y, average='macro'))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6309
[LightGBM] [Info] Number of data points in the train set: 23667, number of used features: 30
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6310
[LightGBM] [Info] Number of data points in the train set: 18933, number of used features: 30
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6275
[LightGBM] [Info] Numb

####AutoML

In [None]:
!pip install flaml
from flaml import AutoML

Collecting flaml
  Downloading FLAML-2.0.2-py3-none-any.whl (295 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/295.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/295.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m286.7/295.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.3/295.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: flaml
Successfully installed flaml-2.0.2


In [None]:
clf = AutoML()
automl_settings = {
    "time_budget": 1600,  # in seconds
    "metric": 'macro_f1',
    "task": 'classification'
}

clf.fit(x_train_smote1, np.array(y_train_smote1).ravel(),
        **automl_settings)   # we will again train our train data with best hyperparameter

[flaml.automl.logger: 09-09 15:04:12] {1679} INFO - task = classification
[flaml.automl.logger: 09-09 15:04:12] {1690} INFO - Evaluation method: cv
[flaml.automl.logger: 09-09 15:04:12] {1788} INFO - Minimizing error metric: 1-macro_f1
[flaml.automl.logger: 09-09 15:04:12] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 09-09 15:04:12] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 09-09 15:04:14] {2344} INFO - Estimated sufficient time budget=23826s. Estimated necessary time budget=549s.
[flaml.automl.logger: 09-09 15:04:14] {2391} INFO -  at 2.7s,	estimator lgbm's best error=0.4851,	best estimator lgbm's best error=0.4851
[flaml.automl.logger: 09-09 15:04:14] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 09-09 15:04:15] {2391} INFO -  at 3.4s,	estimator lgbm's best error=0.4688,	best estimator lgbm's best error=0.4688
[flaml.automl.logger: 09-09 15:0

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 09-09 15:30:43] {2391} INFO -  at 1591.9s,	estimator lrl1's best error=0.4512,	best estimator extra_tree's best error=0.2493
[flaml.automl.logger: 09-09 15:30:43] {2218} INFO - iteration 136, current learner rf
[flaml.automl.logger: 09-09 15:30:48] {2391} INFO -  at 1596.0s,	estimator rf's best error=0.2996,	best estimator extra_tree's best error=0.2493
[flaml.automl.logger: 09-09 15:30:48] {2218} INFO - iteration 137, current learner rf
[flaml.automl.logger: 09-09 15:30:52] {2391} INFO -  at 1600.0s,	estimator rf's best error=0.2982,	best estimator extra_tree's best error=0.2493
[flaml.automl.logger: 09-09 15:30:58] {2627} INFO - retrain extra_tree for 6.6s
[flaml.automl.logger: 09-09 15:30:58] {2630} INFO - retrained model: ExtraTreesClassifier(criterion='entropy', max_features=0.27714026209389997,
                     max_leaf_nodes=4428, n_estimators=186, n_jobs=-1,
                     random_state=12032022)
[flaml.automl.logger: 09-09 15:30:58] {1930} INFO -

In [None]:
predict_y = clf.predict(x_train_smote1)
train_AutoML_target1 = f1_score(y_train_smote1, predict_y, average='macro')
print ('The train f1_macro is:', f1_score(y_train_smote1, predict_y,average='macro'))

predict_y = clf.predict(x_val_smote1)
val_AutoML_target1 = f1_score(y_val_smote1, predict_y, average='macro')
print('The cross validation f1_macro is:',f1_score(y_val_smote1, predict_y,average='macro'))

The train f1_macro is: 0.9803958758354012
The cross validation f1_macro is: 0.6546255608903655
