In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.13-py2.py3-none-any.whl.metadata (12 kB)
Downloading lazypredict-0.2.13-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.13


In [2]:
from catboost import CatBoostClassifier,Pool
from catboost.utils import eval_metric

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/

Mounted at /content/drive
/content/drive/MyDrive


In [3]:
!ls

 Archives	 catboost_info	'Colab Notebooks'   EasyVisa.html   PGP_AI_ML_GREAT_LEARNING
 AutoViz_Plots	 Certificates	 EasyVisa.csv	    Others	    SWEETVIZ_REPORT.html


# Catboost package

In [7]:
def modelExecution():
    data = pd.read_csv('EasyVisa.csv')
    print(data.shape)

    # Map the case_status to binary values (Certified=1, Denied=0)
    data['case_status'] = data['case_status'].map({'Certified': 1, 'Denied': 0})

    # Define feature columns and target column
    features = data.drop(columns=['case_id', 'case_status'])  # Drop unnecessary columns
    target = data['case_status']

    # Convert categorical features to string type for CatBoost
    categorical_features = ['continent', 'education_of_employee', 'region_of_employment', 'unit_of_wage', 'full_time_position','has_job_experience','requires_job_training']
    for col in categorical_features:
        features[col] = features[col].astype(str)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

    #model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, cat_features=categorical_features, verbose=50)
    #model.fit(X_train, y_train)
    df_overallMetrics = pd.DataFrame()

    model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=3,
    border_count=32,
    loss_function='Logloss',
    random_strength=1,
    bagging_temperature=1,
    od_type='Iter',
    od_wait=50,
    one_hot_max_size=10,
    nan_mode='Min',
    boosting_type='Ordered',
    bootstrap_type='Bayesian',
    use_best_model=True,
    task_type='CPU'
        )

    model.fit(X_train, y_train,cat_features=categorical_features, eval_set=(X_test,y_test),verbose=50)

    important_score = model.get_feature_importance()
    feature_name = features.columns
    important_features_df = pd.DataFrame({'Features':feature_name,'Important score':important_score})
    important_features_df = important_features_df.sort_values(by='Important score', ascending=False)
    print(f'Feature Names: {important_features_df}')

    print(f'Best Estimator: {model._estimator_type}')

    pred_probablity = model.predict_proba(X_test)[:,1]
    pred_threshold = (pred_probablity > 0.5).astype(int)

    df_overallMetrics = pd.concat([df_overallMetrics,model_prediction(model,X_test,y_test,'Test dataset')])
    df_overallMetrics = pd.concat([df_overallMetrics,model_prediction(model,X_train,y_train,'Train dataset')])


    param_grid = { 'iterations': [100, 500, 1000],
                    'learning_rate': [0.001,0.01, 0.1],
                     'depth': [4, 6, 10],
                     'l2_leaf_reg': [1, 3, 5,7,9,10],
                     'bagging_temperature': [0.1, 1.0, 10.0]
                 }
    # Perform Grid Search
    random_model = CatBoostClassifier(random_state=42)
    train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)
    # random_search = random_model.randomized_search(param_distributions=param_grid,
    #                                            X=train_pool,
    #                                            y=None,cv=3,verbose=False,
    #                                            partition_random_seed=42,calc_cv_statistics=True,
    #                                            refit=True,shuffle=True,stratified=False,train_size=0.8,
    #                                            plot=True)

    random_search = random_model.grid_search(param_grid=param_grid,
                                            X=train_pool,
                                            y=None,cv=5,verbose=True,plot=True,refit=True,partition_random_seed=42,train_size=0.8,calc_cv_statistics=True)

    best_param = random_search['params']
    print(f'Best parameter: {best_param}')
    tuned_model = CatBoostClassifier(iterations=best_param['iterations'],depth = best_param['depth'],learning_rate = best_param['learning_rate'],
                                     l2_leaf_reg = best_param['l2_leaf_reg'],bagging_temperature = best_param['bagging_temperature'])
    tuned_model.fit(X_train,y_train,cat_features=categorical_features,early_stopping_rounds=50,verbose=2)



    # best_model = random_search['cv_results']['model'][-1]


    # print(f'Best model: {best_model}')


    grid_search_pred_probablity = tuned_model.predict_proba(X_test)[:,1]
    pred_threshold = (grid_search_pred_probablity > 0.5).astype(int)

    df_overallMetrics = pd.concat([df_overallMetrics,model_prediction(tuned_model,X_test,y_test,'RandomSearch Test dataset')])
    df_overallMetrics = pd.concat([df_overallMetrics,model_prediction(tuned_model,X_train,y_train,'Random search Train dataset')])

    return model,random_search,df_overallMetrics

In [8]:
def model_prediction(model, df_feature, target_df,dftype):
    pred_probablity = model.predict_proba(df_feature)[:,1]
    pred_threshold = (pred_probablity > 0.5).astype(int)

    accuracy_val = accuracy_score(target_df, pred_threshold)
    precision_val = precision_score(target_df, pred_threshold)
    f1_val = f1_score(target_df, pred_threshold)
    recall_val = recall_score(target_df,pred_threshold)
    report_class_val = classification_report(target_df,pred_threshold)

    print(f'{dftype} Accuracy: {accuracy_val}')
    print(f'{dftype} Precision: {precision_val}')
    print(f'{dftype} Recall: {recall_val}')
    print(f'{dftype} f1: {f1_val}')
    print(f'{dftype} classification report: {report_class_val}')

    df_metrics = pd.DataFrame({'Type':[dftype], 'Accuracy':[accuracy_val],'Precision':[precision_val],'Recall':[recall_val],'F1-Score':[f1_val]})
    return df_metrics

In [19]:
df_overallMetrics

Unnamed: 0,Type,Accuracy,Precision,Recall,F1-Score
0,Test dataset,0.754121,0.778816,0.882094,0.827244
0,Train dataset,0.758929,0.785064,0.880076,0.829859
0,RandomSearch Test dataset,0.755691,0.781609,0.879741,0.827777
0,Random search Train dataset,0.756917,0.785422,0.875229,0.827898


In [9]:
model,random_search,df_overallMetrics = modelExecution()

(25480, 12)
0:	learn: 0.6586310	test: 0.6574748	best: 0.6574748 (0)	total: 101ms	remaining: 1m 41s
50:	learn: 0.5112301	test: 0.5076700	best: 0.5076700 (50)	total: 3.89s	remaining: 1m 12s
100:	learn: 0.5036654	test: 0.5057242	best: 0.5055059 (95)	total: 6.03s	remaining: 53.7s
150:	learn: 0.4994306	test: 0.5047391	best: 0.5047195 (147)	total: 7.75s	remaining: 43.6s
200:	learn: 0.4960961	test: 0.5045743	best: 0.5043963 (186)	total: 8.73s	remaining: 34.7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.5043962871
bestIteration = 186

Shrink model to first 187 iterations.
Feature Names:                 Features  Important score
1  education_of_employee        39.848221
2     has_job_experience        16.442672
6   region_of_employment        14.772284
0              continent         9.549741
8           unit_of_wage         5.215409
9     full_time_position         4.598441
7        prevailing_wage         3.236414
5            yr_of_estab         2.424119
4        no_

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
629:	learn: 0.4790235	test: 0.5180213	best: 0.5179566 (607)	total: 52.9s	remaining: 31.1s
630:	learn: 0.4789645	test: 0.5180202	best: 0.5179566 (607)	total: 53.1s	remaining: 31s
631:	learn: 0.4788625	test: 0.5180135	best: 0.5179566 (607)	total: 53.2s	remaining: 31s
632:	learn: 0.4788310	test: 0.5180235	best: 0.5179566 (607)	total: 53.4s	remaining: 30.9s
633:	learn: 0.4787432	test: 0.5180316	best: 0.5179566 (607)	total: 53.5s	remaining: 30.9s
634:	learn: 0.4787296	test: 0.5180320	best: 0.5179566 (607)	total: 53.7s	remaining: 30.8s
635:	learn: 0.4787296	test: 0.5180320	best: 0.5179566 (607)	total: 53.7s	remaining: 30.7s
636:	learn: 0.4785953	test: 0.5180322	best: 0.5179566 (607)	total: 53.9s	remaining: 30.7s
637:	learn: 0.4785302	test: 0.5180508	best: 0.5179566 (607)	total: 54s	remaining: 30.6s
638:	learn: 0.4784875	test: 0.5180485	best: 0.5179566 (607)	total: 54.2s	remaining: 30.6s
639:	learn: 0.4784672	test: 0.5180495	bes

In [15]:
random_search.keys()

dict_keys(['params', 'cv_results'])

In [16]:
random_search['params']

{'bagging_temperature': 0.1,
 'depth': 10,
 'learning_rate': 0.1,
 'l2_leaf_reg': 9,
 'iterations': 100}

In [17]:
random_search['cv_results'].keys()

dict_keys(['iterations', 'test-Logloss-mean', 'test-Logloss-std', 'train-Logloss-mean', 'train-Logloss-std'])

In [20]:
df_overallMetrics

Unnamed: 0,Type,Accuracy,Precision,Recall,F1-Score
0,Test dataset,0.754121,0.778816,0.882094,0.827244
0,Train dataset,0.758929,0.785064,0.880076,0.829859
0,RandomSearch Test dataset,0.755691,0.781609,0.879741,0.827777
0,Random search Train dataset,0.756917,0.785422,0.875229,0.827898


# LAZYPREDICT

In [None]:
import lazypredict
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
def LazymodelExecution():
    data = pd.read_csv('EasyVisa.csv')
    print(data.shape)

    # Map the case_status to binary values (Certified=1, Denied=0)
    data['case_status'] = data['case_status'].map({'Certified': 1, 'Denied': 0})

    # Define feature columns and target column
    features = data.drop(columns=['case_id', 'case_status'])  # Drop unnecessary columns
    target = data['case_status']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

    clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
    models,predictions = clf.fit(X_train, X_test, y_train, y_test)
    print(models)


LazymodelExecution()

(25480, 12)


 97%|█████████▋| 31/32 [02:50<00:04,  4.55s/it]

[LightGBM] [Info] Number of positive: 13617, number of negative: 6767
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002592 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 665
[LightGBM] [Info] Number of data points in the train set: 20384, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.668024 -> initscore=0.699261
[LightGBM] [Info] Start training from score 0.699261


100%|██████████| 32/32 [02:50<00:00,  5.34s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
NuSVC                              0.76               0.70     0.70      0.75   
LGBMClassifier                     0.76               0.70     0.70      0.75   
SVC                                0.76               0.69     0.69      0.75   
BaggingClassifier                  0.71               0.67     0.67      0.71   
LogisticRegression                 0.74               0.67     0.67      0.73   
RandomForestClassifier             0.72               0.67     0.67      0.72   
LinearDiscriminantAnalysis         0.74               0.67     0.67      0.73   
CalibratedClassifierCV             0.74               0.67     0.67      0.73   
ExtraTreesClassifier               0.72               0.66     0.66      0.71   
AdaBoostClassifier                 0.74               0.66     0.66      0.72   
LabelSpreading              




# SweetViz


In [None]:
pip install sweetviz

Collecting sweetviz
  Downloading sweetviz-2.3.1-py3-none-any.whl.metadata (24 kB)
Downloading sweetviz-2.3.1-py3-none-any.whl (15.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sweetviz
Successfully installed sweetviz-2.3.1


In [None]:
import sweetviz as sv

In [None]:
def SweetVizModelExecution():
    data = pd.read_csv('EasyVisa.csv')
    print(data.shape)

    my_report = sv.analyze(data)
    my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"


SweetVizModelExecution()

(25480, 12)


                                             |          | [  0%]   00:00 -> (? left)

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


#AutoViz

In [None]:
!pip install autoviz # Install the autoviz package

Collecting autoviz
  Downloading autoviz-0.1.905-py3-none-any.whl.metadata (14 kB)
Collecting emoji (from autoviz)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting pyamg (from autoviz)
  Downloading pyamg-5.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting xgboost<1.7,>=0.82 (from autoviz)
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting pandas-dq>=1.29 (from autoviz)
  Downloading pandas_dq-1.29-py3-none-any.whl.metadata (19 kB)
Collecting hvplot>=0.9.2 (from autoviz)
  Downloading hvplot-0.11.2-py3-none-any.whl.metadata (15 kB)
Downloading autoviz-0.1.905-py3-none-any.whl (67 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.5/67.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hvplot-0.11.2-py3-none-any.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.9/161.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hD

In [None]:
import pandas as pd
from autoviz import AutoViz_Class

AV = AutoViz_Class()

In [None]:


data = pd.read_csv('EasyVisa.csv')
df = pd.DataFrame(data)

dft = AV.AutoViz(
    "",
    sep=",",
    depVar="",
    dfte=df,
    header=0,
    verbose=1,
    lowess=False,
    chart_format="html",
    max_rows_analyzed=150000,
    max_cols_analyzed=30,
    save_plot_dir=None
)

FileNotFoundError: [Errno 2] No such file or directory: 'EasyVisa.csv'

In [None]:
data = pd.read_csv('EasyVisa.csv')
print(data.shape)
filename = "EasyVisa.csv"
target_variable = "case_status"
df = pd.read_csv(filename)
dft = AV.AutoViz(
    filename,
    sep=",",
    depVar=target_variable,
    dfte=df,
    header=0,
    verbose=1,
    lowess=False,
    chart_format="html",
    max_rows_analyzed=150000,
    max_cols_analyzed=30,
    save_plot_dir=None
)

(25480, 12)
Shape of your Data Set loaded: (25480, 12)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  1
    Number of Integer-Categorical Columns =  2
    Number of String-Categorical Columns =  4
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  3
    Number of Numeric-Boolean Columns =  0
    Number of Discrete String Columns =  0
    Number of NLP String Columns =  0
    Number of Date Time Columns =  0
    Number of ID Columns =  1
    Number of Columns to Delete =  0
    11 Predictors classified...
        1 variable(s) removed since they were ID or low-information variables
        List of variables removed: ['case_id']

################ Binary_Classification pro



Saving scatterplots in HTML format
                                     





Saving distplots_cats in HTML format


Saving distplots_nums in HTML format


KDE plot is erroring due to problems with DynamicMaps. Hence it is skipped


Saving violinplots in HTML format


No date vars could be found in data set


Saving heatmaps in HTML format


Saving cat_var_plots in HTML format
                                               



Time to run AutoViz (in seconds) = 8


In [None]:
filename = "EasyVisa.csv"
target_variable = "case_status"
custom_plot_dir = "Colab Notebooks"

dft = AV.AutoViz(
    filename,
    sep=",",
    depVar=target_variable,
    dfte=None,
    header=0,
    verbose=2,
    lowess=False,
    chart_format="bokeh",
    max_rows_analyzed=150000,
    max_cols_analyzed=30,
    save_plot_dir=custom_plot_dir
)

Output hidden; open in https://colab.research.google.com to view.