In [None]:
# Importing dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split

from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the training data from a CSV file in the working directory.
df = pd.read_csv('train_s3TEQDk.csv')
# Bare last expression: the notebook shows the column index as the cell output.
df.columns

In [None]:
# Data processing
# Separate the target column 'Is_Lead' (y) from the remaining feature columns (X).
y = df['Is_Lead']
X = df.drop(columns='Is_Lead')

### Building pipeline

In [None]:
# Helper that lists the categorical (object-dtype) feature columns, excluding 'ID'.
def categorical_values(dataset):
    """Return the names of the object-dtype columns of ``dataset``, excluding 'ID'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are inspected. It is not modified.

    Returns
    -------
    list of str
        Column names whose dtype compares equal to "object", in column order.
        The 'ID' column is never included. An empty list is returned when no
        column qualifies.
    """
    # errors='ignore' makes the drop a no-op when 'ID' is absent, so the
    # function always returns a list instead of falling through to None.
    features = dataset.drop(columns='ID', errors='ignore')
    return [cname for cname in features.columns if features[cname].dtype == "object"]

In [None]:
# Column groups used by the preprocessing transformers.
# Categorical: whatever categorical_values() selects from X.
categorical_cols = categorical_values(X)
# Numerical: columns whose dtype is int64 or float64.
numerical_cols = [
    cname
    for cname, col_dtype in X.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

In [None]:
# Preprocessing for categorical data:
# fill missing values with the most frequent category, then one-hot encode.
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not seen during fit.
# np.nan (lowercase) is used because the np.NaN alias was removed in NumPy 2.0.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing for numerical data:
# fill missing values with the column mean, then scale with RobustScaler.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaling', RobustScaler())
])

In [None]:
# Route each column group through its own transformer. Columns that appear in
# neither group are discarded (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
)

#### Incorporating the model in the pipeline

In [None]:
# Getting random forest model
from sklearn.ensemble import RandomForestClassifier
# Forest of 10 trees; random_state=0 fixes the seed so repeated runs match.
model = RandomForestClassifier(n_estimators=10, random_state=0)

In [None]:
# Chain the preprocessing step and the random forest into one estimator, so
# that fit/predict apply the same transformations before the model sees data.
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

### Train test split

In [None]:
# Hold out 20% of the rows for validation and train on the other 80%.
# random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

### model building

In [None]:
# Fit the full pipeline on the training split: the preprocessor is fitted on
# X_train and the random forest is trained on the transformed features.
my_pipeline.fit(X_train, y_train)

In [None]:
# Apply the fitted preprocessing to the validation split and predict with the
# random forest; `preds` holds the predicted class labels.
preds = my_pipeline.predict(X_test)

#### importing metrics

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# Text report for the random forest predictions on the validation split.
print('classification report', classification_report(y_test, preds))

In [None]:
# Confusion matrix for the random forest predictions on the validation split.
print('confusion matrix', confusion_matrix(y_test, preds))

#### Trying out XGBOOST 

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier left at its default hyperparameters.
model_xgb = XGBClassifier()

In [None]:
# Same preprocessing as the random forest pipeline, with XGBoost as the model.
# NOTE: this reuses the very same `preprocessor` instance as `my_pipeline`, so
# fitting this pipeline re-fits that shared object.
my_pipeline_xgboost = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model_xgb),
    ]
)

In [None]:
# Fit the XGBoost pipeline on the training split (preprocessing + model).
my_pipeline_xgboost.fit(X_train, y_train)

In [None]:
# Apply the fitted preprocessing to the validation split and predict with
# XGBoost; `preds_xgboost` holds the predicted class labels.
preds_xgboost = my_pipeline_xgboost.predict(X_test)

#### evaluating xgboost results

In [None]:
# Text report for the XGBoost predictions on the validation split.
print('classification report', classification_report(y_test, preds_xgboost))

In [None]:
# Confusion matrix for the XGBoost predictions on the validation split.
# Uses `preds_xgboost`; passing `preds` would re-print the random forest matrix.
print('confusion matrix', confusion_matrix(y_test, preds_xgboost))

#### Tuning XGBOOST model with pycaret

In [None]:
# importing xgboost
from pycaret.classification import *
import xgboost

In [None]:
# Load the train and test CSV files for the pycaret workflow.
# df_train is read from the same file as `df` at the top of the notebook.
df_train = pd.read_csv('train_s3TEQDk.csv')
df_test = pd.read_csv('test_mSzZ8RL.csv')

In [None]:
# Initialising the pycaret experiment.
# - 'Is_Lead' is the target and 'ID' is excluded from the features.
# - session_id=0 seeds the experiment so the results are reproducible across
#   runs (matches the random_state=0 used elsewhere in this notebook).
# NOTE(review): 'Vintage' is declared categorical here - confirm that is
# intended rather than treating it as a numeric feature.
clf1 = setup(data=df_train, target='Is_Lead',
             ignore_features=['ID'],
             categorical_features=['Gender', 'Region_Code', 'Occupation',
                                   'Channel_Code', 'Vintage',
                                   'Credit_Product'],
             numeric_features=['Age', 'Avg_Account_Balance'],
             session_id=0)

In [None]:
# Create an XGBoost model through pycaret.
# NOTE: this assignment rebinds the name `xgboost`, which `import xgboost`
# bound to the module earlier in the notebook; from here on `xgboost` refers
# to the object returned by create_model.
xgboost = create_model('xgboost')

In [None]:
# Fine-tune the created model; the base model in `xgboost` is kept for comparison.
tuned_xgboost = tune_model(xgboost)

In [None]:
# Print the base and tuned models so their hyperparameters can be compared.
# Labels name the actual model family (xgboost), not lgbm.
print('xgboost', xgboost)
print('tuned_xgboost', tuned_xgboost)

#### Saving predictions to a CSV file

In [None]:
# Score the test set with the tuned model, requesting raw scores.
predictions = predict_model(tuned_xgboost, data=df_test, raw_score=True)
# Full prediction frame, written without the index column.
predictions.to_csv('xgboost.csv', index=False)
# Two-column file with only 'ID' and 'Score_1'.
# NOTE(review): presumably 'Score_1' is the score for class 1, and the column
# name may depend on the installed pycaret version - confirm.
predictions[['ID', 'Score_1']].to_csv('sample_xgboost.csv', index=False)
