# Data Pre-Processing

#### Import Packages and CSV

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import mlflow
import mlflow.sklearn
import dagshub
# Silence all library warnings so cell outputs stay readable
# (note: this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Show every column when a DataFrame is displayed.
# Uses the public `pd.set_option` entry point instead of the incidental `pd.pandas` attribute.
pd.set_option("display.max_columns", None)
# Create Dataframe
df = pd.read_csv("EasyVisa.csv")
# Print shape of dataset
print(df.shape)

In [None]:
# Point MLflow at the tracking URI of the DagsHub repository used by this notebook
mlflow.set_tracking_uri("https://dagshub.com/kalehariprasad/Visa-approval-prediction.mlflow")
# Initialise the DagsHub integration for the same repository with MLflow enabled
dagshub.init(repo_owner='kalehariprasad', repo_name='Visa-approval-prediction', mlflow=True)


## Data Cleaning

### Handling Missing values

* Handling Missing values 
* Handling Duplicates
* Check data type
* Understand the dataset

#### Check Null Values

In [50]:
# Columns that contain at least one NaN, printed with the percentage of rows affected
features_with_na = [column for column in df.columns if df[column].isnull().any()]
for column in features_with_na:
    missing_pct = np.round(df[column].isnull().mean() * 100, 5)
    print(column, missing_pct, '% missing values')

In [None]:
features_with_na

* **There are no null values in the dataset**

### 3.2 Other Data Cleaning steps

**Handling Duplicates**

In [None]:
df.duplicated().sum()

* **No Duplicates in the dataset**

**Remove case_id from the dataset as it cannot be used in model training**

In [53]:
# case_id is dropped from the frame in place; it is not used as a model feature
df.drop(columns=['case_id'], inplace=True)

# Feature Engineering

## Feature Extraction

In [None]:
df.head()

In [55]:
from datetime import date

# Calendar year at execution time; company age is measured against it below.
# NOTE(review): derived values therefore change when the notebook is re-run in a later year.
todays_date = date.today()
current_year = todays_date.year

In [None]:
current_year

**Subtract the year of establishment from the current year to get the company's age**

In [57]:
# Company age in years = current_year - yr_of_estab (rsub computes `other - series`)
df['company_age'] = df['yr_of_estab'].rsub(current_year)

In [None]:
df.head()

In [59]:
# yr_of_estab has been replaced by company_age, so the raw column is removed in place
df.drop(columns=['yr_of_estab'], inplace=True)

### Type of Features

**Numeric Features**

In [None]:
# Every column whose dtype is not `object` is treated as numeric
num_features = [column for column, dtype in df.dtypes.items() if dtype != 'O']
print('Num of Numerical Features :', len(num_features))

**Categorical Features**

In [None]:
# Every column stored with `object` dtype is treated as categorical
cat_features = [column for column, dtype in df.dtypes.items() if dtype == 'O']
print('Num of Categorical Features :', len(cat_features))

**Discrete features**

In [None]:
# Numeric columns with at most 25 distinct values (NaN counted as a value) are treated as discrete
discrete_features = [column for column in num_features if df[column].nunique(dropna=False) <= 25]
print('Num of Discrete Features :',len(discrete_features))

**Continuous Features**

In [None]:
# Numeric columns that were not classified as discrete above, in their original order
discrete_lookup = set(discrete_features)
continuous_features = [column for column in num_features if column not in discrete_lookup]
print('Num of Continuous Features :',len(continuous_features))

### Split X and Y

* **Split Dataframe to X and y**
* **Here we set a variable X, i.e. the independent columns, and a variable y, i.e. the dependent column `case_status`.**


In [64]:
# Target column first, then every remaining column as the feature frame
y = df['case_status']
X = df.drop(columns=['case_status'])

In [None]:
y.head()

**Manual encoding target column**

In [66]:
# Binary target: 1 when the application was 'Denied', 0 for every other status.
# Encoded from df['case_status'] rather than from `y` itself so that re-running
# this cell is idempotent; re-encoding an already-encoded `y` would compare
# integers with 'Denied' and silently produce zeros.
y = np.where(df['case_status'] == 'Denied', 1, 0)

In [None]:
y

## Feature Transformation

In [None]:
# Histograms of the three numeric columns before any scaling/transformation,
# laid out on a 2x2 grid (the fourth panel stays empty)
plt.figure(figsize=(12, 6))
columns_to_plot = ['no_of_employees','prevailing_wage','company_age']
for panel, col in enumerate(columns_to_plot, start=1):
    plt.subplot(2, 2, panel)
    sns.histplot(x=X[col], color='indianred')
    plt.xlabel(col)
    plt.tight_layout()

* The number-of-employees and company-age columns are skewed
* Apply a power transform featurewise to make data more Gaussian-like.

Power transforms are a family of parametric, monotonic transformations that are applied to make data more Gaussian-like. This is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired.

Currently, PowerTransformer supports the Box-Cox transform and the Yeo-Johnson transform.

**Checking Skewness**

**What is Skewness ?**

* Skewness refers to a distortion or asymmetry that deviates from the symmetrical bell curve, or normal distribution, in a set of data. If the curve is shifted to the left or to the right, it is said to be skewed. Skewness can be quantified as a representation of the extent to which a given distribution varies from a normal distribution. A normal distribution has a skew of zero

In [None]:
# Check Skewness
X[continuous_features].skew(axis=0, skipna=True)

- Positively skewed: company_age, no_of_employees.
- We can handle outliers and then check the skewness.

## Apply Power Transformer to check whether it can reduce the outliers

In [70]:
from sklearn.preprocessing import PowerTransformer

# Columns to run through a Yeo-Johnson power transform
transform_features = ['company_age', 'no_of_employees']
pt = PowerTransformer(method='yeo-johnson')
# Fit on, and transform, only the selected columns; the result is kept separate from X
X_copy = pt.fit_transform(X[transform_features])

In [71]:
# Wrap the transformed values in a DataFrame so the columns keep their names
X_copy = pd.DataFrame(data=X_copy, columns=transform_features)

In [None]:
# Histograms of the power-transformed columns, side by side
plt.figure(figsize=(12, 5))
for panel, col in enumerate(transform_features, start=1):
    plt.subplot(1, 2, panel)
    sns.histplot(x=X_copy[col], color='indianred')
    plt.xlabel(col)
    plt.tight_layout()

**Checking Skewness**

In [None]:
X_copy.skew(axis=0, skipna=True)

- Here Yeo-Johnson is used and it supports both positive or negative data for transformation.
- So Power Transformer with yeo-johnson can be used.

In [None]:
# Number of distinct values in each categorical column
for cat_col in cat_features:
    print(cat_col, ':', df[cat_col].nunique())

## Feature Encoding and Scaling

 **One-Hot Encoding for columns that have few unique values and are not ordinal**
* One hot encoding is a process by which categorical variables are converted into a form that could be provided to ML algorithms to do a better job in prediction.

**Ordinal Encoding for columns which have many unique categories**
* Ordinal encoding is used here because `LabelEncoder` is not supported inside a column transformer.
* Ordinal encoding is used for Ordinal Variable. Variable comprises a finite set of discrete values with a ranked ordering between values.

**Standard Scaler** 
* Standardize features by removing the mean and scaling to unit variance.

**Power Transformer**
* Power transforms are a technique for transforming numerical input or output variables to have a Gaussian or more-Gaussian-like probability distribution.

**Selecting number features for preprocessing**

In [75]:
# Names of all non-object columns of X; these are passed to the StandardScaler branch
num_features = X.select_dtypes(exclude="object").columns.tolist()

In [None]:
num_features

### **Preprocessing using Column Transformer**

In [77]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, PowerTransformer, StandardScaler

# Column groups, one per preprocessing branch
or_columns = ['has_job_experience','requires_job_training','full_time_position','education_of_employee']
oh_columns = ['continent','unit_of_wage','region_of_employment']
transform_columns = ['no_of_employees','company_age']

# One transformer object per branch, all with library defaults
# except the explicit Yeo-Johnson method
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()
ordinal_encoder = OrdinalEncoder()
transform_pipe = Pipeline(steps=[('transformer', PowerTransformer(method='yeo-johnson'))])

# Column transformer with four branches, each applied to its own column list.
# NOTE(review): transform_columns presumably also appear in num_features (built from
# all non-object columns of X), in which case those columns are emitted twice —
# once power-transformed and once standard-scaled. Confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, oh_columns),
        ("Ordinal_Encoder", ordinal_encoder, or_columns),
        ("Transformer", transform_pipe, transform_columns),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [78]:
# Fit the preprocessor on the full feature frame and replace X with the transformed matrix.
# NOTE(review): the fit happens before the train/test split further down, so the
# fitted statistics are computed on rows that later end up in the test set.
X = preprocessor.fit_transform(X)

In [None]:
X

# Classification

In [80]:

from imblearn.combine import SMOTETomek, SMOTEENN

# Resampler restricted to the minority class; the strategy can be changed as required.
# NOTE(review): resampling is applied to the whole dataset, before the train/test split below.
smt = SMOTEENN(sampling_strategy='minority', random_state=42)
# Generate the resampled feature matrix and labels
X_res, y_res = smt.fit_resample(X, y)

## Train Test Split
- The train-test split procedure is used to estimate the performance of machine learning algorithms when they are used to make predictions on data not used to train the model.

- It is a fast and easy procedure to perform, the results of which allow you to compare the performance of machine learning algorithms.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled data for testing (fixed seed for a repeatable split)
split_parts = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
X_train.shape, X_test.shape

In [82]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve 
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [83]:
def evaluate_clf(true, predicted):
    """Score a set of predictions against the ground truth.

    Args:
        true: Ground-truth labels.
        predicted: Predicted labels for the same samples.

    Returns:
        Tuple ``(accuracy, f1, precision, recall, roc_auc)``, each computed by
        the corresponding scikit-learn metric called as ``metric(true, predicted)``.
    """
    # Order matters: callers unpack the tuple positionally
    scorers = (accuracy_score, f1_score, precision_score, recall_score, roc_auc_score)
    return tuple(scorer(true, predicted) for scorer in scorers)

In [84]:
# Baseline classifiers keyed by display name. All use constructor defaults,
# except CatBoost, which is created with verbose=False.
# Insertion order is the order in which the models are trained and logged.
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "Support Vector Classifier": SVC(),
    "AdaBoost Classifier": AdaBoostClassifier()

}

In [85]:
# Create a function which can evaluate models and return a report 
def evaluate_models(X, y, models):
    '''
    Train every model in `models` on an 80/20 split of (X, y) and log its metrics to MLflow.

    Runs are recorded in the "baseline model 2" experiment: one parent run plus
    one nested run per model. Each nested run carries train and test accuracy,
    F1, precision, recall and ROC-AUC as returned by `evaluate_clf`.

    Args:
        X: Feature matrix accepted by `train_test_split` and by each model's `fit`.
        y: Target labels aligned with X.
        models: Dictionary of display name -> estimator. The estimators are
            fitted in place.

    Returns:
        DataFrame with one row per model and the columns 'Model Name',
        'Accuracy' and 'AUC' (both measured on the test split), sorted by
        accuracy in descending order.
    '''
    # separate dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Metric-name suffixes, in the order of the tuple returned by evaluate_clf
    metric_names = ('accuracy', 'f1', 'precision', 'recall', 'rocauc_score')
    results = []

    mlflow.set_experiment("baseline model 2")
    with mlflow.start_run(run_name="all_models_evaluation") as parent_run:
        print("Parent Run ID:", parent_run.info.run_id)
        for model_name, model in models.items():
            with mlflow.start_run(run_name=model_name, nested=True):
                model.fit(X_train, y_train)

                # Score and log both splits under 'model_<split>_<metric>'
                scores = {}
                for split, features, labels in (('train', X_train, y_train), ('test', X_test, y_test)):
                    scores[split] = dict(zip(metric_names, evaluate_clf(labels, model.predict(features))))
                    for metric, value in scores[split].items():
                        mlflow.log_metric(f'model_{split}_{metric}', value)

                # Keep the test AUC alongside accuracy so it reaches the report
                # (it used to be collected and then dropped)
                results.append((model_name, scores['test']['accuracy'], scores['test']['rocauc_score']))

    report = pd.DataFrame(
        results, columns=['Model Name', 'Accuracy', 'AUC']
    ).sort_values(by=['Accuracy'], ascending=False)
    return report

## Model Training 

In [None]:
base_model_report =evaluate_models(X=X_res, y=y_res, models=models)

**Results of All Models**

In [None]:
base_model_report

**Here we can use Random Forest for Hyper Parameter Tuning**

**Define the parameter grids for XGBoost, Random Forest and KNN**

In [88]:
# Candidate hyperparameter values for each model that is tuned below.
# Each dictionary maps a constructor argument name to the values to try.

# XGBoost: tree depth 3, 5, 7, 9 and minimum child weight 1, 3, 5
xgboost_params = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}

# Random forest: tree depth (None is one of the candidates), features per split, number of trees
rf_params = {
    "max_depth": [10, 12, None, 15, 20],
    "max_features": ['sqrt', 'log2', None],
    "n_estimators": [10, 50, 100, 200]
}

# K-nearest neighbours: search algorithm, vote weighting, neighbourhood size
knn_params = {
    "algorithm": ['auto', 'ball_tree', 'kd_tree','brute'],
    "weights": ['uniform', 'distance'],
    "n_neighbors": [3, 4, 5, 7, 9],
}

In [89]:
# Models list for Hyperparameter tuning
randomcv_models = [
    ('XGBoost', XGBClassifier(), xgboost_params),
    ("RF", RandomForestClassifier(), rf_params),
    ("KNN", KNeighborsClassifier(), knn_params)
]

**Run the grid search for each model and log every parameter combination as a nested MLflow run**

In [None]:
from sklearn.model_selection import GridSearchCV

mlflow.set_experiment('Hyperparameter Tuning - Notebook')
# model name -> best parameter set found by the grid search
model_param = {}

with mlflow.start_run(run_name="All Models Tuning") as parent_run:
    for name, model_instance, param_grid in randomcv_models:
        print(f"\n🔍 Running GridSearchCV for: {name}")

        # Exhaustive search over the grid, 3-fold CV, optimising F1
        grid_search = GridSearchCV(
            estimator=model_instance,
            param_grid=param_grid,
            cv=3,
            verbose=2,
            scoring='f1',
            n_jobs=-1
        )

        grid_search.fit(X_res, y_res)

        # Log each combination as a nested run
        for param_set, mean_score, std_score in zip(
            grid_search.cv_results_['params'],
            grid_search.cv_results_['mean_test_score'],
            grid_search.cv_results_['std_test_score']
        ):
            with mlflow.start_run(run_name=f"{name} | {param_set}", nested=True):
                # Record the combination itself and its cross-validated F1.
                # Both used to be available only via the run name / not at all.
                mlflow.log_params({k: "None" if v is None else v for k, v in param_set.items()})
                mlflow.log_metric('cv_mean_f1', mean_score)
                mlflow.log_metric('cv_std_f1', std_score)

                # Refit a fresh estimator of the same class on the global train split
                model = model_instance.__class__(**param_set)
                model.fit(X_train, y_train)
                y_train_pred = model.predict(X_train)
                y_test_pred = model.predict(X_test)

                # Train set performance
                model_train_accuracy, model_train_f1, model_train_precision, \
                model_train_recall, model_train_rocauc_score = evaluate_clf(y_train, y_train_pred)
                mlflow.log_metric('model_train_accuracy', model_train_accuracy)
                mlflow.log_metric('model_train_f1', model_train_f1)
                mlflow.log_metric('model_train_precision', model_train_precision)
                mlflow.log_metric('model_train_recall', model_train_recall)
                mlflow.log_metric('model_train_rocauc_score', model_train_rocauc_score)

                # Test set performance
                model_test_accuracy, model_test_f1, model_test_precision, \
                model_test_recall, model_test_rocauc_score = evaluate_clf(y_test, y_test_pred)
                mlflow.log_metric('model_test_accuracy', model_test_accuracy)
                mlflow.log_metric('model_test_f1', model_test_f1)
                mlflow.log_metric('model_test_precision', model_test_precision)
                mlflow.log_metric('model_test_recall', model_test_recall)
                mlflow.log_metric('model_test_rocauc_score', model_test_rocauc_score)

        # Log best model and parameters (on the parent run, prefixed with the model name)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_f1 = grid_search.best_score_
        model_param[name] = best_params

        mlflow.log_params({f"{name}_best_" + k: v for k, v in best_params.items()})
        mlflow.log_metric(f"{name}_best_f1", best_f1)
        #mlflow.sklearn.log_model(best_model, artifact_path=f"{name}_best_model")

        print(f"🏆 Best {name}: Params = {best_params}, F1 = {best_f1:.4f}")

# Print best parameters
for model_name in model_param:
    print(f"\n---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])


In [None]:
model_param

## Retraining the Model with best Parameters

In [104]:
def evaluate_models2(X, y, models, model_param):
    '''
    Fit and score already-configured models, logging parameters and metrics to MLflow.

    Args:
        X: Feature data, split 80/20 into train and test inside the function.
        y: Target labels aligned with X.
        models (dict): Dictionary of model_name: model_object.
        model_param (dict): Dictionary of model_name: hyperparameter dict. It is
            used only for logging; the models are fitted as passed in.

    Returns:
        pd.DataFrame: Test accuracy and AUC per model, sorted by accuracy (descending).
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Metric-name suffixes, in the order of the tuple returned by evaluate_clf
    metric_suffixes = ('accuracy', 'f1', 'precision', 'recall', 'rocauc_score')
    rows = []

    mlflow.set_experiment("Hyper parameter tuning")

    with mlflow.start_run(run_name="all_models_evaluation") as parent_run:
        print("Parent Run ID:", parent_run.info.run_id)

        for model_name, model in models.items():
            # Prefix each hyperparameter with the model name; None is logged as the string "None"
            loggable_params = {}
            for key, value in model_param.get(model_name, {}).items():
                loggable_params[f"{model_name}_{key}"] = "None" if value is None else value

            with mlflow.start_run(run_name=model_name, nested=True):
                # Log hyperparameters
                mlflow.log_params(loggable_params)

                # Fit model
                model.fit(X_train, y_train)

                # Predictions for both splits
                predictions = (
                    ('train', y_train, model.predict(X_train)),
                    ('test', y_test, model.predict(X_test)),
                )

                # Score each split and log every metric as 'model_<split>_<metric>'
                split_scores = {}
                for split_name, truth, predicted in predictions:
                    split_scores[split_name] = dict(zip(metric_suffixes, evaluate_clf(truth, predicted)))
                    for suffix, value in split_scores[split_name].items():
                        mlflow.log_metric(f'model_{split_name}_{suffix}', value)

                # Store results
                rows.append((
                    model_name,
                    split_scores['test']['accuracy'],
                    split_scores['test']['rocauc_score'],
                ))

    report = pd.DataFrame(rows, columns=['Model Name', 'Accuracy', 'AUC'])
    return report.sort_values(by='Accuracy', ascending=False)


In [None]:
from sklearn.metrics import roc_auc_score,roc_curve

# Fresh estimators configured with the best parameters stored in `model_param`
best_models = {
    "RF": RandomForestClassifier(**model_param['RF']),
    "KNN": KNeighborsClassifier(**model_param['KNN']),
    "XGBoost": XGBClassifier(n_jobs=-1, **model_param['XGBoost']),
}

# Re-evaluate the tuned models on the resampled data and keep the resulting report
tuned_report = evaluate_models2(
    X=X_res,
    y=y_res,
    models=best_models,
    model_param=model_param,
)

In [None]:
tuned_report

In [None]:
# Final model: KNN with its tuned parameters, fitted on the global train split
best_model = KNeighborsClassifier(**model_param['KNN']).fit(X_train, y_train)

# Score it on the held-out split
y_pred = best_model.predict(X_test)
score = accuracy_score(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print("FINAL MODEL 'KNN'")
print("Accuracy Score value: {:.4f}".format(score))
print(cr)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test)

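## Best Model is K-Nearest Neighbors (KNN) with Accuracy 96.83%

* Note: this accuracy is measured on a hold-out split of the SMOTEENN-resampled data, not on data with the original class distribution.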