In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from io import StringIO

In [5]:
df = pd.read_csv("Visadataset_numerical.csv")

In [6]:
df.loc[df['no of employees'] < 0, 'no of employees'] = pd.NA

In [7]:
df['no of employees'] = df['no of employees'].fillna(df['no of employees'].median())

In [8]:
X = df.drop('case status', axis=1)
y = df['case status']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
item = [X_train, X_test, y_train, y_test]
for i in item:
    print(i.shape) 

(20042, 10)
(5011, 10)
(20042,)
(5011,)


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


from io import StringIO
import warnings
warnings.filterwarnings('ignore')


In [12]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
}

In [13]:
# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else np.zeros_like(y_pred)
    
    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1-Score': f1_score(y_test, y_pred, zero_division=0),
        'ROC-AUC': roc_auc_score(y_test, y_prob) if y_prob.sum() > 0 else 0
    }

In [14]:
# Train and evaluate ML models
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    results[name] = evaluate_model(model, X_train, X_test, y_train, y_test)

Training Logistic Regression...
Training Decision Tree...
Training Random Forest...
Training SVM...
Training KNN...
Training XGBoost...
Training LightGBM...
[LightGBM] [Info] Number of positive: 6641, number of negative: 13401
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001559 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 20042, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.331354 -> initscore=-0.702067
[LightGBM] [Info] Start training from score -0.702067


In [15]:
results_df = pd.DataFrame(results).T
print("\nModel Performance Summary:")
print(results_df)


Model Performance Summary:
                     Accuracy  Precision    Recall  F1-Score   ROC-AUC
Logistic Regression  0.656156   0.000000  0.000000  0.000000  0.540698
Decision Tree        0.655159   0.498530  0.492165  0.495327  0.616368
Random Forest        0.725404   0.630158  0.487522  0.549738  0.757486
SVM                  0.656156   0.000000  0.000000  0.000000  0.567954
KNN                  0.624027   0.424696  0.263494  0.325215  0.542660
XGBoost              0.743165   0.667951  0.503192  0.573982  0.774857
LightGBM             0.750549   0.678222  0.522345  0.590164  0.785834


In [16]:
# Find the model with the highest accuracy
results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Accuracy'])
best_model = results_df['Accuracy'].idxmax()
best_accuracy = results_df['Accuracy'].max()

# Output the best model
print("\nHighest Accuracy Model:")
print(f"Model: {best_model}")
print(f"Accuracy: {best_accuracy:.4f}")


Highest Accuracy Model:
Model: LightGBM
Accuracy: 0.7505


In [24]:
from lightgbm import LGBMClassifier

In [33]:
mod = LGBMClassifier(random_state=42)

In [34]:
mod.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 6641, number of negative: 13401
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000576 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 20042, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.331354 -> initscore=-0.702067
[LightGBM] [Info] Start training from score -0.702067


LGBMClassifier(random_state=42)

In [35]:
mod.score(X_test,y_test)

0.7505487926561565

In [36]:
pickle.dump(mod,open('mod.pkl','wb'))

In [37]:
loaded_model = pickle.load(open('mod.pkl', 'rb'))

In [38]:
# Predict
prediction = loaded_model.predict(sample)
print("Prediction:", prediction)

Prediction: [1]


In [39]:
import pickle

In [18]:
pickle.dump(best_model,open('model.pkl','wb'))

In [19]:
# Load the saved model
loaded_model = pickle.load(open('model.pkl', 'rb'))


In [22]:
sample = [[1,	0,	0,	0,	98,	1897,	4,	83434.0300,	3,	1]] 

In [40]:
from lightgbm import LGBMClassifier

# Train the model
mod = LGBMClassifier()
mod.fit(X_train, y_train)

# Evaluate on test data
accuracy = mod.score(X_test, y_test)  # Returns accuracy
print("Accuracy:", accuracy)


[LightGBM] [Info] Number of positive: 6641, number of negative: 13401
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 20042, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.331354 -> initscore=-0.702067
[LightGBM] [Info] Start training from score -0.702067
Accuracy: 0.7505487926561565
