In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from io import StringIO

In [2]:
# Read the visa dataset CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="Visadataset_numerical.csv")

In [3]:
# Treat negative employee counts as missing: every row whose 'no of employees'
# is below zero has that value overwritten with pd.NA, in place on `df`.
# NOTE(review): the column's dtype after this assignment depends on how the
# installed pandas version coerces pd.NA into the existing column — confirm it
# is still numeric before computing statistics on it.
df.loc[df['no of employees'] < 0, 'no of employees'] = pd.NA

In [4]:
# Replace missing employee counts with the column median (computed over the
# whole frame, i.e. before any train/test split).
df['no of employees'] = df['no of employees'].pipe(
    lambda counts: counts.fillna(counts.median())
)

In [5]:
# Separate the label column from the predictor columns.
y = df['case status']
X = df.drop(columns=['case status'])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Sanity check: print the shape of each split, one per line.
item = [X_train, X_test, y_train, y_test]
print(*(part.shape for part in item), sep="\n")

(20042, 10)
(5011, 10)
(20042,)
(5011,)


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


from io import StringIO
import warnings
# Install a global "ignore" filter so no Python warnings are shown from here on.
# NOTE(review): this is a blanket filter — it would also hide any convergence or
# undefined-metric warnings raised while fitting/scoring the models below;
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


In [21]:
# Candidate classifiers, keyed by the display name used in the results table.
# A fixed seed is passed to every estimator constructed with `random_state`.
# NOTE(review): the features reach these estimators without any scaling step in
# this notebook — worth confirming that is intended for Logistic Regression,
# SVM and KNN.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42, n_jobs=-1)),
    ('SVM', SVC(probability=True, random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42)),
])

In [22]:
# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    The estimator is fitted in place, so the object passed in is the trained
    model once this function returns.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; ``predict_proba`` or
        ``decision_function`` is used for ROC-AUC when available.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'F1-Score'`` and
        ``'ROC-AUC'``. ``'ROC-AUC'`` is NaN when the estimator offers no
        continuous score, rather than a literal 0 that would be
        indistinguishable from a genuinely bad ranking.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Pick a continuous score for ROC-AUC. Probabilities are preferred; the
    # second column is taken as the positive-class score (assumes a binary
    # target — TODO confirm). Margin-based estimators without predict_proba
    # still rank samples through decision_function.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1-Score': f1_score(y_test, y_pred, zero_division=0),
        'ROC-AUC': roc_auc_score(y_test, y_score) if y_score is not None else float('nan'),
    }

In [23]:
# Fit every candidate in turn and store its test-set metrics under its display name.
results = {}
for model_name, estimator in models.items():
    print(f"Training {model_name}...")
    results.update(
        {model_name: evaluate_model(estimator, X_train, X_test, y_train, y_test)}
    )

Training Logistic Regression...
Training Decision Tree...
Training Random Forest...
Training SVM...
Collecting tensorflow
  Downloading tensorflow-2.11.0-cp37-cp37m-win_amd64.whl (1.9 kB)
Collecting tensorflow-intel==2.11.0
  Downloading tensorflow_intel-2.11.0-cp37-cp37m-win_amd64.whl (266.3 MB)
     -------------------------------------- 266.3/266.3 MB 2.4 MB/s eta 0:00:00
Collecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
     ---------------------------------------- 65.5/65.5 kB 1.8 MB/s eta 0:00:00
Collecting grpcio<2.0,>=1.24.3
  Downloading grpcio-1.62.3-cp37-cp37m-win_amd64.whl (4.5 MB)
     ---------------------------------------- 4.5/4.5 MB 4.6 MB/s eta 0:00:00
Collecting keras<2.12,>=2.11.0
  Downloading keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
     ---------------------------------------- 1.7/1.7 MB 5.4 MB/s eta 0:00:00
Collecting h5py>=2.9.0
  Downloading h5py-3.8.0-cp37-cp37m-win_amd64.whl (2.6 MB)
     ---------------------------------

In [24]:
# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
print("\nModel Performance Summary:", results_df, sep="\n")


Model Performance Summary:
                     Accuracy  Precision    Recall  F1-Score   ROC-AUC
Logistic Regression  0.656156   0.000000  0.000000  0.000000  0.540698
Decision Tree        0.655159   0.498530  0.492165  0.495327  0.616368
Random Forest        0.725404   0.630158  0.487522  0.549738  0.757486
SVM                  0.656156   0.000000  0.000000  0.000000  0.567954
KNN                  0.624027   0.424696  0.263494  0.325215  0.542660
XGBoost              0.743165   0.667951  0.503192  0.573982  0.774857
LightGBM             0.750549   0.678222  0.522345  0.590164  0.785834


In [28]:
# Rank the models on accuracy alone. Note that `results_df` is rebuilt here with
# just the 'Accuracy' column, and `best_model` is the winning row label (the
# model's display name), not the estimator object.
results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Accuracy'])
best_model = results_df['Accuracy'].idxmax()
best_accuracy = results_df.loc[best_model, 'Accuracy']

# Report the winner
print(
    "\nHighest Accuracy Model:",
    f"Model: {best_model}",
    f"Accuracy: {best_accuracy:.4f}",
    sep="\n",
)


Highest Accuracy Model:
Model: LightGBM
Accuracy: 0.7505


In [30]:
import pickle

In [31]:
# `best_model` is the winning model's display name (the idxmax row label), not
# an estimator, so dumping it directly would only persist a string. Look up the
# fitted estimator in `models` (evaluate_model fits those objects in place) and
# serialise that instead. The context manager guarantees the file is flushed
# and closed even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(models[best_model], model_file)