In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib

def dense_transform(X):
    """Return a dense version of *X* when it can produce one.

    Objects exposing a ``toarray`` method (as sparse matrices do) are
    converted by calling it; anything else is handed back unchanged.
    """
    if not hasattr(X, 'toarray'):
        return X
    return X.toarray()

# Absolute, machine-specific location of the CSV that is loaded below.
data_path = r"D:\SEMESTER_4\deploy_all\v6.csv"
data = pd.read_csv(data_path)

# Feature columns routed to the numeric preprocessing branch.
numerical_cols = [
    'Value',
    'Population',
    'hci_Rank',
    'hci_index',
    'hdi_index',
    'migration_country_population',
]
# Feature columns routed to the categorical preprocessing branch.
categorical_cols = [
    'Country_code',
    'nationality_country',
    'Variable',
    'Year_x',
    'migration_country_population_bins',
]

# Numeric branch: fill missing values with the column median, then
# standardize each feature.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories that were not seen
# while fitting are ignored at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Apply each branch to its own list of columns.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Wraps dense_transform as a pipeline step; sparse input is accepted so the
# function itself can decide whether a conversion is needed.
dense_transformer = FunctionTransformer(func=dense_transform, accept_sparse=True)

# End-to-end model: column preprocessing -> dense conversion -> Gaussian
# Naive Bayes classifier, fitted and evaluated as a single estimator.
naive_bayes_model = Pipeline([
    ('preprocessor', preprocessor),
    ('to_dense', dense_transformer),
    ('classifier', GaussianNB())
])
# NOTE(review): X_train / y_train / X_test / y_test are not defined anywhere
# in the visible code - presumably produced by a train/test split in an
# earlier notebook cell; confirm before running this cell on its own.
naive_bayes_model.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
naive_bayes_preds = naive_bayes_model.predict(X_test)
print('Naive Bayes Accuracy:', accuracy_score(y_test, naive_bayes_preds))

# Persist the fitted pipeline and the label encoder to relative paths.
# NOTE(review): pickle stores functions by reference, so whatever loads
# 'naive_bayes_model.pkl' must be able to resolve dense_transform under the
# same module name it had here - verify in the serving code.
joblib.dump(naive_bayes_model, 'naive_bayes_model.pkl')
# NOTE(review): label_encoder is not defined in the visible code either -
# presumably a LabelEncoder fitted on the target in an earlier cell; confirm.
joblib.dump(label_encoder, 'label_encoder.pkl')
print("Model and Label Encoder have been serialized into pickle files.")


Naive Bayes Accuracy: 0.9041258031788976
Model and Label Encoder have been serialized into pickle files.
