In [3]:
from cleaning import data_new, X_train, X_test, y_train, y_test,X,y,pd

In [23]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import cross_validate
# Assuming X and y are your data
# Load your data (e.g., from a CSV file)
# data = pd.read_csv('your_data.csv')
# X = data.drop('target_column', axis=1)
# y = data['target_column']

# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the target variable (y) to integer labels using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)
unique_classes = np.unique(y)

# Calculate the number of classes
num_classes = len(unique_classes)

# Adjust the priors array
# If you have a single prior value, you may want to repeat it for each class, or specify different priors for each class.
priors = [0.3745401188473625] * num_classes
# Calculate the sum of priors
priors_sum = sum(priors)

# Normalize priors so that their sum equals 1
normalized_priors = [prior / priors_sum for prior in priors]

# Replace the original priors with the normalized priors
priors = normalized_priors
best_C = 3.7460266483547775
# Define numerical and categorical columns in the data
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include='object').columns

# Define pre-processing pipelines for numerical and categorical data
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Define the models for comparison
models = {
    'Naive Bayes': GaussianNB(priors=priors),
    'KNN': KNeighborsClassifier(n_neighbors=7, weights='uniform'),
    'Random Forest': RandomForestClassifier(n_estimators=250, max_depth=10),
    'Logistic Regression': LogisticRegression(C=best_C, solver='liblinear')
}

# Create pipelines for each model
pipelines = {
    name: Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    for name, model in models.items()
}

# Train and evaluate each model
for name, pipeline in pipelines.items():
 
        # Train the model
        pipeline.fit(X_train, y_train)

        # Predict on test data
        y_pred = pipeline.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Print accuracy and classification report
        print(f"\n{name} Model")
       # print(f"Accuracy: {accuracy:.4f}")
        #print(classification_report(y_test, y_pred))

        # Optional: Perform cross-validation and print mean accuracy
        cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
        print(f"Cross-validated mean accuracy: {cv_scores.mean():.4f}")



Naive Bayes Model
Cross-validated mean accuracy: 0.5303

KNN Model
Cross-validated mean accuracy: 0.4981

Random Forest Model
Cross-validated mean accuracy: 0.4186

Logistic Regression Model
Cross-validated mean accuracy: 0.5519
