In [1]:
from cleaning import data_new, X_train, X_test, y_train, y_test

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assuming X and y are your data
# X, y = ...

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define categorical and numerical columns
categorical_cols = X_train.select_dtypes(include='object').columns
numerical_cols = X_train.select_dtypes(include=np.number).columns

# Define preprocessing pipelines
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline([
    ('scaler', StandardScaler())
])

# Combine preprocessors using ColumnTransformer
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numerical_cols)
])

# Apply preprocessing
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Convert y_train and y_test to integers using LabelEncoder
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Define mutual information selector
k_best = SelectKBest(score_func=mutual_info_classif, k=10)  # Adjust k as needed

# Fit and transform the training data
X_train_selected = k_best.fit_transform(X_train_transformed, y_train)
X_test_selected = k_best.transform(X_test_transformed)

# Train KNN model
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_selected, y_train)

# Predict on test data
y_pred = knn_model.predict(X_test_selected)

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_test, y_pred)
print("KNN Model (Mutual Information Feature Selection)")
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))


KNN Model (Mutual Information Feature Selection)
Accuracy: 0.5702479338842975
              precision    recall  f1-score   support

           0       0.57      0.99      0.72       136
           1       0.75      0.03      0.05       106

    accuracy                           0.57       242
   macro avg       0.66      0.51      0.39       242
weighted avg       0.65      0.57      0.43       242

