In [4]:
from cleaning import data_new, X_train, X_test, y_train, y_test,numeric_columns,X,y,np,pd

In [5]:
# Import necessary libraries and modules
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.decomposition import PCA
from nb import NaiveBayesClassifier
import matplotlib.pyplot as plt
# Define degree for polynomial features (adjust as needed)
degree = 2
poly = PolynomialFeatures(degree=degree)

# Define imputation and transformation strategies
numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = SimpleImputer(strategy='most_frequent')
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Create ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Pipeline for Gaussian Naive Bayes without PCA
pipeline_without_pca = Pipeline([
    ('preprocessor', preprocessor),
    ('clf',NaiveBayesClassifier())
])

# Fit the pipeline to the training data
pipeline_without_pca.fit(X_train, y_train)

# Use the pipeline to predict on the test data
y_pred_without_pca = pipeline_without_pca.predict(X_test)
print('Naive Bayes : ')
# Calculate accuracy and print classification report for without PCA
accuracy_without_pca = accuracy_score(y_test, y_pred_without_pca)
print("Accuracy without PCA:", accuracy_without_pca)


# Steps for Gaussian Naive Bayes with PCA
# Step 1: Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Step 2: Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Step 3: Apply PCA
pca = PCA(n_components=0.93)  # Adjust the variance as needed
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# plt.figure(figsize=(4,3))
# plt.plot(range(1, len(pca.explained_variance_) + 1), pca.explained_variance_, marker='o')
# plt.title('Scree Plot')
# plt.xlabel('Number of Principal Components')
# plt.ylabel('Explained Variance')
# plt.show()

# # Explained variance ratio plot
# plt.figure(figsize=(4,3))
# plt.bar(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_)
# plt.title('Explained Variance Ratio')
# plt.xlabel('Principal Components')
# plt.ylabel('Variance Ratio')
# plt.show()

# # Scatter plot of first two principal components
# plt.figure(figsize=(4,3))
# plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=y_train, cmap='viridis')
# plt.title('Scatter Plot of Principal Components')
# plt.xlabel('PC1')
# plt.ylabel('PC2')
# plt.colorbar(label='Species')
# plt.show()
# Step 4: Train and evaluate Gaussian Naive Bayes with PCA
gnb_pca = GaussianNB()
gnb_pca.fit(X_train_pca, y_train)
y_pred_pca = gnb_pca.predict(X_test_pca)

# Calculate accuracy and print classification report for with PCA
accuracy_with_pca = accuracy_score(y_test, y_pred_pca)
print("Accuracy with PCA:", accuracy_with_pca)



Naive Bayes : 
Accuracy without PCA: 0.4090909090909091
Accuracy with PCA: 0.5619834710743802


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Define the column groups for pre-processing
numeric_features = X_train.select_dtypes(include=np.number).columns
categorical_features = X_train.select_dtypes(include='object').columns

# Define transformers for numeric and categorical data
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create ColumnTransformer with both transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Define a pipeline with the preprocessor, PCA, and classifier
pipeline_pca = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=100)),
    ('clf', RandomForestClassifier(random_state=42))
])

# Train the pipeline with PCA on the training data
pipeline_pca.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_pca = pipeline_pca.predict(X_test)

# Calculate accuracy and print classification report
accuracy_pca = accuracy_score(y_test, y_pred_pca)

print("Random Forest")
print("Accuracy with PCA:", accuracy_pca)

# Define a second pipeline without PCA for comparison
pipeline_original = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# Train the pipeline without PCA on the training data
pipeline_original.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_original = pipeline_original.predict(X_test)

# Calculate accuracy and print classification report
accuracy_original = accuracy_score(y_test, y_pred_original)

print("Accuracy without PCA:", accuracy_original)

Random Forest
Accuracy with PCA: 0.743801652892562
Accuracy without PCA: 0.743801652892562


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Define a pipeline with the preprocessor, scaler, PCA, and Logistic Regression
pipeline_pca_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.80)),  # Adjust n_components as needed
    ('clf', LogisticRegression(C=1, solver='liblinear'))
])

# Train the pipeline with PCA on the training data
pipeline_pca_lr.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_pca_lr = pipeline_pca_lr.predict(X_test)

# Calculate accuracy and print classification report
accuracy_pca_lr = accuracy_score(y_test, y_pred_pca_lr)


print("Logistic Regression")
print("Accuracy with PCA:", accuracy_pca_lr)

best_C = 3.7460266483547775
# Define a second pipeline without PCA for comparison
pipeline_original_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=100))
])

# Train the pipeline without PCA on the training data
pipeline_original_lr.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_original_lr = pipeline_original_lr.predict(X_test)

# Calculate accuracy and print classification report
accuracy_original_lr = accuracy_score(y_test, y_pred_original_lr)

print("Accuracy without PCA:", accuracy_original_lr)



Logistic Regression
Accuracy with PCA: 0.5950413223140496
Accuracy without PCA: 0.5867768595041323


In [10]:
from sklearn.neighbors import KNeighborsClassifier
# Define a pipeline with the preprocessor, scaler, PCA, and K-Nearest Neighbors
pipeline_pca_knn = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=100)),  # Adjust n_components as needed
    ('clf', KNeighborsClassifier())
])

# Train the pipeline with PCA on the training data
pipeline_pca_knn.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_pca_knn = pipeline_pca_knn.predict(X_test)

# Calculate accuracy and print classification report
accuracy_pca_knn = accuracy_score(y_test, y_pred_pca_knn)


print("K-Nearest Neighbors")
print("Accuracy with PCA:", accuracy_pca_knn)


# Define a second pipeline without PCA for comparison
pipeline_original_knn = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('clf', KNeighborsClassifier())
])

# Train the pipeline without PCA on the training data
pipeline_original_knn.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_original_knn = pipeline_original_knn.predict(X_test)

# Calculate accuracy and print classification report
accuracy_original_knn = accuracy_score(y_test, y_pred_original_knn)
print("Accuracy without PCA:", accuracy_original_knn)



K-Nearest Neighbors
Accuracy with PCA: 0.6115702479338843
Accuracy without PCA: 0.6033057851239669
