In [10]:
from cleaning import data_new, X_train, X_test, y_train, y_test,numeric_columns

In [15]:
# Import necessary libraries and modules
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.decomposition import PCA

# Gaussian Naive Bayes with and without PCA.
#
# Both variants share the same imputation and standardisation steps so that
# the PCA step is the only difference between the two reported scores.
# (GaussianNB's variance smoothing is relative to the largest feature
# variance, so scaling one variant but not the other would skew the
# comparison.)

# Categorical columns are the object-dtype ones; every remaining column is
# treated as numeric. Taking the complement (rather than a fixed list of
# numeric dtypes) matters because ColumnTransformer drops unlisted columns by
# default, which would silently remove e.g. bool or int32 columns.
categorical_features = X_train.select_dtypes(include=['object']).columns
numeric_features = X_train.columns.difference(categorical_features)

# Imputation strategies: column mean for numeric data, mode for categorical.
# NOTE(review): categorical columns are imputed but not encoded, so any
# string-valued column would fail in the scaler — presumably `cleaning`
# already encodes them; confirm.
numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


def _fit_and_report_gnb(label, pipeline):
    """Fit ``pipeline`` on the training split and print its test metrics.

    Parameters
    ----------
    label : str
        Suffix used in the printed headings (e.g. ``"without PCA"``).
    pipeline : sklearn.pipeline.Pipeline
        Unfitted pipeline; it is fitted in place on ``X_train`` / ``y_train``.

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` computed on ``X_test`` / ``y_test``.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy {label}:", accuracy)
    print(f"Classification Report {label}:")
    print(classification_report(y_test, predictions))
    return predictions, accuracy


# Baseline: impute -> standardise -> Gaussian Naive Bayes.
pipeline_without_pca = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('clf', GaussianNB())
])
y_pred_without_pca, accuracy_without_pca = _fit_and_report_gnb(
    "without PCA", pipeline_without_pca
)

# PCA variant: identical steps plus a PCA that keeps enough components to
# explain 93% of the variance (adjust as needed). The step objects are bound
# to names first so the fitted scaler / PCA / classifier stay inspectable
# after the pipeline has been fitted (Pipeline fits its steps in place).
scaler = StandardScaler()
pca = PCA(n_components=0.93)
gnb_pca = GaussianNB()
pipeline_with_pca = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', scaler),
    ('pca', pca),
    ('clf', gnb_pca)
])
y_pred_pca, accuracy_with_pca = _fit_and_report_gnb(
    "with PCA", pipeline_with_pca
)




Accuracy without PCA: 0.4380165289256198
Classification Report without PCA:
              precision    recall  f1-score   support

           0       0.50      0.26      0.34       136
           1       0.41      0.67      0.51       106

    accuracy                           0.44       242
   macro avg       0.46      0.46      0.43       242
weighted avg       0.46      0.44      0.41       242

Accuracy with PCA: 0.5619834710743802
Classification Report with PCA:
              precision    recall  f1-score   support

           0       0.60      0.66      0.63       136
           1       0.50      0.43      0.46       106

    accuracy                           0.56       242
   macro avg       0.55      0.55      0.55       242
weighted avg       0.56      0.56      0.56       242



In [12]:
# Import necessary modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

# Random Forest with and without PCA.

# One seed for every stochastic estimator in this cell. Without it each run
# grows different forests (and PCA may use a randomized solver), so the
# reported accuracies change from run to run and small gaps between the two
# variants cannot be interpreted.
RANDOM_STATE = 42

# Column split: the listed categorical column(s) vs. every other column.
# NOTE(review): the categorical column is only imputed, not encoded, and is
# later passed to StandardScaler — presumably it already holds numeric codes;
# confirm against the `cleaning` module.
categorical_features = ['civilityTitle']  # Update with your actual column names
numeric_features = X_train.columns.difference(categorical_features)

numeric_transformer = SimpleImputer(strategy='mean')  # mean imputation for numeric data
categorical_transformer = SimpleImputer(strategy='most_frequent')  # mode for categorical data

# Create ColumnTransformer with both transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


def _fit_and_score_forest(pipeline):
    """Fit ``pipeline`` on the training split and score it on the test split.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        Unfitted pipeline; it is fitted in place on ``X_train`` / ``y_train``.

    Returns
    -------
    tuple
        ``(predictions, accuracy, report)`` where ``report`` is the text
        produced by ``classification_report`` for ``X_test`` / ``y_test``.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    return predictions, accuracy, report


# Variant 1: impute -> standardise -> PCA (100 components) -> Random Forest.
pipeline_pca = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=100, random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE))
])
y_pred_pca, accuracy_pca, report_pca = _fit_and_score_forest(pipeline_pca)

print("\nRandom Forest with PCA:")
print("Accuracy:", accuracy_pca)
print(report_pca)

# Variant 2: impute -> Random Forest on the original features, for comparison.
pipeline_original = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE))
])
y_pred_original, accuracy_original, report_original = _fit_and_score_forest(
    pipeline_original
)

print("\nRandom Forest without PCA:")
print("Accuracy:", accuracy_original)
print(report_original)





Random Forest with PCA:
Accuracy: 0.7355371900826446
              precision    recall  f1-score   support

           0       0.75      0.80      0.77       136
           1       0.72      0.65      0.68       106

    accuracy                           0.74       242
   macro avg       0.73      0.73      0.73       242
weighted avg       0.73      0.74      0.73       242


Random Forest without PCA:
Accuracy: 0.7231404958677686
              precision    recall  f1-score   support

           0       0.74      0.78      0.76       136
           1       0.70      0.65      0.67       106

    accuracy                           0.72       242
   macro avg       0.72      0.72      0.72       242
weighted avg       0.72      0.72      0.72       242





In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Logistic Regression: PCA-reduced features vs. the full feature set.
# `preprocessor` is not built here; it is the ColumnTransformer left in the
# namespace by an earlier cell.

# --- Variant A: impute -> standardise -> PCA -> Logistic Regression ---------
# A float n_components asks PCA for as many components as are needed to
# explain that share of the variance (adjust as needed).
lr_pca_steps = [
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.80)),
    ('clf', LogisticRegression(max_iter=1000)),
]
pipeline_pca_lr = Pipeline(steps=lr_pca_steps)

# Pipeline.fit returns the fitted pipeline, so training and prediction chain.
y_pred_pca_lr = pipeline_pca_lr.fit(X_train, y_train).predict(X_test)

report_pca_lr = classification_report(y_test, y_pred_pca_lr)
accuracy_pca_lr = accuracy_score(y_test, y_pred_pca_lr)

print("\nLogistic Regression with PCA:")
print("Accuracy:", accuracy_pca_lr)
print(report_pca_lr)

# --- Variant B: same steps minus PCA, as the comparison baseline ------------
lr_plain_steps = [
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000)),
]
pipeline_original_lr = Pipeline(steps=lr_plain_steps)

y_pred_original_lr = pipeline_original_lr.fit(X_train, y_train).predict(X_test)

report_original_lr = classification_report(y_test, y_pred_original_lr)
accuracy_original_lr = accuracy_score(y_test, y_pred_original_lr)

print("\nLogistic Regression without PCA:")
print("Accuracy:", accuracy_original_lr)
print(report_original_lr)





Logistic Regression with PCA:
Accuracy: 0.5950413223140496
              precision    recall  f1-score   support

           0       0.61      0.75      0.68       136
           1       0.55      0.40      0.46       106

    accuracy                           0.60       242
   macro avg       0.58      0.57      0.57       242
weighted avg       0.59      0.60      0.58       242


Logistic Regression without PCA:
Accuracy: 0.5867768595041323
              precision    recall  f1-score   support

           0       0.61      0.73      0.66       136
           1       0.54      0.41      0.46       106

    accuracy                           0.59       242
   macro avg       0.57      0.57      0.56       242
weighted avg       0.58      0.59      0.58       242





In [14]:
from sklearn.neighbors import KNeighborsClassifier
# K-Nearest Neighbors: PCA-reduced features vs. the full feature set.
# `preprocessor` is the ColumnTransformer left in the namespace by an earlier
# cell; KNeighborsClassifier is used with its default settings.
#
# NOTE(review): the outputs recorded for the two variants below are identical.
# That would be expected if 100 principal components retain the whole feature
# space — confirm the feature count and lower n_components if a real
# reduction is intended.


def _score_knn(knn_pipeline):
    """Fit ``knn_pipeline`` on the training split and score it on the test split.

    Returns ``(predictions, accuracy, report)`` for ``X_test`` / ``y_test``.
    """
    knn_pipeline.fit(X_train, y_train)
    predicted = knn_pipeline.predict(X_test)
    return (
        predicted,
        accuracy_score(y_test, predicted),
        classification_report(y_test, predicted),
    )


# Variant A: impute -> standardise -> PCA (100 components) -> KNN.
pipeline_pca_knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=100)),  # Adjust n_components as needed
    ('clf', KNeighborsClassifier()),
])
y_pred_pca_knn, accuracy_pca_knn, report_pca_knn = _score_knn(pipeline_pca_knn)

print("\nK-Nearest Neighbors with PCA:")
print("Accuracy:", accuracy_pca_knn)
print(report_pca_knn)

# Variant B: the same steps without PCA, for comparison.
pipeline_original_knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('clf', KNeighborsClassifier()),
])
y_pred_original_knn, accuracy_original_knn, report_original_knn = _score_knn(
    pipeline_original_knn
)

print("\nK-Nearest Neighbors without PCA:")
print("Accuracy:", accuracy_original_knn)
print(report_original_knn)





K-Nearest Neighbors with PCA:
Accuracy: 0.6033057851239669
              precision    recall  f1-score   support

           0       0.65      0.63      0.64       136
           1       0.55      0.57      0.56       106

    accuracy                           0.60       242
   macro avg       0.60      0.60      0.60       242
weighted avg       0.61      0.60      0.60       242


K-Nearest Neighbors without PCA:
Accuracy: 0.6033057851239669
              precision    recall  f1-score   support

           0       0.65      0.63      0.64       136
           1       0.55      0.57      0.56       106

    accuracy                           0.60       242
   macro avg       0.60      0.60      0.60       242
weighted avg       0.61      0.60      0.60       242



