In [1]:
from cleaning import data_new, X_train, X_test, y_train, y_test

In [3]:
# Import necessary modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

# Handle missing values and categorical data with ColumnTransformer
numeric_features = X_train.columns.difference(['civilityTitle'])  # Update with your actual column names
categorical_features = ['civilityTitle']  # Update with your actual column names

numeric_transformer = SimpleImputer(strategy='mean')  # Use mean imputation for numeric data
categorical_transformer = SimpleImputer(strategy='most_frequent')  # Use most frequent for categorical data

# Create ColumnTransformer with both transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Define a pipeline with the preprocessor and PCA
pipeline_pca = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=100)),
    ('clf', RandomForestClassifier(n_estimators=100))
])

# Train the pipeline with PCA on the training data
pipeline_pca.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_pca = pipeline_pca.predict(X_test)

# Calculate accuracy and print classification report
accuracy_pca = accuracy_score(y_test, y_pred_pca)
report_pca = classification_report(y_test, y_pred_pca)

print("\nRandom Forest with PCA:")
print("Accuracy:", accuracy_pca)
print(report_pca)

# Define a second pipeline without PCA for comparison
pipeline_original = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=100))
])

# Train the pipeline without PCA on the training data
pipeline_original.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_original = pipeline_original.predict(X_test)

# Calculate accuracy and print classification report
accuracy_original = accuracy_score(y_test, y_pred_original)
report_original = classification_report(y_test, y_pred_original)

print("\nRandom Forest without PCA:")
print("Accuracy:", accuracy_original)
print(report_original)





Random Forest with PCA:
Accuracy: 0.71900826446281
              precision    recall  f1-score   support

           0       0.72      0.82      0.77       136
           1       0.72      0.59      0.65       106

    accuracy                           0.72       242
   macro avg       0.72      0.71      0.71       242
weighted avg       0.72      0.72      0.71       242


Random Forest without PCA:
Accuracy: 0.7272727272727273
              precision    recall  f1-score   support

           0       0.74      0.79      0.76       136
           1       0.70      0.65      0.68       106

    accuracy                           0.73       242
   macro avg       0.72      0.72      0.72       242
weighted avg       0.73      0.73      0.73       242





In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

# Define parameters for Random Forest
n_estimators = 100  # Number of trees in the forest
max_depth = 10  # Maximum depth of each tree (you can experiment with this value)
min_samples_split = 4  # Minimum samples required to split a node
min_samples_leaf = 2  # Minimum samples required at a leaf node
max_features = 'sqrt'  # Number of features to consider for best split

# Create a pipeline with the preprocessor and Random Forest with regularization
pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42  # Seed for reproducibility
    ))
])

# Train the pipeline on the training data
pipeline_rf.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_rf = pipeline_rf.predict(X_test)

# Calculate accuracy and print classification report
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

print("\nRandom Forest with regularization:")
print("Accuracy:", accuracy_rf)
print(report_rf)





Random Forest with regularization:
Accuracy: 0.6776859504132231
              precision    recall  f1-score   support

           0       0.69      0.79      0.73       136
           1       0.66      0.54      0.59       106

    accuracy                           0.68       242
   macro avg       0.67      0.66      0.66       242
weighted avg       0.68      0.68      0.67       242





In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Define a pipeline with the preprocessor, scaler, PCA, and Logistic Regression
pipeline_pca_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.80)),  # Adjust n_components as needed
    ('clf', LogisticRegression(max_iter=1000))
])

# Train the pipeline with PCA on the training data
pipeline_pca_lr.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_pca_lr = pipeline_pca_lr.predict(X_test)

# Calculate accuracy and print classification report
accuracy_pca_lr = accuracy_score(y_test, y_pred_pca_lr)
report_pca_lr = classification_report(y_test, y_pred_pca_lr)

print("\nLogistic Regression with PCA:")
print("Accuracy:", accuracy_pca_lr)
print(report_pca_lr)

# Define a second pipeline without PCA for comparison
pipeline_original_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000))
])

# Train the pipeline without PCA on the training data
pipeline_original_lr.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_original_lr = pipeline_original_lr.predict(X_test)

# Calculate accuracy and print classification report
accuracy_original_lr = accuracy_score(y_test, y_pred_original_lr)
report_original_lr = classification_report(y_test, y_pred_original_lr)

print("\nLogistic Regression without PCA:")
print("Accuracy:", accuracy_original_lr)
print(report_original_lr)





Logistic Regression with PCA:
Accuracy: 0.5950413223140496
              precision    recall  f1-score   support

           0       0.61      0.75      0.68       136
           1       0.55      0.40      0.46       106

    accuracy                           0.60       242
   macro avg       0.58      0.57      0.57       242
weighted avg       0.59      0.60      0.58       242


Logistic Regression without PCA:
Accuracy: 0.5867768595041323
              precision    recall  f1-score   support

           0       0.61      0.73      0.66       136
           1       0.54      0.41      0.46       106

    accuracy                           0.59       242
   macro avg       0.57      0.57      0.56       242
weighted avg       0.58      0.59      0.58       242



In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Define the regularization type (penalty) and regularization strength (C)
# 'l1' for L1 regularization and 'l2' for L2 regularization
penalty_type = 'l2'  # Change to 'l1' if you want L1 regularization
C_value = 1.0  # Regularization strength; you can experiment with different values

# Define the logistic regression model with the appropriate solver for L1 regularization
solver_type = 'liblinear' if penalty_type == 'l1' else 'lbfgs'

# Create a pipeline without PCA, including preprocessing and logistic regression
pipeline_no_pca_lr = Pipeline([
    ('preprocessor', preprocessor),  # Column transformer for data preprocessing
    ('scaler', StandardScaler()),  # Standard scaling (optional but recommended)
    ('clf', LogisticRegression(
        penalty=penalty_type,
        C=C_value,
        solver=solver_type,
        max_iter=1000,
        random_state=42  # Seed for reproducibility
    ))
])

# Train the pipeline with logistic regression (no PCA) on the training data
pipeline_no_pca_lr.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_no_pca_lr = pipeline_no_pca_lr.predict(X_test)

# Calculate accuracy and print classification report
accuracy_no_pca_lr = accuracy_score(y_test, y_pred_no_pca_lr)
report_no_pca_lr = classification_report(y_test, y_pred_no_pca_lr)

print("\nLogistic Regression without PCA but with regularization:")
print("Accuracy:", accuracy_no_pca_lr)
print(report_no_pca_lr)



Logistic Regression without PCA but with regularization:
Accuracy: 0.5867768595041323
              precision    recall  f1-score   support

           0       0.61      0.73      0.66       136
           1       0.54      0.41      0.46       106

    accuracy                           0.59       242
   macro avg       0.57      0.57      0.56       242
weighted avg       0.58      0.59      0.58       242





In [7]:

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Define a pipeline with the preprocessor, scaler, PCA, and Logistic Regression
pipeline_pca_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.80)),  # Adjust n_components as needed
    ('clf', LogisticRegression(max_iter=1000))
])

# Train the pipeline with PCA on the training data
pipeline_pca_lr.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_pca_lr = pipeline_pca_lr.predict(X_test)

# Calculate accuracy and print classification report
accuracy_pca_lr = accuracy_score(y_test, y_pred_pca_lr)
report_pca_lr = classification_report(y_test, y_pred_pca_lr)

print("\nLogistic Regression with PCA:")
print("Accuracy:", accuracy_pca_lr)
print(report_pca_lr)
# Define the regularization type (penalty) and regularization strength (C)
# 'l1' for L1 regularization and 'l2' for L2 regularization
penalty_type = 'l2'  # Change to 'l2' if you want L2 regularization
C_value = 1.0  # Regularization strength; you can experiment with different values

# Define the logistic regression model with the appropriate solver for L1 regularization
solver_type = 'liblinear' if penalty_type == 'l1' else 'lbfgs'

# Create a pipeline with preprocessing, PCA, and logistic regression
pipeline_pca_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=100)),  # Adjust the number of components as needed
    ('clf', LogisticRegression(
        penalty=penalty_type,
        C=C_value,
        solver=solver_type,
        max_iter=1000,
        random_state=42  # Seed for reproducibility
    ))
])

# Train the pipeline with PCA and logistic regression on the training data
pipeline_pca_lr.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_pca_lr = pipeline_pca_lr.predict(X_test)

# Calculate accuracy and print classification report
accuracy_pca_lr = accuracy_score(y_test, y_pred_pca_lr)
report_pca_lr = classification_report(y_test, y_pred_pca_lr)

print("\nLogistic Regression with PCA and regularization:")
print("Accuracy:", accuracy_pca_lr)
print(report_pca_lr)





Logistic Regression with PCA:
Accuracy: 0.5950413223140496
              precision    recall  f1-score   support

           0       0.61      0.75      0.68       136
           1       0.55      0.40      0.46       106

    accuracy                           0.60       242
   macro avg       0.58      0.57      0.57       242
weighted avg       0.59      0.60      0.58       242


Logistic Regression with PCA and regularization:
Accuracy: 0.5867768595041323
              precision    recall  f1-score   support

           0       0.61      0.72      0.66       136
           1       0.54      0.42      0.47       106

    accuracy                           0.59       242
   macro avg       0.57      0.57      0.57       242
weighted avg       0.58      0.59      0.58       242



