In [1]:
from cleaning import data_new, X_train, X_test, y_train, y_test

In [10]:
# Import necessary libraries
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Define imputation strategies
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Define numerical and categorical features
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Create transformers for numerical and categorical data
numeric_transformer = Pipeline([
    ('imputer', numeric_imputer),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),  # Add PolynomialFeatures
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', categorical_imputer),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a ColumnTransformer to handle numerical and categorical features
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Create a pipeline with the preprocessor and Gaussian Naive Bayes classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', GaussianNB())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Use the pipeline to predict on the test data
y_pred = pipeline.predict(X_test)

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with polynomial features:", accuracy)
print(classification_report(y_test, y_pred))




Accuracy with polynomial features: 0.5950413223140496
              precision    recall  f1-score   support

           0       0.60      0.84      0.70       136
           1       0.58      0.28      0.38       106

    accuracy                           0.60       242
   macro avg       0.59      0.56      0.54       242
weighted avg       0.59      0.60      0.56       242





In [11]:
# Import necessary libraries
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report

# Define imputation strategies
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Define numerical and categorical features
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Create transformers for numerical and categorical data
numeric_transformer = Pipeline([
    ('imputer', numeric_imputer),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),  # Add PolynomialFeatures
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.93))  # PCA with 93% variance retention
])

categorical_transformer = Pipeline([
    ('imputer', categorical_imputer),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a ColumnTransformer to handle numerical and categorical features
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Create a pipeline with the preprocessor and Gaussian Naive Bayes classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', GaussianNB())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Use the pipeline to predict on the test data
y_pred = pipeline.predict(X_test)

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with polynomial features and PCA:", accuracy)
print(classification_report(y_test, y_pred))




Accuracy with polynomial features and PCA: 0.5661157024793388
              precision    recall  f1-score   support

           0       0.59      0.74      0.66       136
           1       0.51      0.35      0.41       106

    accuracy                           0.57       242
   macro avg       0.55      0.54      0.53       242
weighted avg       0.55      0.57      0.55       242



In [12]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report


In [13]:
# Create numerical and categorical transformers
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.93))
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers in a ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Create KNN pipeline
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', KNeighborsClassifier())
])

# Fit the pipeline
knn_pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred_knn = knn_pipeline.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy with polynomial features and PCA:", accuracy_knn)
print(classification_report(y_test, y_pred_knn))




KNN Accuracy with polynomial features and PCA: 0.5702479338842975
              precision    recall  f1-score   support

           0       0.62      0.62      0.62       136
           1       0.51      0.50      0.50       106

    accuracy                           0.57       242
   macro avg       0.56      0.56      0.56       242
weighted avg       0.57      0.57      0.57       242



In [14]:
# Create numerical and categorical transformers
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers in a ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Create KNN pipeline
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', KNeighborsClassifier())
])

# Fit the pipeline
knn_pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred_knn = knn_pipeline.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy with polynomial features (without PCA):", accuracy_knn)
print(classification_report(y_test, y_pred_knn))




KNN Accuracy with polynomial features (without PCA): 0.5909090909090909
              precision    recall  f1-score   support

           0       0.64      0.62      0.63       136
           1       0.53      0.56      0.54       106

    accuracy                           0.59       242
   macro avg       0.59      0.59      0.59       242
weighted avg       0.59      0.59      0.59       242



In [15]:
# Create numerical and categorical transformers
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers in a ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Create Random Forest pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier())
])

# Fit the pipeline
rf_pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = rf_pipeline.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy with polynomial features (without PCA):", accuracy_rf)
print(classification_report(y_test, y_pred_rf))




Random Forest Accuracy with polynomial features (without PCA): 0.7231404958677686
              precision    recall  f1-score   support

           0       0.73      0.79      0.76       136
           1       0.71      0.63      0.67       106

    accuracy                           0.72       242
   macro avg       0.72      0.71      0.71       242
weighted avg       0.72      0.72      0.72       242





In [16]:
# Create numerical and categorical transformers
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.93))
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers in a ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Create Random Forest pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier())
])

# Fit the pipeline
rf_pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = rf_pipeline.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy with polynomial features and PCA:", accuracy_rf)
print(classification_report(y_test, y_pred_rf))




Random Forest Accuracy with polynomial features and PCA: 0.7272727272727273
              precision    recall  f1-score   support

           0       0.74      0.80      0.77       136
           1       0.71      0.63      0.67       106

    accuracy                           0.73       242
   macro avg       0.72      0.72      0.72       242
weighted avg       0.73      0.73      0.72       242





In [17]:
# Create numerical and categorical transformers
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.93))
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers in a ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Create Logistic Regression pipeline
log_reg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression())
])

# Fit the pipeline
log_reg_pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred_log_reg = log_reg_pipeline.predict(X_test)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print("Logistic Regression Accuracy with polynomial features and PCA:", accuracy_log_reg)
print(classification_report(y_test, y_pred_log_reg))




Logistic Regression Accuracy with polynomial features and PCA: 0.628099173553719
              precision    recall  f1-score   support

           0       0.66      0.71      0.68       136
           1       0.59      0.52      0.55       106

    accuracy                           0.63       242
   macro avg       0.62      0.62      0.62       242
weighted avg       0.62      0.63      0.62       242



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Create numerical and categorical transformers
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers in a ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Create Logistic Regression pipeline
log_reg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression())
])

# Fit the pipeline
log_reg_pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred_log_reg = log_reg_pipeline.predict(X_test)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print("Logistic Regression Accuracy with polynomial features (without PCA):", accuracy_log_reg)
print(classification_report(y_test, y_pred_log_reg))




Logistic Regression Accuracy with polynomial features (without PCA): 0.6859504132231405
              precision    recall  f1-score   support

           0       0.73      0.70      0.71       136
           1       0.63      0.67      0.65       106

    accuracy                           0.69       242
   macro avg       0.68      0.68      0.68       242
weighted avg       0.69      0.69      0.69       242



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
