In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.exceptions import FitFailedWarning
from sklearn.svm import SVC
from sklearn.tree import plot_tree

In [48]:
train= pd.read_csv("C:/Users/hblin/OneDrive - Cal Poly/GSB 544/Kaggle/gsb-544-fall-2024-political-affiliations/CAH-201803-train.csv")

In [49]:
X = train.drop(["id_num", "political_affiliation"], axis= 1)

y= train["political_affiliation"]

In [50]:
test_data = pd.read_csv("C:/Users/hblin/OneDrive - Cal Poly/GSB 544/Kaggle/gsb-544-fall-2024-political-affiliations/CAH-201803-test.csv")

In [7]:
# Define the preprocessing step
ct = ColumnTransformer(
    [
        ("dummify", 
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
         make_column_selector(dtype_include=object)),
        ("standardize", 
         StandardScaler(), 
         make_column_selector(dtype_include=np.number))
    ],
    remainder="passthrough"
).set_output(transform="pandas")

def evaluate_grid_search(grid_search, X, y):
    """
    This function performs grid search, evaluates the best model, and prints the results.

    Parameters:
    - grid_search: The GridSearchCV object that has been set up with the pipeline and parameter grid.
    - X: The feature matrix.
    - y: The target vector.

    Returns:
    - None
    """
    # Suppress warnings only in this block
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FitFailedWarning)
        grid_search.fit(X, y)  # Perform GridSearchCV without warnings

    # Output the best parameters and ROC-AUC score
    print("Best Parameters:", grid_search.best_params_)
    print(f"Best Cross-validated Metric: {grid_search.best_score_:.4f}")

    # Make predictions with the best estimator
    y_pred = grid_search.best_estimator_.predict(X)

    # Print the classification report and confusion matrix
    print("Classification Report:")
    print(classification_report(y, y_pred))

    print("Confusion Matrix:")
    print(confusion_matrix(y, y_pred))

# Example usage:
# evaluate_grid_search(grid_search, X, y)

# Politics

# Logistic

In [117]:
param_grid = {
    "lgr__C": (1,.01,.001),  # Regularization strength (covers a wider range)
    "lgr__penalty": ["l1","none"],  # Adding "none" to include no regularization
    "lgr__solver": ["lbfgs", "liblinear"],  # Adding all solvers supported
    "lgr__class_weight": [None, "balanced"],  # Adjust weights for imbalanced data
    "lgr__l1_ratio": (.1, 0.001, 1),  # ElasticNet mix ratio (used with solver="saga")
}

lgr_pipeline = Pipeline(
    [("preprocessing", ct),
     ("lgr", LogisticRegression())]  # Default classifier
)

grid_search = GridSearchCV(
    lgr_pipeline,
    param_grid,
    cv=5,  # Ensures proper class distribution
    scoring="accuracy",
    n_jobs=-1
)
evaluate_grid_search(grid_search, X, y)

Best Parameters: {'lgr__C': 1, 'lgr__class_weight': 'balanced', 'lgr__l1_ratio': 0.1, 'lgr__penalty': 'l1', 'lgr__solver': 'liblinear'}
Best Cross-validated Metric: 0.6275
Classification Report:
              precision    recall  f1-score   support

    Democrat       0.68      0.75      0.71        59
 Independent       0.65      0.59      0.62        56
  Republican       0.79      0.78      0.79        54

    accuracy                           0.70       169
   macro avg       0.71      0.70      0.70       169
weighted avg       0.70      0.70      0.70       169

Confusion Matrix:
[[44 12  3]
 [15 33  8]
 [ 6  6 42]]


        nan        nan        nan 0.62156863        nan        nan
        nan 0.62745098        nan        nan        nan 0.62745098
        nan        nan        nan 0.62745098        nan        nan
        nan 0.34919786        nan        nan        nan 0.34919786
        nan        nan        nan 0.34919786        nan        nan
        nan 0.34919786        nan        nan        nan 0.34919786
        nan        nan        nan 0.34919786        nan        nan
        nan 0.34919786        nan        nan        nan 0.34919786
        nan        nan        nan 0.34919786        nan        nan
        nan 0.34919786        nan        nan        nan 0.34919786
        nan        nan        nan 0.34919786        nan        nan]



ridge and square gr liv area

In [None]:
fin_param_grid = {
    "lgr__C": [1],  # Regularization strength
    "lgr__class_weight": ["balanced"],  # Adjust weights based on class frequencies
    "lgr__l1_ratio": [0.1],  # ElasticNet mixing parameter (only relevant if 'elasticnet' is used)
    "lgr__penalty": ["l2"],
    "lgr__solver": ["liblinear"]  # Solver
}

lgr_pipeline = Pipeline(
    [("preprocessing", ct),
     ("lgr", LogisticRegression())]  # Default classifier
)

final_model_fit = GridSearchCV(
    lgr_pipeline,
    fin_param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)
evaluate_grid_search(final_model_fit, X, y)

Best Parameters: {'lgr__C': 1, 'lgr__class_weight': 'balanced', 'lgr__l1_ratio': 0.1, 'lgr__penalty': 'l2', 'lgr__solver': 'liblinear'}
Best Cross-validated Metric: 0.6332
Classification Report:
              precision    recall  f1-score   support

    Democrat       0.68      0.73      0.70        59
 Independent       0.67      0.64      0.65        56
  Republican       0.81      0.78      0.79        54

    accuracy                           0.72       169
   macro avg       0.72      0.72      0.72       169
weighted avg       0.72      0.72      0.72       169

Confusion Matrix:
[[43 13  3]
 [13 36  7]
 [ 7  5 42]]




# Decision Tree

In [27]:
param_grid = {
    "dt__max_depth": list(range(3, 11)) + [None],  # Integer range from 3 to 10, plus None for no limit
    "dt__min_samples_split": list(range(2, 21)),  # Integer range from 2 to 20
    "dt__min_samples_leaf": list(range(1, 11)),   # Integer range from 1 to 10
    "dt__criterion": ["gini", "entropy"]          # Keep these as discrete choices
}

# Define the pipeline with DecisionTreeClassifier
dt_pipeline = Pipeline(
    [("preprocessing", ct),
     ("dt", DecisionTreeClassifier())]
)

# Set up GridSearchCV for cross-validation and hyperparameter search
grid_search = GridSearchCV(dt_pipeline, param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV with your data
grid_search.fit(X, y)
evaluate_grid_search(grid_search, X, y)

Best Parameters: {'dt__criterion': 'entropy', 'dt__max_depth': 5, 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 4}
Best Cross-validated Metric: 0.6157
Classification Report:
              precision    recall  f1-score   support

    Democrat       0.72      0.83      0.77        59
 Independent       0.73      0.68      0.70        56
  Republican       0.94      0.85      0.89        54

    accuracy                           0.79       169
   macro avg       0.80      0.79      0.79       169
weighted avg       0.79      0.79      0.79       169

Confusion Matrix:
[[49  8  2]
 [17 38  1]
 [ 2  6 46]]


In [121]:
final_predictions = pd.DataFrame(
    {"id_num": test_data['id_num'],
    "political_affiliation_predicted": final_model_fit.predict(test_data)}
)
