# CS5830 Final Project

## Ensemble Boosting & Bagging

## Imports

In [None]:
# Data Manipulation Imports
import pandas as pd
import numpy as np

# Plotting Imports
import seaborn as sns
import matplotlib.pyplot as plt


# Sklearn
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import compute_class_weight
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import classification_report, f1_score

## Preprocessing

### Import Dataset

In [None]:
# Read the speed-dating CSV into a DataFrame, preview its first rows,
# and report its (rows, columns) dimensions.
dating_df = pd.read_csv('./data/speeddating.csv')

display(dating_df.head())

dataset_shape = dating_df.shape
print(f'Dataset Shape: {dataset_shape}')

### Column Headers

In [None]:
# List every column label, one per line (iterating a DataFrame yields its column labels).
for header in list(dating_df):
    print(header)

### Null Values

In [None]:
# Count missing values per column.
null_counts = dating_df.isnull().sum()

# Report only the columns that actually contain missing values.
print("Number of Null Values per Column:")
for name, n_missing in null_counts[null_counts > 0].items():
    print(f"{name}: {n_missing}")

# A row is incomplete as soon as any one of its cells is null.
incomplete_rows = dating_df[dating_df.isnull().any(axis=1)]
print(f"\nNumber of Rows with NA values: {incomplete_rows.shape[0]}\n")

In [None]:
# Shape the data would have if every row containing at least one null were removed
# (not assigned back; this is only an inspection).
dating_df.dropna(axis=0, how='any').shape

In [None]:
# Column is empty
# errors='ignore' makes this cell safe to re-run: once the column has been
# removed, a second execution is a no-op instead of raising KeyError.
dating_df = dating_df.drop(columns='has_null', errors='ignore')

We can't simply drop every sample that has a missing value, as that would lead to a significant loss of data.

Instead, let's drop the columns that have more than 1000 missing values, and drop the rows where the majority of the data is missing (the code below removes the rows that have no `sports` value).

In [None]:
# Columns with more than this many missing values are removed outright.
NULL_COLUMN_THRESHOLD = 1000

# Recount on the *current* frame instead of reusing an earlier `null_counts`
# snapshot: a stale snapshot can name columns that have already been removed,
# which makes `drop(columns=...)` raise KeyError (e.g. when this cell is re-run).
current_null_counts = dating_df.isnull().sum()
drop = current_null_counts[current_null_counts > NULL_COLUMN_THRESHOLD].index.tolist()
print(drop)

dating_df = dating_df.drop(columns=drop) # drop columns
# NOTE(review): presumably rows without a `sports` value are the mostly-empty
# samples the text above refers to; confirm against the data.
dating_df = dating_df.dropna(subset=['sports']) # drop samples

Impute the remaining missing values (using the median).

In [None]:
# Fill the remaining gaps column by column.
# Numeric columns get their median. Non-numeric columns cannot be passed to
# `median()` (it raises TypeError on text), so they get their most frequent
# value instead.
# NOTE(review): the fill values are computed on the full dataset, before the
# train/test split further down; consider fitting them on the training part only.
null_counts = dating_df.isnull().sum()

for col, count in null_counts.items():
    if count == 0:
        continue

    print(f"{col}: {count}")

    if pd.api.types.is_numeric_dtype(dating_df[col]):
        fill_value = dating_df[col].median()
    else:
        modes = dating_df[col].mode(dropna=True)
        if modes.empty:
            # Column holds no observed value at all; nothing sensible to fill with.
            continue
        fill_value = modes.iloc[0]

    dating_df[col] = dating_df[col].fillna(fill_value)

In [None]:
# Per-column count of values that are still missing after the imputation step.
dating_df.isnull().sum(axis=0)

In [None]:
# (rows, columns) of the frame at this point in the notebook, i.e. after the
# column/row drops and the imputation performed in the cells above.
dating_df.shape

### Feature Transformations 

In [None]:
# Replace every object-dtype column with ordinal codes so the whole frame is numeric.
# NOTE(review): ordinal codes impose an arbitrary order on purely nominal
# categories; confirm that is acceptable for the models used later.
object_columns = [name for name in dating_df.columns if dating_df[name].dtype == object]

for name in object_columns:
    # A separate encoder is fitted for each column.
    encoder = OrdinalEncoder()
    encoded = encoder.fit_transform(dating_df[[name]])
    dating_df[name] = encoded

In [None]:
# Preview the first five rows after encoding.
dating_df.head(n=5)

### Some Statistics

In [None]:
# Separate the label column from the predictors.
# NOTE(review): every other column is kept as a feature; verify that none of
# them is derived from the label itself (e.g. per-person decision columns, if
# the dataset has them), since that would leak the target into X.
target = 'match'
y = dating_df[target]
X = dating_df.drop(columns=[target])

In [None]:
# Relative frequency of each class, followed by a bar chart of the raw counts.
class_shares = y.value_counts(normalize=True)
print(class_shares)

ax = sns.countplot(x=y)
ax.set_title("Target Variable Distribution")
plt.show()

In [None]:
# Pairwise correlation matrix of all columns; only the 'match' column of it is
# shown, ordered from the most positive to the most negative correlation.
corr = dating_df.corr()
match_correlations = corr['match'].sort_values(ascending=False)

print("Correlation with the target variable:")
print(match_correlations)

### Training Preparation

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified; with an imbalanced target,
# `stratify=y` would keep the class ratio equal in both parts. Confirm whether
# that is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Fit the scaler on the training features only, then scale them.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

# 'balanced' weights, one per class found in the training labels
# (same order as np.unique(y_train)).
train_classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=train_classes,
    y=y_train,
)

## Base Models

Note: LogisticRegression - SVC - Naive Bayes - Decision Tree - KNeighborsClassifier - Neural Net

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# model
lr = LogisticRegression(class_weight={0: class_weights[0], 1: class_weights[1]})

# Scaling is placed inside a pipeline so that, during cross-validation, the
# scaler is re-fitted on each fold's training portion only. Cross-validating on
# the pre-scaled X_train_scaled would let every validation fold contribute to
# the scaler's mean/variance, which leaks information into the CV scores.
lr_pipeline = make_pipeline(StandardScaler(), lr)

# train / cross-validation
# cross_val_score works on clones, so `lr` and `lr_pipeline` stay unfitted here.
cv_scores = cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring='f1_macro')
print("Cross-validation F1-scores:", cv_scores)
print("Average F1-score:", np.mean(cv_scores))
print()

# test
# Fitting the pipeline fits its scaler on X_train and fits `lr` in place on the
# scaled training data; predict() applies the same scaling to X_test.
lr_pipeline.fit(X_train, y_train)
y_pred = lr_pipeline.predict(X_test)
print("Logistic regression performance with class weights:")
print(classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score:", f1)
# The CV scores above use macro averaging; report the same average on the test
# set so the two numbers can be compared directly.
print("Macro F1 score:", f1_score(y_test, y_pred, average='macro'))

### High Bias Models

#### Low Scores

#### High Scores

### High Variance Models

#### Low Scores

#### High Scores

## Boosted Ensembles

Note: AdaBoost - XGBoost - GammaBoost

## Bagging Ensembles

Note: sklearn.ensemble.BaggingClassifier