# CS5830 Final Project

## Ensemble Boosting & Bagging

## Imports

In [None]:
# Data Manipulation Imports
import pandas as pd
import numpy as np

# Plotting Imports
import seaborn as sns
import matplotlib.pyplot as plt


# Sklearn
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import compute_class_weight
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import classification_report, f1_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# XG/Gamma
import xgboost as xgb

## Preprocessing

### Import Dataset

In [None]:
# Read the speed-dating CSV, preview its first rows, then report (rows, columns).
dating_df = pd.read_csv('./data/speeddating.csv')
preview_rows = dating_df.head()
display(preview_rows)
print(f'Dataset Shape: {dating_df.shape}')

### Column Headers

In [None]:
# Print every column name on its own line.
column_names = dating_df.columns.tolist()
for name in column_names:
    print(name)

### Repeating Info

In [None]:
# Columns removed under the "Repeating Info" heading.
# NOTE(review): the `d_*` names are presumably derived duplicates of other
# columns - verify against the dataset documentation.
repeating_columns = [
    'has_null', 'd_age', 'd_d_age', 'samerace',
    'd_importance_same_race', 'd_importance_same_religion',
    'd_pref_o_attractive', 'd_pref_o_sincere', 'd_pref_o_intelligence',
    'd_pref_o_funny', 'd_pref_o_ambitious', 'd_pref_o_shared_interests',
    'd_attractive_o', 'd_sinsere_o', 'd_intelligence_o',
    'd_funny_o', 'd_ambitous_o', 'd_shared_interests_o',
    'd_attractive_important', 'd_sincere_important', 'd_intellicence_important',
    'd_funny_important', 'd_ambtition_important', 'd_shared_interests_important',
    'd_attractive', 'd_sincere', 'd_intelligence', 'd_funny', 'd_ambition',
    'd_attractive_partner', 'd_sincere_partner', 'd_intelligence_partner',
    'd_funny_partner', 'd_ambition_partner', 'd_shared_interests_partner',
    'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking',
    'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts',
    'music', 'shopping', 'yoga',
    'd_sports', 'd_tvsports', 'd_exercise', 'd_dining', 'd_museums', 'd_art',
    'd_hiking', 'd_gaming', 'd_clubbing', 'd_reading', 'd_tv', 'd_theater',
    'd_movies', 'd_concerts', 'd_music', 'd_shopping', 'd_yoga',
    'd_interests_correlate', 'd_expected_happy_with_sd_people',
    'd_expected_num_interested_in_me', 'd_expected_num_matches',
    'd_like', 'd_guess_prob_liked',
]

dating_df = dating_df.drop(columns=repeating_columns)
display(dating_df.shape)

### Null Values

In [None]:
# Per-column missing-value counts; `null_counts` keeps every column so later
# cells can filter on it.
print("Number of Null Values per Column:")
null_counts = dating_df.isnull().sum()
for col, count in null_counts[null_counts > 0].items():
    print(f"{col}: {count}")

# Rows that contain at least one missing value.
rows_with_na = dating_df.isnull().any(axis=1)
print(f"\nNumber of Rows with NA values: {dating_df[rows_with_na].shape[0]}\n")


Can't drop samples with missing values as that would lead to a significant loss of data

Let's drop columns where there are over 1000 missing values. (Rows where the majority of the data is missing are not dropped in the cell below; only the column filter is applied.)

In [None]:
# Names of the columns whose missing-value count exceeds 1000.
drop = null_counts[null_counts > 1000].index.tolist()
dating_df = dating_df.drop(columns=drop)

# Re-count the rows that still contain at least one missing value.
rows_with_na = dating_df.isnull().any(axis=1)
print(f"\nNumber of Rows with NA values: {dating_df[rows_with_na].shape[0]}\n")

Impute the remaining missing values (using the most frequent value of each column)

In [None]:
# Features: everything except the target (`match`) and the two decision columns.
# NOTE(review): `decision` / `decision_o` are presumably excluded because they
# determine `match` - confirm against the dataset description.
X = dating_df.drop(columns=['match', 'decision', 'decision_o'])
y = dating_df['match']

# The target is stored as the strings "b'0'" / "b'1'"; map them to integers.
# `y` becomes a one-column DataFrame whose column label is 0.
matches = {"b'0'": 0, "b'1'": 1}
y = pd.DataFrame([matches[item] for item in y])

# Fill every remaining gap with the column's most frequent value.
# SimpleImputer returns a bare array, so the column names are re-attached and
# `infer_objects()` restores numeric dtypes. Without this every column comes
# back as `object`, and a later `dtype == object` check would treat numeric
# features as categorical.
imputer = SimpleImputer(strategy='most_frequent')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns).infer_objects()

print(f"\nNumber of Rows with NA values: {X[X.isnull().any(axis=1)].shape[0]}\n")

### Feature Transformations 

In [None]:
# Ordinal-encode each column that is stored with the `object` dtype.
object_columns = [name for name in X.columns if X[name].dtype == object]
for name in object_columns:
    X[name] = OrdinalEncoder().fit_transform(X[[name]])

# Label the feature columns with the names from the source frame.
feature_names = dating_df.drop(columns=['match', 'decision', 'decision_o']).columns
X.columns = feature_names

In [None]:
# Preview the first five rows of the encoded feature matrix.
X.head(n=5)

### Some Statistics

In [None]:
# Relative frequency of each target class.
class_shares = y.value_counts(normalize=True)
print(class_shares)

# Bar chart of the class counts (column 0 of `y` holds the target).
ax = sns.countplot(data=y, x=y[0])
ax.set_title("Target Variable Distribution")
# plt.savefig("./figures/target-variable-dist.png")
plt.show()

In [None]:
# Correlation of every feature with the target, strongest positive first.
print("Correlation with the target variable:")
target_corr = X.corrwith(y[0])
print(target_corr.sort_values(ascending=False))

### Training Preparation

In [None]:
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)
# Flatten the one-column training target into a 1-D array.
y_train = y_train.to_numpy().ravel()

# Fit the scaler on the training features only, then scale them.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train, y_train).transform(X_train)

# 'balanced' class weights computed from the training labels.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)

## Base Models

In [None]:
# Parallel lists: base_models['models'][i] is the name whose test score is
# base_models['scores'][i].
base_models = dict(models=[], scores=[])

In [None]:
from sklearn.linear_model import LogisticRegression

# model - Logistic Regression
model_name = 'LogisticRegression'
model_lr = LogisticRegression(class_weight='balanced')

# train / cross-validation
# Note: cross-validation is scored with scoring='f1', while the test score
# stored below uses average='weighted', so the two numbers are not directly
# comparable.
cv_scores = cross_val_score(model_lr, X_train_scaled, y_train, cv=5, scoring='f1')
print("Cross-validation F1-scores:", cv_scores)
print("Average F1-score:", np.mean(cv_scores))
print()

# test
model_lr.fit(X_train_scaled, y_train)
y_pred = model_lr.predict(scaler.transform(X_test))
print("Logistic regression performance with class weights:")
print(classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score:", f1)

# Record name and score together, only after the cell has succeeded, and
# replace any existing entry. A failed or repeated run therefore cannot leave
# the two lists misaligned or duplicated.
if model_name in base_models['models']:
    base_models['scores'][base_models['models'].index(model_name)] = f1
else:
    base_models['models'].append(model_name)
    base_models['scores'].append(f1)

In [None]:
from sklearn.svm import SVC

# model - Support Vector Machine Classifier
model_name = 'SVC'
model_svc = SVC(class_weight='balanced')

# train / cross-validation
# Note: cross-validation is scored with scoring='f1', while the test score
# stored below uses average='weighted', so the two numbers are not directly
# comparable.
cv_scores = cross_val_score(model_svc, X_train_scaled, y_train, cv=5, scoring='f1')
print("Cross-validation F1-scores:", cv_scores)
print("Average F1-score:", np.mean(cv_scores))
print()

# test
model_svc.fit(X_train_scaled, y_train)
y_pred = model_svc.predict(scaler.transform(X_test))
print("SVC performance with class weights:")
print(classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-score: {f1:.4f}")

# Record name and score together, only after the cell has succeeded, and
# replace any existing entry. A failed or repeated run therefore cannot leave
# the two lists misaligned or duplicated.
if model_name in base_models['models']:
    base_models['scores'][base_models['models'].index(model_name)] = f1
else:
    base_models['models'].append(model_name)
    base_models['scores'].append(f1)

In [None]:
from sklearn.naive_bayes import GaussianNB

# model - Naive Bayes
model_name = 'GaussianNB'
model_nb = GaussianNB()

# train / cross-validation
# Note: cross-validation is scored with scoring='f1', while the test score
# stored below uses average='weighted', so the two numbers are not directly
# comparable.
cv_scores = cross_val_score(model_nb, X_train_scaled, y_train, cv=5, scoring='f1')
print("Cross-validation F1-scores:", cv_scores)
print("Average F1-score:", np.mean(cv_scores))
print()

# test
model_nb.fit(X_train_scaled, y_train)
y_pred = model_nb.predict(scaler.transform(X_test))
print("Naive Bayes performance:")
print(classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-score: {f1:.4f}")

# Record name and score together, only after the cell has succeeded, and
# replace any existing entry. A failed or repeated run therefore cannot leave
# the two lists misaligned or duplicated.
if model_name in base_models['models']:
    base_models['scores'][base_models['models'].index(model_name)] = f1
else:
    base_models['models'].append(model_name)
    base_models['scores'].append(f1)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# model - K-Nearest Neighbors
model_name = 'K-Nearest Neighbors'
model_knn = KNeighborsClassifier(n_neighbors=3)

# train / cross-validation
# Note: cross-validation is scored with scoring='f1', while the test score
# stored below uses average='weighted', so the two numbers are not directly
# comparable.
cv_scores = cross_val_score(model_knn, X_train_scaled, y_train, cv=5, scoring='f1')
print("Cross-validation F1-scores:", cv_scores)
print("Average F1-score:", np.mean(cv_scores))
print()

# test
model_knn.fit(X_train_scaled, y_train)
y_pred = model_knn.predict(scaler.transform(X_test))
print("K-Nearest Neighbors performance:")
print(classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-score: {f1:.4f}")

# Record name and score together, only after the cell has succeeded, and
# replace any existing entry. A failed or repeated run therefore cannot leave
# the two lists misaligned or duplicated.
if model_name in base_models['models']:
    base_models['scores'][base_models['models'].index(model_name)] = f1
else:
    base_models['models'].append(model_name)
    base_models['scores'].append(f1)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# model - Decision Tree Classifier
# random_state is fixed (same seed as the train/test split) so that a fresh
# kernel run reproduces the same tree and scores.
model_name = 'Decision Tree'
model_dt = DecisionTreeClassifier(class_weight='balanced', random_state=123)

# train / cross-validation
# Note: cross-validation is scored with scoring='f1', while the test score
# stored below uses average='weighted', so the two numbers are not directly
# comparable.
cv_scores = cross_val_score(model_dt, X_train_scaled, y_train, cv=5, scoring='f1')
print("Cross-validation F1-scores:", cv_scores)
print("Average F1-score:", np.mean(cv_scores))
print()

# test
model_dt.fit(X_train_scaled, y_train)
y_pred = model_dt.predict(scaler.transform(X_test))
print("Decision Tree performance with class weights:")
print(classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score:", f1)

# Record name and score together, only after the cell has succeeded, and
# replace any existing entry. A failed or repeated run therefore cannot leave
# the two lists misaligned or duplicated.
if model_name in base_models['models']:
    base_models['scores'][base_models['models'].index(model_name)] = f1
else:
    base_models['models'].append(model_name)
    base_models['scores'].append(f1)

In [None]:
from pathlib import Path

# Bar chart of the test-set scores collected in `base_models`.
# The output directory is created first so that saving the figure does not
# fail on a fresh checkout where ./figures is missing.
figure_path = Path("./figures/base-model-scores.png")
figure_path.parent.mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(12,6))
ax = sns.barplot(x=base_models['models'], y=base_models['scores'])
ax.set(xlabel="Model", ylabel="Weighted F1 (test set)", title="Base model test scores")
# The y-axis is clipped to [0.6, 1.0]; bars below 0.6 would not be visible.
plt.ylim(0.6, 1.0)
plt.savefig(figure_path)
plt.show()

## Boosted Ensembles

Note: AdaBoost - XGBoost - GammaBoost

### AdaBoost Logistic Regression

In [None]:
# AdaBoost over the logistic-regression base model; the cell's value is the
# score on the held-out split.
# NOTE(review): this fits on the unscaled X_train, whereas model_lr itself was
# evaluated on X_train_scaled - confirm that is intended.
lr_ada = AdaBoostClassifier(model_lr, random_state=123).fit(X_train, y_train)
lr_ada.score(X_test, y_test)

In [None]:
# Same ensemble limited to two boosting rounds; rebinds `lr_ada`.
lr_ada = AdaBoostClassifier(
    model_lr,
    random_state=123,
    n_estimators=2,
).fit(X_train, y_train)
lr_ada.score(X_test, y_test)

### Ada Boost Decision Tree

In [None]:
# AdaBoost over the decision-tree base model; the cell's value is the score on
# the held-out split.
# NOTE(review): model_dt is created without a depth limit; boosting a fully
# grown tree may stop after the first round - check len(dt_ada.estimators_).
dt_ada = AdaBoostClassifier(model_dt, random_state=123).fit(X_train, y_train)
dt_ada.score(X_test, y_test)

### Random Forest

In [None]:
# Random forest with 100 trees, trained on all cores with balanced class weights.
# random_state is fixed (same seed as the other estimators in this notebook) so
# the reported score is reproducible after a kernel restart.
rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    class_weight='balanced',
    random_state=123,
)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Smaller forest (5 trees) for comparison; rebinds `rf`.
# random_state is fixed (same seed as the other estimators in this notebook) so
# the reported score is reproducible after a kernel restart.
rf = RandomForestClassifier(
    n_estimators=5,
    n_jobs=-1,
    class_weight='balanced',
    random_state=123,
)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

### XGBoost

In [None]:
# XGBoost classifier with the "binary:logistic" objective; the cell's value is
# the score on the held-out split.
xgb_model_log = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=123,
).fit(X_train, y_train)

xgb_model_log.score(X_test, y_test)

In [None]:
# XGBoost classifier with the "binary:hinge" objective; the cell's value is the
# score on the held-out split.
xgb_model_hinge = xgb.XGBClassifier(
    objective="binary:hinge",
    random_state=123,
).fit(X_train, y_train)

xgb_model_hinge.score(X_test, y_test)

In [None]:
# XGBoost classifier with the "binary:logitraw" objective; the cell's value is
# the score on the held-out split.
# NOTE(review): with a raw-margin objective, confirm how predict() thresholds
# the outputs before comparing this score with the other objectives.
xgb_model_logitraw = xgb.XGBClassifier(
    objective="binary:logitraw",
    random_state=123,
).fit(X_train, y_train)

xgb_model_logitraw.score(X_test, y_test)

## Bagging Ensembles

Note: sklearn.ensemble.BaggingClassifier