In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.metrics as skm
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import joblib
import os

# General setup and helper methods

In [2]:
df = pd.read_csv("../data/preprocessed/reviewFeatures.csv")

In [3]:
"""
  Load target features on y and drop target features on X.
  Perform shuffle on first split to prevent bias. Split 80-20 for train-test.
  Split again on train to get validation set. So 60-20-20.
  Stratifies all subsets to ensure equal samples for each class.
"""
def train_test_valid_split(df, targets):
  y = df[targets]
  X = df.drop(targets, axis=1)

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=101, shuffle=True, stratify=y)
  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=101, shuffle=True, stratify=y_train)

  return X_train, X_test, X_val, y_train, y_test, y_val

In [4]:
"""
  Merges training and validation data.
"""
def concat_train_valid(X_train, y_train, X_val, y_val):
  X_train = pd.concat([X_train, X_val])
  y_train = pd.concat([y_train, y_val])

  return X_train, y_train

In [5]:
"""
    Saves the best model as joblib file.
"""
def save_model(model):
    best_model = model.best_estimator_

    model_fp = os.path.join("../models", "best_gbc_model.joblib")

    joblib.dump(best_model, model_fp)

    print(f"Saved best Gradient-Boosted Tree Classifer to {model_fp}")

    return best_model    

In [6]:
"""
    Prints the classification report for training, test, and validation only if hold-out was used.
    Also prints out the most important features and the number of features retained after feature selection if grid search was used.
"""
def printResults(classifer, X_train, X_test, X_val, y_train, y_test, y_val, grid_search=False, param_grid=None):
    train_pred = classifer.predict(X_train)
    print(f'Training Report')
    print(classification_report(y_train, train_pred))

    train_report = classification_report(y_train, train_pred, output_dict=True)

    print(f"Training Accuracy {train_report['accuracy']:.4f}")
    print(f"Training F1-Score: {train_report['macro avg']['f1-score']:.4f}\n")  

    if grid_search == False:
        val_pred = classifer.predict(X_val)    
        print(f'Validation Report')
        print(classification_report(y_val, val_pred))

        val_report = classification_report(y_test, val_pred, output_dict=True)
        print(f"Validation Accuracy {val_report['accuracy']:.4f}")
        print(f"Validation F1-Score: {val_report['macro avg']['f1-score']:.4f}\n")    

    test_pred = classifer.predict(X_test)
    print(f'Test Report')
    print(classification_report(y_test, test_pred))

    test_report = classification_report(y_test, test_pred, output_dict=True)
    print(f"Test Accuracy {test_report['accuracy']:.4f}")
    print(f"Test F1-Score: {test_report['macro avg']['f1-score']:.4f}\n")      

    if grid_search == True and param_grid is not None:
        fs = classifer.named_steps['fs']
        feature_retained = fs.get_support()
        feature_names = X_train.columns[feature_retained]

        clf = classifer.named_steps['classifer']

        most_important_features = pd.DataFrame({
            'Feature': feature_names,
            'Importance': clf.feature_importances_
        }).sort_values(by='Importance', ascending=False)

        print(f'Number of features retained: {len(feature_names)}')

        print("Most Important Features: ")
        print(most_important_features.head(10))       

In [7]:
"""
    Trains a Gradient Boosted ensemble model by training weak decision trees to produce residuals to reduce the residuals of the main strong model.
    First, performs a train-test-valid split of 60-20-20. Then performs feature selection using feature importance and a threshold value. 
    Features with absolute importance values less than the threshold value are discarded, while the rest are retained.
    If grid search flag is true and a parameter grid has been specified, then merge the train and validation subsets and run grid search.
    Then train the classifer and the results gets printed out via the printResults function.
    Additionally, provided grid search is enabled, print the best model parameters and score and save the best model.
"""
def GBClassification(df, grid_search=False, param_grid=None):
    X_train, X_test, X_val, y_train, y_test, y_val = train_test_valid_split(df, "Real=1/Fake=0")

    # print("Training set distribution: ")
    # print(f'{y_train.value_counts()}')

    # print("Validation set distribution: ")
    # print(f'{y_val.value_counts()}')       

    # print("Test set distribution: ")
    # print(f'{y_test.value_counts()}')     

    pipeline = Pipeline([
        ('fs', SelectFromModel(estimator=GradientBoostingClassifier(random_state=101, learning_rate=0.01, max_features='sqrt', subsample=0.1))), 
        ('classifer', GradientBoostingClassifier(random_state=101, subsample=0.1))
    ])

    if grid_search == True and param_grid is not None:
        X_train, y_train = concat_train_valid(X_train, y_train, X_val, y_val)

        # print("Merged Training set distribution: ")
        # print(f'{y_train.value_counts()}')

        pipeline = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring="f1")

    pipeline.fit(X_train, y_train)

    if grid_search == True and param_grid is not None:
        print(f'Best Model Parameters: {pipeline.best_params_}')
        print(f'Best Model Score: {pipeline.best_score_}')

        pipeline = save_model(pipeline)

    printResults(pipeline, X_train, X_test, X_val, y_train, y_test, y_val, grid_search=grid_search, param_grid=param_grid)
         

In [8]:
param_grid = {
    'fs__threshold': ['mean', 'median', '1.25*mean'],
    'classifer__n_estimators': [50, 100, 200],
    'classifer__learning_rate': [0.001, 0.01, 0.1],
    'classifer__max_depth': [3, 5, 7],
    'classifer__min_samples_split': [5, 10, 20],
    'classifer__min_samples_leaf': [2, 4, 10],
    'classifer__subsample': [0.1, 0.4, 0.5, 0.8],
    'classifer__max_features': [None, 'sqrt']
}

# Hold-Out Method (Feature Selection)

In [9]:
GBClassification(df)

Training Report
              precision    recall  f1-score   support

           0       0.78      0.59      0.67        78
           1       0.67      0.83      0.74        78

    accuracy                           0.71       156
   macro avg       0.72      0.71      0.71       156
weighted avg       0.72      0.71      0.71       156

Training Accuracy 0.7115
Training F1-Score: 0.7072

Validation Report
              precision    recall  f1-score   support

           0       0.67      0.62      0.64        26
           1       0.64      0.69      0.67        26

    accuracy                           0.65        52
   macro avg       0.65      0.65      0.65        52
weighted avg       0.65      0.65      0.65        52

Validation Accuracy 0.5769
Validation F1-Score: 0.5763

Test Report
              precision    recall  f1-score   support

           0       0.65      0.77      0.70        26
           1       0.71      0.58      0.64        26

    accuracy                

# 5-fold CV (Feature Selection)

In [10]:
GBClassification(df, grid_search=True, param_grid=param_grid)

Best Model Parameters: {'classifer__learning_rate': 0.1, 'classifer__max_depth': 3, 'classifer__max_features': 'sqrt', 'classifer__min_samples_leaf': 2, 'classifer__min_samples_split': 10, 'classifer__n_estimators': 50, 'classifer__subsample': 0.5, 'fs__threshold': 'median'}
Best Model Score: 0.6846819546819546
Saved best Gradient-Boosted Tree Classifer to ../models/best_gbc_model.joblib
Training Report
              precision    recall  f1-score   support

           0       0.77      0.84      0.80       104
           1       0.82      0.75      0.78       104

    accuracy                           0.79       208
   macro avg       0.80      0.79      0.79       208
weighted avg       0.80      0.79      0.79       208

Training Accuracy 0.7933
Training F1-Score: 0.7929

Test Report
              precision    recall  f1-score   support

           0       0.66      0.81      0.72        26
           1       0.75      0.58      0.65        26

    accuracy                          