# Papers Past Genre Classification
# Notebook 5B: Binary Classification (excluding TF-IDF)
---

This notebook trains and evaluates a number of classification models and feature sets using a binary classification approach where the target genre is given the outcome '1' and all other genres '0'. Each combination of genre and feature set is evaluated. 

The loaded dataset is the output of Notebook 3B: Linguistic Feature Extraction (excluding TF-IDF). A dataframe of the metrics for all combinations is returned and exported as a csv file for further analysis.  


In [1]:
import pandas as pd
import numpy as np
import pickle

from collections import Counter

# Features
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Classifiers
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Metrics/Evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from scipy import interp
from itertools import cycle

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from datetime import date
from datetime import datetime

In [2]:
filepath = '20220219_PP_3518articles_features_exclTFIDF.pkl'
features_df = pd.read_pickle(filepath)

In [3]:
len(features_df)

3518

In [4]:
# View the distribution of the genres
(features_df['genre'].value_counts() / len(features_df)).sort_values(ascending=False)

Report            0.284537
News              0.163729
Notice            0.162308
Opinion           0.138999
Feature           0.048891
LetterToEditor    0.047754
Results           0.041217
FamilyNotice      0.029278
Fiction           0.020182
Advertisement     0.018761
Review            0.018476
Poetry            0.009665
Obituary          0.009096
Speech            0.007106
Name: genre, dtype: float64

In [5]:
# List of genres
genre_list = ["Report", 
              "News", 
              "Notice", 
              "Opinion", 
              "Feature", 
              "LetterToEditor", 
              "Results", 
              "FamilyNotice", 
              "Advertisement", 
              "Fiction", 
              "Review", 
              "Obituary", 
              "Poetry", 
              "Speech"]

# Dictionary of feature groups
feature_dict = {'pos_freq_penn': ["nnps_freq", "vb_freq", "nn_freq", 
                                   "jj_freq", "cd_freq", "prp_freq", 
                                   "rb_freq", "cc_freq", "nnp_freq", 
                                   "vbd_freq", "vbz_freq", "stopword_freq"],
                'pos_freq_univ': ["propn_freq", "verb_freq", "noun_freq", 
                                   "adj_freq", "nums_freq", "pron_freq", 
                                   "stopword_freq"],
                'pos_freq_combo': ["nnps_freq", "vb_freq", "nn_freq", 
                                    "jj_freq", "cd_freq", "prp_freq", 
                                    "rb_freq", "cc_freq", "nnp_freq", 
                                    "vbd_freq", "vbz_freq", "propn_freq", 
                                    "verb_freq", "noun_freq", "adj_freq", 
                                    "nums_freq", "pron_freq", "stopword_freq"],
                'line_offsets': ["avg_line_offset", "max_line_offset"],
                'line_widths': ["avg_line_width", "min_line_width", 
                                "max_line_width", "line_width_range"],
                'syllable_freq': ["polysyll_freq", "monosyll_freq"],
                'text_stats': ["sentence_count", "word_count", 
                               "avg_word_length", "char_count"],
                'all_features': ["propn_freq", "verb_freq", "noun_freq", 
                                 "adj_freq", "nums_freq", "pron_freq", 
                                 "nnps_freq", "vb_freq", "nn_freq", 
                                 "jj_freq", "cd_freq", "prp_freq", "rb_freq", 
                                 "cc_freq", "nnp_freq", "vbd_freq", "vbz_freq", 
                                 "stopword_freq", "avg_line_offset", "max_line_offset", 
                                 "avg_line_width", "min_line_width", "max_line_width", 
                                 "line_width_range", "polysyll_freq", "monosyll_freq", 
                                 "sentence_count", "word_count", "avg_word_length", 
                                 "char_count"],
                'all_features_excl_penn': ["propn_freq", "verb_freq", "noun_freq", 
                                           "adj_freq", "nums_freq", "pron_freq", 
                                           "stopword_freq", "avg_line_offset", 
                                           "max_line_offset", "avg_line_width", 
                                           "min_line_width", "max_line_width", 
                                           "line_width_range", "polysyll_freq", 
                                           "monosyll_freq", "sentence_count", "word_count", 
                                           "avg_word_length", "char_count"],
                'all_features_excl_univ': ["nnps_freq", "vb_freq", "nn_freq", 
                                           "jj_freq", "cd_freq", "prp_freq", 
                                           "rb_freq", "cc_freq", "nnp_freq", 
                                           "vbd_freq", "vbz_freq", "stopword_freq", 
                                           "avg_line_offset", "max_line_offset", 
                                           "avg_line_width", "min_line_width", 
                                           "max_line_width", "line_width_range", 
                                           "polysyll_freq", "monosyll_freq", "sentence_count", 
                                           "word_count", "avg_word_length", "char_count"]
               }

In [6]:
# Dictionary of models

model_dict = {'Dummy' : DummyClassifier(random_state=3),
              'Stochastic Gradient Descent' : SGDClassifier(random_state=3, loss='log', class_weight="balanced"),
              'Random Forest': RandomForestClassifier(class_weight="balanced", random_state=3),
              'Decision Tree': DecisionTreeClassifier(class_weight="balanced", random_state=3),
              'AdaBoost': AdaBoostClassifier(random_state=3),
              'GradientBoosting': GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=3),
              'Gaussian Naive Bayes': GaussianNB(),
              'K Nearest Neighbor': KNeighborsClassifier(), 
              'Logistic Regression': LogisticRegression(class_weight="balanced", max_iter=500),
              'Support Vector Machine': SVC(kernel='rbf', class_weight="balanced", random_state=3)}

In [7]:
# Preliminary model evaluation using default parameters
# Code reference: https://medium.com/@robert.salgado/multiclass-text-classification-from-start-to-finish-f616a8642538

# Function to get the scores for each model in a df
def model_score_df(model_dict, X_train, X_test, y_train, y_test, genre, feature_set):
    """
    Given a dictionary of scikit learn models, 
    return and a dataframe of metrics ranked by 
    best AUROC score. 
    
    Option to print an AUROC chart for each model, feature set,
    and genre combination if required.
    """
    genre_list = []
    feature_set_list = []
    model_name = [] 
    ac_score_list = []
    p_score_list = []
    r_score_list = []
    f1_score_list = []
    auroc_score_list = []
    
    for k,v in model_dict.items():   
        genre_list.append(genre)
        feature_set_list.append(feature_set)
        model_name.append(k)
        pipe = make_pipeline(StandardScaler(), v) # data is scaled
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average = 'binary'))
        r_score_list.append(recall_score(y_test, y_pred, average = 'binary'))
        f1_score_list.append(f1_score(y_test, y_pred, average = 'binary'))
        auroc_score_list.append(roc_auc_score(y_test, y_pred))
        model_comparison_df = pd.DataFrame([genre_list, 
                                            feature_set_list, 
                                            model_name, 
                                            ac_score_list, 
                                            p_score_list, 
                                            r_score_list, 
                                            f1_score_list, 
                                            auroc_score_list]).T
        model_comparison_df.columns = ['genre', 
                                       'feature_set', 
                                       'model_name', 
                                       'accuracy_score', 
                                       'precision_score', 
                                       'recall_score', 
                                       'f1_score', 
                                       'auroc_score']
        model_comparison_df = model_comparison_df.sort_values(by='auroc_score', ascending=False)
#         RocCurveDisplay.from_predictions(y_test, y_pred)
#         plt.title(f"AUROC: {k}")
#         plt.show()
    
    return model_comparison_df

In [8]:
def genres_binary(df, genre_list, feature_dict, model_dict):
    """
    Given a list of genres and dictionaries
    of features and models, train and test binary classification
    models for each combination of genre and features and 
    return the dataframe of results.
    """
    dataframes = []
    
    # Append "binary_class" to each feature dictionary
    for k, v in feature_dict.items():
        v.append("binary_class")
    
    for genre in genre_list:
        temp_df = df  # Resets the dataframe for each run of the loop
        temp_df['binary_class'] = np.where(temp_df['genre']== genre, 1, 0)
        
        for k, v in feature_dict.items():
            model_df = temp_df.filter(v, axis=1)
            
            # Extract the explanatory variables in X and the target variable in y
            y = model_df.binary_class.copy()
            X = model_df.drop(["binary_class"], axis=1)
            
            #Train test split with stratified sampling for evaluation
            X_train, X_test, y_train, y_test = train_test_split(X, 
                                                                y, 
                                                                test_size = .3, 
                                                                shuffle = True, 
                                                                stratify = y, 
                                                                random_state = 3)
            
            model_results = model_score_df(model_dict, X_train, X_test, y_train, y_test, genre, k) 
            dataframes.append(model_results)
            # print(model_results)
        
        del temp_df, y, X, model_df, X_train, X_test, y_train, y_test  # Clear the dataframes from memory before next loop
        
    results_df = pd.concat(dataframes)
    
    return results_df

In [9]:
results_df = genres_binary(features_df, genre_list, feature_dict, model_dict)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [10]:
results_df = results_df.sort_values(by='auroc_score', ascending=False)
pd.set_option('display.max_rows', None)
display(results_df)

Unnamed: 0,genre,feature_set,model_name,accuracy_score,precision_score,recall_score,f1_score,auroc_score
8,Poetry,pos_freq_penn,Logistic Regression,0.960227,0.192308,1.0,0.322581,0.979924
8,Poetry,all_features_excl_penn,Logistic Regression,0.989583,0.473684,0.9,0.62069,0.94522
8,Fiction,all_features,Logistic Regression,0.982955,0.542857,0.904762,0.678571,0.944651
8,Fiction,all_features_excl_penn,Logistic Regression,0.979167,0.487179,0.904762,0.633333,0.942719
8,Fiction,all_features_excl_univ,Logistic Regression,0.97822,0.475,0.904762,0.622951,0.942236
6,Fiction,all_features_excl_penn,Gaussian Naive Bayes,0.969697,0.387755,0.904762,0.542857,0.937888
8,Poetry,pos_freq_univ,Logistic Regression,0.96875,0.219512,0.9,0.352941,0.934704
9,Fiction,pos_freq_univ,Support Vector Machine,0.950758,0.275362,0.904762,0.422222,0.928226
9,Fiction,all_features,Support Vector Machine,0.989583,0.692308,0.857143,0.765957,0.924707
9,Fiction,all_features_excl_penn,Support Vector Machine,0.987689,0.642857,0.857143,0.734694,0.923741


In [11]:
# Export the dataframe of results to a CSV file 
results_df.to_csv("20220219_PP_3518articles_binarymetrics_exclTFIDF.csv")