# Papers Past Genre Classification
# Notebook 6: Multiclass Classification
---

This notebook trains and evaluates a number of classification models and feature sets using a multiclass classification approach where each genre is encoded with a number. The loaded dataset is the output of Notebook 3: Linguistic Feature Extraction. A dataframe of the metrics for all combinations is returned and exported as a csv file for further analysis.  

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Features
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Classifiers
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Metrics/Evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.metrics import RocCurveDisplay
from scipy import interp
from itertools import cycle

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from datetime import date
from datetime import datetime

In [2]:
filepath = '20220113_PP_3518articles_withfeatures.pkl'
features_df = pd.read_pickle(filepath)

In [3]:
len(features_df)

3518

In [4]:
# View the distribution of the genres
(features_df['genre'].value_counts() / len(features_df)).sort_values(ascending=False)

Report            0.284537
News              0.163729
Notice            0.162308
Opinion           0.138999
Feature           0.048891
LetterToEditor    0.047754
Results           0.041217
FamilyNotice      0.029278
Fiction           0.020182
Advertisement     0.018761
Review            0.018476
Poetry            0.009665
Obituary          0.009096
Speech            0.007106
Name: genre, dtype: float64

In [5]:
# Turn labels into numbers

LE = LabelEncoder()
features_df['label_num'] = LE.fit_transform(features_df['genre'])
display(features_df.groupby(['genre', 'label_num'])['label_num'].count())

genre           label_num
Advertisement   0              66
FamilyNotice    1             103
Feature         2             172
Fiction         3              71
LetterToEditor  4             168
News            5             576
Notice          6             571
Obituary        7              32
Opinion         8             489
Poetry          9              34
Report          10           1001
Results         11            145
Review          12             65
Speech          13             25
Name: label_num, dtype: int64

In [6]:
# Dictionary of feature groups
feature_dict = {'pos_freq_penn': ["nnps_freq", "vb_freq", "nn_freq", "jj_freq", 
                                   "cd_freq", "prp_freq", "rb_freq", "cc_freq", 
                                   "nnp_freq", "vbd_freq", "vbz_freq", "stopword_freq"],
                'pos_freq_univ': ["propn_freq", "verb_freq", "noun_freq", 
                                   "adj_freq", "nums_freq", "pron_freq", 
                                   "stopword_freq"],
                'pos_freq_combo': ["nnps_freq", "vb_freq", "nn_freq", "jj_freq", 
                                    "cd_freq", "prp_freq", "rb_freq", "cc_freq", 
                                    "nnp_freq", "vbd_freq", "vbz_freq", "propn_freq", 
                                    "verb_freq", "noun_freq", "adj_freq", "nums_freq", 
                                    "pron_freq", "stopword_freq"],
                'line_offsets': ["avg_line_offset", "max_line_offset"],
                'line_widths': ["avg_line_width", "min_line_width", 
                                "max_line_width", "line_width_range"],
                'syllable_freq': ["polysyll_freq", "monosyll_freq"],
                'tf_idf': ["tf_idf_sum"],
                'text_stats': ["sentence_count", "word_count", 
                               "avg_word_length", "char_count"],
                'all_features': ["propn_freq", "verb_freq", "noun_freq", 
                                 "adj_freq", "nums_freq", "pron_freq", 
                                 "nnps_freq", "vb_freq", "nn_freq", "jj_freq", 
                                 "cd_freq", "prp_freq", "rb_freq", "cc_freq", 
                                 "nnp_freq", "vbd_freq", "vbz_freq", "stopword_freq", 
                                 "avg_line_offset", "max_line_offset", "avg_line_width", 
                                 "min_line_width", "max_line_width", "line_width_range", 
                                 "polysyll_freq", "monosyll_freq", "sentence_count", "word_count", 
                                 "avg_word_length", "char_count", "tf_idf_sum"],
                'all_features_excl_penn': ["propn_freq", "verb_freq", 
                                           "noun_freq", "adj_freq", 
                                           "nums_freq", "pron_freq", 
                                           "stopword_freq", "avg_line_offset", 
                                           "max_line_offset", "avg_line_width", 
                                           "min_line_width", "max_line_width", 
                                           "line_width_range", "polysyll_freq", 
                                           "monosyll_freq", "sentence_count", 
                                           "word_count", "avg_word_length", 
                                           "char_count", "tf_idf_sum"],
                'all_features_excl_univ': ["nnps_freq", "vb_freq", "nn_freq", 
                                           "jj_freq", "cd_freq", "prp_freq", 
                                           "rb_freq", "cc_freq", "nnp_freq", 
                                           "vbd_freq", "vbz_freq", "stopword_freq", 
                                           "avg_line_offset", "max_line_offset", 
                                           "avg_line_width", "min_line_width", 
                                           "max_line_width", "line_width_range", 
                                           "polysyll_freq", "monosyll_freq", 
                                           "sentence_count", "word_count", 
                                           "avg_word_length", "char_count", "tf_idf_sum"],
                'all_features_excl_tfidf': ["nnps_freq", "vb_freq", "nn_freq", 
                                            "jj_freq", "cd_freq", "prp_freq", 
                                            "rb_freq", "cc_freq", "nnp_freq", 
                                            "vbd_freq", "vbz_freq", "stopword_freq", 
                                            "avg_line_offset", "max_line_offset", 
                                            "avg_line_width", "min_line_width", "max_line_width", 
                                            "line_width_range", "polysyll_freq", "monosyll_freq", 
                                            "sentence_count", "word_count", "avg_word_length", 
                                            "char_count"]
               }

In [7]:
# Dictionary of models

model_dict = {'Dummy' : DummyClassifier(random_state=3),
              'Stochastic Gradient Descent' : SGDClassifier(random_state=3, loss='log'),
              'Random Forest': RandomForestClassifier(random_state=3),
              'Decision Tree': DecisionTreeClassifier(random_state=3),
              'AdaBoost': AdaBoostClassifier(random_state=3),
              'GradientBoosting': GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=3),
              'Gaussian Naive Bayes': GaussianNB(),
              'K Nearest Neighbor': KNeighborsClassifier(), 
              'Support Vector Machine': SVC(kernel='rbf', random_state=3)}

In [8]:
# Preliminary model evaluation using default parameters
# Code reference: https://medium.com/@robert.salgado/multiclass-text-classification-from-start-to-finish-f616a8642538

# Function to get the scores for each model in a df
def model_score_df(model_dict, X_train, X_test, y_train, y_test, feature_set):
    """
    Given a dictionary of scikit learn models, 
    return a dataframe of metrics ranked by 
    best F1 score.
    """
    feature_set_list = []
    model_name = []
    ac_score_list = []
    p_score_list = []
    r_score_list = []
    f1_score_list = []
    
    for k,v in model_dict.items():   
        feature_set_list.append(feature_set)
        model_name.append(k)
        pipe = make_pipeline(StandardScaler(), v) # data is scaled
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='weighted')) # Setting average = weighted to account for label imbalance
        r_score_list.append(recall_score(y_test, y_pred, average = 'weighted'))
        f1_score_list.append(f1_score(y_test, y_pred, average = 'weighted'))
        model_comparison_df = pd.DataFrame([feature_set_list, 
                                            model_name, 
                                            ac_score_list, 
                                            p_score_list, 
                                            r_score_list, 
                                            f1_score_list]).T
        model_comparison_df.columns = ['feature_set',
                                       'model_name', 
                                       'accuracy_score', 
                                       'precision_score', 
                                       'recall_score', 
                                       'f1_score']
        model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
    
    return model_comparison_df

In [9]:
def genres_multiclass(df, feature_dict, model_dict):
    """
    Given a list of genres and dictionaries
    of features and models, train and test multiclass classification
    models for each feature set and 
    return the dataframe of metrics.
    """
    dataframes = []
    
    # Append "label_num" to each feature dictionary
    for k, v in feature_dict.items():
        v.append("label_num")
      
        model_df = df.filter(v, axis=1)

        # Extract the explanatory variables in X and the target variable in y
        y = model_df.label_num.copy()
        X = model_df.drop(["label_num"], axis=1)

        #Train test split with stratified sampling for evaluation
        X_train, X_test, y_train, y_test = train_test_split(X, 
                                                            y, 
                                                            test_size = .3, 
                                                            shuffle = True, 
                                                            stratify = y, 
                                                            random_state = 3)
        
        model_results = model_score_df(model_dict, X_train, X_test, y_train, y_test, k) 
        dataframes.append(model_results)

        del y, X, model_df, X_train, X_test, y_train, y_test  # Clear the dataframes from memory before next loop
    
    results_df = pd.concat(dataframes)
    
    return results_df
        

In [10]:
results_df = genres_multiclass(features_df, feature_dict, model_dict)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
results_df = results_df.sort_values(by='f1_score', ascending=False)
pd.set_option('display.max_rows', None)
display(results_df)

Unnamed: 0,feature_set,model_name,accuracy_score,precision_score,recall_score,f1_score
8,all_features,Support Vector Machine,0.657197,0.64861,0.657197,0.631282
2,all_features,Random Forest,0.647727,0.645352,0.647727,0.616029
8,all_features_excl_tfidf,Support Vector Machine,0.644886,0.639994,0.644886,0.614483
2,all_features_excl_univ,Random Forest,0.642045,0.619666,0.642045,0.609163
8,all_features_excl_univ,Support Vector Machine,0.641098,0.627716,0.641098,0.60872
2,all_features_excl_tfidf,Random Forest,0.636364,0.627656,0.636364,0.606634
8,pos_freq_combo,Support Vector Machine,0.621212,0.606818,0.621212,0.586599
2,pos_freq_combo,Random Forest,0.607955,0.588445,0.607955,0.581277
8,pos_freq_penn,Support Vector Machine,0.611742,0.599626,0.611742,0.580534
8,all_features_excl_penn,Support Vector Machine,0.608902,0.586388,0.608902,0.578717


In [12]:
# Export the dataframe of results to a CSV file 
results_df.to_csv("20220126_PP_3518articles_metrics_multiclass.csv")