In [1]:
%matplotlib inline

# Import for n gram
# Import statements
import pandas as pd
import numpy as np
import csv
import pdb
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for cool plotting
import re # for regular expression
import nltk # natural language processing toolkit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix, classification_report
from langdetect import detect, detect_langs # for language detection
# from tqdm.notebook import tqdm, trange
from tqdm import tqdm
import time
import math
from pathlib import Path
import pickle
from collections import Counter
import spacy
import argparse
from datetime import datetime
import torch

In [2]:
# Seeds for reproducibility
SEED = 7
torch.manual_seed(SEED)
np.random.seed(SEED)

In [3]:
# Load preprocessed datasets
# Temporary fix for case analysis....
train_df = pd.read_csv('../data/bnc/ca_splits/bnc_rb_ca_trainset_case_analysis.csv', encoding="utf-8")
val_df = pd.read_csv('../data/bnc/ca_splits/bnc_rb_ca_valset_case_analysis.csv', encoding="utf-8")
test_df = pd.read_csv('../data/bnc/ca_splits/bnc_rb_ca_testset_case_analysis.csv', encoding="utf-8")

# reset indices of subsets
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

# rename column
train_df.rename(columns={'label': 'age_cat','age_cat' : 'label'}, inplace=True)
val_df.rename(columns={'label': 'age_cat','age_cat' : 'label'}, inplace=True)
test_df.rename(columns={'label': 'age_cat','age_cat' : 'label'}, inplace=True)

In [4]:
# Add preprocessing steps to produce column with clean_text including stopwords

df_list = [train_df, val_df, test_df]

for temp_df in df_list:

    temp_df['clean_text_ws'] = temp_df['text'] # uncomment this and comment line above if you want incl. non-alph chars

    # make all letters lowercase
    temp_df['clean_text_ws'] = temp_df['clean_text_ws'].apply(lambda x: x.lower())

    # remove whitespaces from beginning or ending
    temp_df['clean_text_ws'] = temp_df['clean_text_ws'].apply(lambda x: x.strip())

    print(f"Number of empty instanes: {len(temp_df[temp_df.clean_text_ws == ''])}")

Number of empty instanes: 0
Number of empty instanes: 0
Number of empty instanes: 0


In [5]:
# Evaluate performance
def print_evaluation_scores(labels, preds):
    print(f"Accuracy: {accuracy_score(labels, preds)}")
    print(f"F1 score: {f1_score(labels, preds, average = None)}") # outputs F1 per class
    print(f"Average precision: {average_precision_score(labels, preds, average = 'micro')}")
    print(f"Average recall: {recall_score(labels, preds, average = 'micro')}")
    print(classification_report(labels, preds, digits=5, zero_division=0))
    # print(f"Confusion Matrix: {confusion_matrix(labels.argmax(axis=1), preds.argmax(axis=1))}")
    
def print_top_n(vectorizer, clf, class_labels, n_feat = 10):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    
    for i, class_label in enumerate(class_labels):
#         pdb.set_trace()
        topn = np.argsort(clf.estimators_[i].coef_)[0][-n_feat:]
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in topn)))

def most_informative_feature_for_class(vectorizer, classifier, class_labels, n=10):
    #labelid = list(classifier.classes_).index(classlabel)
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        topn = sorted(zip(classifier.estimators_[i].coef_[0], feature_names))[-n:]
        
        for coef, feat in topn:
            print(class_label, feat, coef)
            
# class_labels_list = ['13-17', '23-27', '33-47']

In [6]:
# list1 = [21,2,3,4,5]
# np.argsort(list1)

In [7]:
def show_most_informative_features(vectorizer, clf, n=20):
    '''For binary case'''
    # column names and instantiation of subset output dataframe
    keys = ['19-29 n-gram', '19-29 coef.', '50-plus n-gram', "50-plus coef."]
    df = pd.DataFrame(columns=keys)
    
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
        df.loc[0 if pd.isnull(df.index.max()) else df.index.max() + 1] = [fn_1] + [coef_1] + [fn_2] + [coef_2]
        
#     # Save dataframe to csv
#     df.to_csv(
#         f'mi_ngrams/bnc_trigram_{n}_mi_ngrams.csv',
#         index=False
#     )

In [8]:
def train_test_ngram(train_df, test_df, n_grams=[3], seeds=[SEED], dataset='bnc_rb'):

    overall_start_time = time.time()

    # results dict
    accs_all = {}
    if dataset == 'blog':
        class_labels_list = ['13-17', '23-27', '33-47']
    elif dataset == 'bnc' or dataset == 'bnc_rb':
        class_labels_list = ['19_29', '50_plus']

    test_accs = {}
    test_f1s = {}

    for n_gram in n_grams:
        test_accs[n_gram] = {}
        test_f1s[n_gram] = {}
        # for class_label in class_labels_list:
        #     test_f1s[n_gram][class_label] = {}

    print("Starting training and testing loops...")
    for seed in tqdm(seeds, desc = "Seed loop."):

        for n in tqdm(n_grams, desc = "n gram loop."):

            # Split data into features/ X and labels / Y
            # X = data['clean_data']
            # Y = data['labels']

            # n-gram model
            vectorizer = CountVectorizer(binary = True, ngram_range = (1, n))

            # fit vectorization model
            concat_df = pd.concat([train_df, test_df])
            X = vectorizer.fit_transform(concat_df['clean_text_ws']) # remove "_ws" to do analysis without stopwords

            X_train = X[:-len(test_df)]
            X_test = X[-len(test_df):]

            # Binarize the labels for prediction
            if dataset == 'blog':
                # binarizer = MultiLabelBinarizer(classes = sorted(label_counts.keys()))
                binarizer = LabelBinarizer()
            elif dataset == 'bnc_rb' or dataset == 'bnc':
                binarizer = LabelBinarizer()

            # Y = binarizer.fit_transform(train_df.age_cat)
            Y_train = train_df.label
            Y_test = test_df.label

            # label_counts.keys()

            # Split data into train and test sets
            # X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size)

            # Fit logistic regression model
            start_time = time.time()
            model = LogisticRegression(solver = 'lbfgs', multi_class='ovr', max_iter = 1000000)
            model = OneVsRestClassifier(model)
            # model = MultiOutputClassifier(model)
            model.fit(X_train, Y_train)
            print(f"Fitting model took {time.time() - start_time} seconds.")

            # make predictions on test set
            Y_pred = model.predict(X_test)

            # Y_pred_inversed = binarizer.inverse_transform(Y_pred)
            # Y_test_inversed = binarizer.inverse_transform(Y_test)

            print("=" * 81)

            print(f"n = {n}")
            print(f"seed = {seed}")
            print_evaluation_scores(Y_test, Y_pred)

            test_accs[n][seed] = accuracy_score(Y_test, Y_pred)
            test_f1s[n][seed] = f1_score(Y_test, Y_pred, average=None)

            # for label_idx in range(len(class_labels_list)):
            #     test_f1s[n][class_labels_list[label_idx]][seed] = f1_score(Y_test, Y_pred, average=None)[label_idx]

            if n in accs_all:
                accs_all[n].append(accuracy_score(Y_test, Y_pred))
            else:
                accs_all[n] = [accuracy_score(Y_test, Y_pred)]

            # Print most informative features
            # if n == 1:
            #     print("Most informative features per age-group.")
            #     print_top_n_thresh(vectorizer = vectorizer, clf = model,
            #                 class_labels = class_labels_list, n_feat = 20)
            
            # Print most informative features
            print("Most informative features per age-group.")
#             print_top_n(vectorizer = vectorizer, clf = model, class_labels = class_labels_list, n_feat = 5)
            show_most_informative_features(vectorizer=vectorizer, clf=model, n=100)

            print("-" * 81)
    #         print("Some failure cases.")
    # #         predictions = model.predict(inputs)
    #         for i, (x, pred, label) in enumerate(zip(X_test, Y_pred, Y_test)):
    #             if (pred != label).any():
    #                 print(f"pred: {pred}")
    #                 print(f"label: {label}")
    #                 pred_cat = binarizer.classes_[np.where(pred == 1)[0][0]]
    #                 label_cat = binarizer.classes_[np.where(label == 1)[0][0]]
    #                 print(data['clean_data'][i], 'has been classified as ', pred_cat, 'and should be ', label_cat)

            print("=" * 81)


            int_labels = [label for label in range(len(class_labels_list))]
            cm = confusion_matrix(Y_test, Y_pred, labels=int_labels)
            # make_confusion_matrix(cf=cm, categories=class_labels_list, title=f'Confusion Matrix for {dataset} on Test set',
            #                       num_labels=int_labels, y_true=Y_test, y_pred=Y_pred, figsize=FIGSIZE)
            cur_datetime = datetime.now().strftime('%d_%b_%Y_%H_%M_%S')
            # plt.savefig(f"{FIGDIR}{dataset}/cm_{n}_gram_{dataset}_dt_{cur_datetime}.png",
            #             bbox_inches='tight')

    #         most_informative_feature_for_class(vectorizer = vectorizer, classifier = model, class_labels = class_labels_list, n=10)

#             df = pd.read_csv('bnc_rb_10p_testset_case_analysis.csv')

#             df.insert(len(df.columns), 'trigram_pred', Y_pred)

#             # Save dataframe to csv
#             df.to_csv(
#                 'bnc_rb_10p_testset_case_analysis.csv',
#                 index=False
#             )


    # print average metrics
    print(89*'-')
    print(89 * '-')
    print("PRINTING AVERAGE METRICS")
    for n_gram in n_grams:
        n_gram_accs = []
        n_gram_f1s = []
        for seed in seeds:
            n_gram_accs.append(test_accs[n_gram][seed])
            n_gram_f1s.append(test_f1s[n_gram][seed])

        print(f"| n = {n_gram} | Average accuracy = {np.mean(n_gram_accs)} | Acc std = {np.std(n_gram_accs)} "
              f"| Average f1s = {np.mean(n_gram_f1s, axis=0)} | F1s std = {np.std(n_gram_f1s, axis=0)} |")

    overall_end_time = time.time()


    print(f"Done with everything. Took {overall_end_time - overall_start_time} seconds.")


In [9]:
concat_train_df = pd.concat([train_df, val_df])
train_test_ngram(train_df=concat_train_df, test_df=test_df)

Seed loop.:   0%|          | 0/1 [00:00<?, ?it/s]
n gram loop.:   0%|          | 0/1 [00:00<?, ?it/s][A

Starting training and testing loops...
Fitting model took 31.423712968826294 seconds.
n = 3
seed = 7
Accuracy: 0.7167923657072495
F1 score: [0.72380666 0.70941251]
Average precision: 0.6570194087667262
Average recall: 0.7167923657072495
              precision    recall  f1-score   support

           0    0.70745   0.74093   0.72381      3254
           1    0.72710   0.69257   0.70941      3243

    accuracy                        0.71679      6497
   macro avg    0.71727   0.71675   0.71661      6497
weighted avg    0.71726   0.71679   0.71662      6497

Most informative features per age-group.




	-3.2079	um             		2.3738	yes            
	-2.8439	cool           		2.1240	you know       
	-2.5770	shit           		2.0938	wonderful      
	-2.1180	hmm            		1.9017	how weird      
	-2.0908	like           		1.8413	chinese        
	-2.0221	was like       		1.7360	right          
	-1.9588	love           		1.7095	building       
	-1.9582	as well        		1.6576	right right    
	-1.8822	as in          		1.5475	so erm         
	-1.8353	cute           		1.4347	mm mm          
	-1.8207	uni            		1.4079	cheers         
	-1.7928	massive        		1.3854	shed           
	-1.7898	wanna          		1.3696	pain           
	-1.7815	fuck           		1.3601	we know        
	-1.7219	tut            		1.3511	laugh yeah exactly
	-1.6155	fucking        		1.3471	ordinary       
	-1.5800	but yeah       		1.3194	mother         
	-1.5650	mhm            		1.3187	operation      
	-1.5577	basically      		1.3088	garden         
	-1.5320	jesus          		1.2988	ll be nice     
	-1.5269	grand   


n gram loop.: 100%|██████████| 1/1 [00:39<00:00, 39.43s/it][A
Seed loop.: 100%|██████████| 1/1 [00:39<00:00, 39.43s/it]

---------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
PRINTING AVERAGE METRICS
| n = 3 | Average accuracy = 0.7167923657072495 | Acc std = 0.0 | Average f1s = [0.72380666 0.70941251] | F1s std = [0. 0.] |
Done with everything. Took 39.438562870025635 seconds.





In [None]:
SEED

In [None]:
# def print_top_n(vectorizer, clf, class_labels, n_feat = 10):
#     """Prints features with the highest coefficient values, per class"""
#     feature_names = vectorizer.get_feature_names()
    
#     for i, class_label in enumerate(class_labels):
#         pdb.set_trace()
#         topn = np.argsort(clf.estimators_[i].coef_)[0][-n_feat:]
#         print("%s: %s" % (class_label,
#               " ".join(feature_names[j] for j in topn)))

In [None]:
train_df.clean_text_ws

In [None]:
from nltk.corpus import stopwords
stopwords_dict = set(stopwords.words('english')) # use set (hash table) data structure for faster lookup

In [None]:
stopwords_dict