In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import classification_report, cohen_kappa_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import MultinomialNB

from cleantext import clean

In [2]:
# Load the pre-split train / validation / test CSV files from one directory.
split_dir = '~/data/ynacc_proc/replicate/split/'
df_train, df_val, df_test = (
    pd.read_csv(split_dir + split_name + '.csv')
    for split_name in ('train', 'val', 'test')
)

In [3]:
def _normalize(raw):
    """Run one comment through cleantext.clean with the lower, no_line_breaks,
    zero_digits, fix_unicode and to_ascii options switched on."""
    return clean(raw, lower=True, no_line_breaks=True, zero_digits=True, fix_unicode=True, to_ascii=True)


# Apply the same normalisation to the 'text' column of every split.
for frame in (df_train, df_val, df_test):
    frame['text'] = frame['text'].apply(_normalize)

In [7]:
def _tag_position(row):
    """Return the row's text followed by ' .  ' and a thread-position token.

    The token is 'xxtoplevelcomment' when 'parentid' is missing and
    'xxreplycomment' otherwise.
    """
    token = 'xxtoplevelcomment' if pd.isna(row['parentid']) else 'xxreplycomment'
    return row['text'] + ' .  ' + token


# NOTE: this overwrites 'text' with text + token, so running the cell a second
# time appends a second token to every comment.
for frame in (df_train, df_val, df_test):
    frame['text'] = frame.apply(_tag_position, axis=1)

In [4]:
# Bag-of-words vocabulary is learned from the training comments only.
vectorizer = CountVectorizer()
text = df_train['text'].values
# Left as the trailing expression so the fitted estimator is displayed.
vectorizer.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [5]:
# Project every split onto the vocabulary fitted on the training text.
x_train = vectorizer.transform(text)
x_val, x_test = (
    vectorizer.transform(frame['text'].values)
    for frame in (df_val, df_test)
)

In [6]:
import numpy as np
# nb_classes = 6
# data = [[2, 3, 4, 0]]

def indices_to_one_hot(data, nb_classes):
    """Convert an iterable of class indices to one-hot encoded rows.

    Parameters
    ----------
    data : array-like
        Class indices; any nesting is flattened before encoding.
        Integer-valued floats (e.g. ``2.0``) are accepted and treated as
        the corresponding integer index.
    nb_classes : int
        Width of each one-hot row.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(number of indices, nb_classes)``. An empty
        ``data`` yields an array of shape ``(0, nb_classes)``.

    Raises
    ------
    IndexError
        If an index is not a whole number or is out of range for
        ``nb_classes``.
    """
    targets = np.asarray(data).reshape(-1)
    if targets.dtype.kind == 'f':
        # An empty list becomes a float64 array, and label columns that went
        # through NaN handling are float too; NumPy refuses float indices, so
        # convert them once they are confirmed to be whole numbers.
        whole = np.isfinite(targets).all() and (targets == np.floor(targets)).all()
        if not whole:
            raise IndexError('class indices must be whole numbers')
        targets = targets.astype(np.intp)
    return np.eye(nb_classes)[targets]

In [7]:
def _split_scores(y_true, y_pred):
    """Return (micro-F1, macro-F1, Cohen's kappa) for one evaluation split."""
    report = classification_report(y_true=y_true, y_pred=y_pred, output_dict=True)
    # Depending on the scikit-learn release, the report either has a
    # 'micro avg' row or a scalar 'accuracy' entry in its place; for
    # single-label problems scored over all classes the two are the same number.
    if 'micro avg' in report:
        micro_f1 = report['micro avg']['f1-score']
    else:
        micro_f1 = report['accuracy']
    macro_f1 = report['macro avg']['f1-score']
    kappa = cohen_kappa_score(y_pred, y_true)
    return micro_f1, macro_f1, kappa


def run_cl(col, nb=False):
    """Train a classifier for one label column and print a LaTeX table row.

    Reads the module-level ``df_train`` / ``df_val`` / ``df_test`` frames and
    the ``x_train`` / ``x_val`` / ``x_test`` feature matrices.

    Parameters
    ----------
    col : str
        Name of the label column. The first two characters are dropped and
        the remainder title-cased for the row label (the columns used in this
        notebook all start with 'cl').
    nb : bool, default False
        If True, fit ``MultinomialNB``; otherwise fit ``RidgeCV`` on one-hot
        targets and predict the class with the highest regression score.

    Returns
    -------
    None
        The row ``label & micro-F1 & macro-F1 & kappa`` (validation, then
        test) terminated by ``\\\\`` is printed.
    """
    # NaN labels are mapped to 0 by np.nan_to_num.
    y_train = np.nan_to_num(df_train[col].values)
    y_val = np.nan_to_num(df_val[col].values)
    y_test = np.nan_to_num(df_test[col].values)

    if nb:
        clf = MultinomialNB()
        clf.fit(x_train, y_train)
        predict = clf.predict
    else:
        # RidgeCV is a regressor: its raw predictions are continuous and are
        # rejected by the classification metrics. Regress on one-hot targets
        # and decode with argmax so the output is a class label again.
        classes, class_idx = np.unique(y_train, return_inverse=True)
        clf = RidgeCV()
        clf.fit(x_train, np.eye(len(classes))[class_idx])

        def predict(features):
            scores = np.asarray(clf.predict(features))
            # Keep a 2-D (n_samples, n_classes) view even with a single class.
            scores = scores.reshape(scores.shape[0], -1)
            return classes[np.argmax(scores, axis=1)]

    val_scores = _split_scores(y_val, predict(x_val))
    test_scores = _split_scores(y_test, predict(x_test))

    cells = [col[2:].title()]
    cells.extend(str(value) for value in val_scores + test_scores)
    print(' & '.join(cells) + ' \\\\')

In [8]:
# Label columns to evaluate, in the order the table rows are printed.
cls = [
    'claudience',
    'clpersuasive',
    'clagreement',
    'clinformative',
    'clmean',
    'clcontroversial',
    'cldisagreement',
    'cltopic',
    'clsentiment',
]

In [9]:
# Print one table row per label column, using the Naive Bayes classifier.
for cl in cls:
    run_cl(cl, nb=True)

Audience & 0.6861063464837049 & 0.5866257511827132 & 0.20712103984125918 & 0.7197106690777576 & 0.6028200862799976 & 0.21824980619271295 \\
Persuasive & 0.8147512864493998 & 0.5446893439777855 & 0.09783356258596965 & 0.8245931283905967 & 0.6150489791524634 & 0.24018017762794452 \\
Agreement & 0.8850771869639794 & 0.497575406778571 & 0.046944003903867104 & 0.8517179023508138 & 0.504090113735783 & 0.07005455067470556 \\
Informative & 0.8181818181818182 & 0.5588378069674471 & 0.13699586638364425 & 0.8119349005424954 & 0.5342403628117914 & 0.0721177115936884 \\
Mean & 0.8130360205831904 & 0.6403128944434068 & 0.2964316161247108 & 0.8354430379746836 & 0.6510315176311501 & 0.3021840116480621 \\
Controversial & 0.6638078902229846 & 0.6396331617721263 & 0.28632457279904067 & 0.6039783001808319 & 0.6038954030319383 & 0.24514295330877545 \\
Disagreement & 0.6346483704974271 & 0.6265214606021782 & 0.2546770621387543 & 0.6401446654611211 & 0.6380568665822492 & 0.2892627635870443 \\
Topic & 0.68267