In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import classification_report, cohen_kappa_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import MultinomialNB

from cleantext import clean

In [2]:
# Read the pre-split train / validation / test CSV files, in that order.
df_train, df_val, df_test = [
    pd.read_csv(csv_path)
    for csv_path in (
        '~/data/ynacc_proc/replicate/split/train.csv',
        '~/data/ynacc_proc/replicate/split/val.csv',
        '~/data/ynacc_proc/replicate/split/test.csv',
    )
]

In [3]:
# Run every split's `text` column through clean-text with one shared set of
# options, so train / val / test are normalised identically.
for split_frame in (df_train, df_val, df_test):
    split_frame['text'] = split_frame['text'].apply(
        lambda raw: clean(
            raw,
            lower=True,
            no_line_breaks=True,
            zero_digits=True,
            fix_unicode=True,
            to_ascii=True,
        )
    )
del split_frame  # do not leave the loop variable behind in the notebook namespace

In [44]:
def _with_position_marker(frame):
    """Return ``frame['text']`` with a thread-position pseudo-token appended.

    Rows whose ``parentid`` is missing get ``' .  xxtoplevelcomment'``; all
    other rows get ``' .  xxreplycomment'``.

    The operation is idempotent: text that already ends with one of the two
    markers is returned unchanged, so re-executing this cell (or running it
    out of order) cannot stack several markers onto the same comment.
    """
    text = frame['text']
    marker = frame['parentid'].isna().map(
        {True: 'xxtoplevelcomment', False: 'xxreplycomment'}
    )
    already_marked = (
        text.str.endswith('xxtoplevelcomment', na=False)
        | text.str.endswith('xxreplycomment', na=False)
    )
    # Vectorised string concatenation instead of a row-wise DataFrame.apply.
    return text.where(already_marked, text + ' .  ' + marker)


df_train['text'] = _with_position_marker(df_train)
df_val['text'] = _with_position_marker(df_val)
df_test['text'] = _with_position_marker(df_test)

In [4]:
# Bag-of-words vocabulary, learned from the training split only.
vectorizer = CountVectorizer()
text = df_train['text'].values  # training documents; reused by the transform cell
vectorizer.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [5]:
# Project each split onto the vocabulary fitted above; order is train, val, test.
x_train, x_val, x_test = (
    vectorizer.transform(documents)
    for documents in (text, df_val['text'].values, df_test['text'].values)
)

In [6]:
import numpy as np
# nb_classes = 6
# data = [[2, 3, 4, 0]]

def indices_to_one_hot(data, nb_classes):
    """Convert an iterable of class indices to one-hot encoded rows.

    Parameters
    ----------
    data : array-like
        Class indices; any shape, flattened before encoding. Float inputs are
        accepted as long as every value is a finite whole number (e.g. the
        result of ``np.nan_to_num`` on a label column), and an empty input
        yields an empty result.
    nb_classes : int
        Width of each one-hot row.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(flattened data), nb_classes)``.

    Raises
    ------
    ValueError
        If ``data`` holds floats that are not finite whole numbers.
    IndexError
        If an index is outside the range accepted by NumPy indexing for an
        axis of length ``nb_classes``.
    """
    targets = np.asarray(data).reshape(-1)
    if np.issubdtype(targets.dtype, np.floating):
        # NumPy refuses float arrays as indices (and an empty list becomes a
        # float64 array), so cast explicitly after checking nothing is lost.
        if not np.all(np.isfinite(targets)):
            raise ValueError('class indices must be finite')
        as_int = targets.astype(np.intp)
        if not np.array_equal(as_int, targets):
            raise ValueError('class indices must be whole numbers')
        targets = as_int
    return np.eye(nb_classes)[targets]

In [7]:
def _micro_f1(report):
    """Return the micro-averaged F1 from a ``classification_report`` dict.

    Some scikit-learn releases store it under ``'micro avg'``; others replace
    that row with a scalar ``'accuracy'`` entry, which is the same number for
    a single-label problem where every class is reported.
    """
    if 'micro avg' in report:
        return report['micro avg']['f1-score']
    return report['accuracy']


def _score_split(clf, features, y_true, nb):
    """Predict class indices for one split; return ``(report_dict, kappa)``."""
    if nb:
        # MultinomialNB was fitted on class indices and predicts them directly.
        y_pred = clf.predict(features)
    else:
        # RidgeCV regresses one score per class column; take the highest.
        y_pred = np.argmax(clf.predict(features), axis=1)
    report = classification_report(y_pred=y_pred, y_true=y_true, output_dict=True)
    kappa = cohen_kappa_score(y_pred, y_true)
    return report, kappa


def run_cl(col, nb=False):
    """Train a bag-of-words baseline for one label column and print its scores.

    Reads the labels from ``df_train`` / ``df_val`` / ``df_test`` and the
    features from the notebook-level ``x_train`` / ``x_val`` / ``x_test``.

    Parameters
    ----------
    col : str
        Name of the label column. The first two characters are dropped and the
        rest title-cased to form the row label in the printed output.
    nb : bool, default False
        If True use ``MultinomialNB``; otherwise ``RidgeCV`` fitted on one-hot
        targets with the arg-max column taken as the prediction.

    Prints one LaTeX table row: label, then micro F1, macro F1 and Cohen's
    kappa for the validation split, then the same three for the test split.
    Returns None.
    """
    # Missing labels are replaced by 0 before encoding.
    y_train = np.nan_to_num(df_train[col].values)
    y_val = np.nan_to_num(df_val[col].values)
    y_test = np.nan_to_num(df_test[col].values)

    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(y_train.reshape(-1, 1))

    y_train = enc.transform(y_train.reshape(-1, 1)).toarray()
    # NOTE(review): with handle_unknown='ignore', a val/test label never seen
    # in training encodes to an all-zero row, whose argmax is 0 -- i.e. it is
    # scored as the first class. Confirm this is intended.
    y_val = np.argmax(enc.transform(y_val.reshape(-1, 1)).toarray(), axis=1)
    y_test = np.argmax(enc.transform(y_test.reshape(-1, 1)).toarray(), axis=1)

    if nb:
        # MultinomialNB needs a 1-D target, so fit on class indices rather
        # than on the one-hot matrix.
        clf = MultinomialNB()
        clf.fit(x_train, np.argmax(y_train, axis=1))
    else:
        clf = RidgeCV()
        clf.fit(x_train, y_train)

    dct, kappa = _score_split(clf, x_val, y_val, nb)
    dct_test, kappa_test = _score_split(clf, x_test, y_test, nb)

    print(' & '.join([col[2:].title(), str(_micro_f1(dct)), str(dct['macro avg']['f1-score']), str(kappa), str(_micro_f1(dct_test)), str(dct_test['macro avg']['f1-score']), str(kappa_test)]) + ' \\\\')

In [10]:
# Label columns to evaluate, one baseline classifier per column.
cls = [
    'claudience',
    'clpersuasive',
    'clagreement',
    'clinformative',
    'clmean',
    'clcontroversial',
    'cldisagreement',
    'cltopic',
    'clsentiment',
]

In [11]:
# Evaluate the default (RidgeCV) baseline on every label column; each call
# prints one table row.
for cl in cls:
    run_cl(cl, nb=False)

Audience & 0.6981132075471698 & 0.5942166540116427 & 0.22810501767847735 & 0.7432188065099458 & 0.6351157949518605 & 0.2822383093853973 \\
Persuasive & 0.8559176672384219 & 0.6458694897604998 & 0.2983149931224208 & 0.840867992766727 & 0.6487063987063988 & 0.30709648023692904 \\
Agreement & 0.8799313893653516 & 0.4949757449757449 & 0.035817228181259875 & 0.8372513562386981 & 0.47681019258262547 & 0.010576120233788067 \\
Informative & 0.8181818181818182 & 0.5284035409035409 & 0.08715176223817556 & 0.8318264014466547 & 0.5106893106893107 & 0.03829683789292593 \\
Mean & 0.8130360205831904 & 0.6300869089406189 & 0.2801246105919003 & 0.8245931283905967 & 0.6104757132794516 & 0.22190632298118618 \\
Controversial & 0.6415094339622641 & 0.5740583433834967 & 0.1499738393386585 & 0.5352622061482821 & 0.5248140160823845 & 0.1587835238269989 \\
Disagreement & 0.6466552315608919 & 0.6393453453453454 & 0.2806349206349207 & 0.5949367088607594 & 0.5919894598155467 & 0.20082580645161285 \\
Topic & 0.663