In [1]:
import pandas as pd
# Load the cached DataFrame stored under the key 'ddf'.
# The store is opened read-only: the default mode ("a") opens the file for
# writing and creates an empty store.h5 when the path is wrong, which then
# surfaces as a confusing KeyError instead of a missing-file error.
with pd.HDFStore("store.h5", mode="r") as store:
    df = store['ddf']

In [8]:
# Rename the 'class' column to 'label', then keep only the rows where
# isEdited is not 1 and label is not -1.
ddf = (
    df
    .rename(columns={'class': 'label'})
    .query('isEdited != 1 & label != -1')
)
print(ddf.columns)
print(ddf.loc[:, 'label'].value_counts())

Index(['label', 'created_utc', 'gilded', 'isEdited', 'locked', 'm_esh',
       'm_info', 'm_nah', 'm_nta', 'm_yta', 'num_comments', 'over_18', 'score',
       'selftext', 'spoiler', 'sum', 't_esh', 't_info', 't_nah', 't_nta',
       't_yta', 'title', 'upvote_ratio'],
      dtype='object')
0.0    39845
3.0    14457
1.0     8882
2.0     4414
Name: label, dtype: int64


In [None]:
# Correlation heat-map over the columns of `ddf` that remain after dropping
# the ones listed below.
# NOTE(review): 'a_inf' does not appear in the column list printed by the
# earlier `print(ddf.columns)` output in this notebook; if it is really
# absent, drop() raises KeyError — confirm the intended column name.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed
# in 2.0 (replacement: `.format(precision=2)`) — confirm the pandas version
# this notebook is expected to run on.
sel = ddf.drop(columns=['a_inf','sum','t_info','locked','gilded','score','num_comments','over_18','spoiler'])
sel.corr().style.background_gradient(cmap='coolwarm').set_precision(2)

In [None]:
from sklearn.model_selection import train_test_split

# Keep rows with a non-null t_to whose a_op / a_to columns agree with
# t_op / t_to thresholded at 0.5.
x = (
    ddf
    .dropna(subset=['t_to'])
    .query("(a_op == (t_op > 0.5)) & (a_to == (t_to >0.5))")
)
target = x.loc[:, ['a_op', 'a_to', 't_op', 't_to']]
data = x.loc[:, ['selftext', 'title']]

# Half of the rows go to each split; the seed is fixed so the split is repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    data,
    target,
    test_size=0.5,
    random_state=0,
)

Lesson: None of the variables other than upvote_ratio seem particularly correlated with the judgement. Not sure how helpful these soft target variables are either. Edits also seem to generally suggest YTA. Will try again with a better correlation indicator.

In [None]:
# Show the t_op / t_to columns of the training targets.
ytrain.loc[:, ['t_op', 't_to']]

In [None]:
def bin2cat(row):
    """Convert the two binary labels back into one of the 4 post categories.

    The category is ``a_op * 2 + a_to``:

        0 - NAH
        1 - NTA
        2 - YTA
        3 - ESH

    Parameters
    ----------
    row : pandas.Series, numpy.ndarray, list or tuple
        A record whose first element is ``a_op`` and whose second element is
        ``a_to``. Only the first two positions are read; any further
        elements are ignored.

    Returns
    -------
    number
        First element times two plus the second element, read by position.
    """
    # A label-indexed Series must be read through .iloc: plain row[0] relies on
    # pandas' positional fallback for integer keys, which is deprecated and
    # fails outright once integer keys are treated purely as labels.
    # Plain sequences and arrays have no .iloc and are indexed directly.
    values = getattr(row, "iloc", row)
    return values[0] * 2 + values[1]

# Collapse every row of the train / test targets into a single category code
# by applying bin2cat row-wise.
ytrain_cat = ytrain.apply(bin2cat, axis='columns')
ytest_cat = ytest.apply(bin2cat, axis='columns')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the vocabulary and IDF weights on the training split only. Fitting on
# the test texts as well leaks held-out statistics into the features and
# makes the evaluation on xtest optimistic.
tfv = TfidfVectorizer()
tfv.fit(list(xtrain.selftext) + list(xtrain.title))

from scipy.sparse import hstack
# Title and body are transformed with the same fitted vectorizer, so both
# matrices share one column layout and are summed element-wise into a single
# feature matrix per split.
xtrain_tfv = tfv.transform(xtrain.title) + tfv.transform(xtrain.selftext)
xtest_tfv = tfv.transform(xtest.title) + tfv.transform(xtest.selftext)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw term counts. The vectorizer is a CountVectorizer, matching the import
# above and the *_cfv names; instantiating TfidfVectorizer here would only
# duplicate the TF-IDF features.
cfv = CountVectorizer()
# The vocabulary is learned from the training split only, so the held-out
# texts do not influence the feature space.
cfv.fit(list(xtrain.selftext) + list(xtrain.title))

from scipy.sparse import hstack
# Title counts and body counts are kept as separate column groups, placed
# side by side.
xtrain_cfv = hstack((cfv.transform(xtrain.title),cfv.transform(xtrain.selftext)))
xtest_cfv = hstack((cfv.transform(xtest.title),cfv.transform(xtest.selftext)))

In [None]:
#from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

# Soft model: one XGBoost regressor per t_* target column.
sclf = MultiOutputRegressor(estimator=XGBRegressor())
# Hard model: XGBoost regressors wrapped one-vs-rest, trained on the a_* columns.
hclf = OneVsRestClassifier(estimator=XGBRegressor())

# Both models are trained on the same TF-IDF training matrix.
sclf.fit(X=xtrain_tfv, y=ytrain[['t_op', 't_to']])
hclf.fit(X=xtrain_tfv, y=ytrain[['a_op', 'a_to']])

# Disabled experiment, kept for reference:
#categories = np.apply_along_axis(bin2cat,1,np.around(predictions))
#print(sklearn.metrics.log_loss(ytest_cat,predictions))

In [None]:
# Predict on the held-out TF-IDF matrix with both fitted models.
hard_pred = hclf.predict(xtest_tfv)
soft_pred = sclf.predict(xtest_tfv)

In [None]:
import sklearn.metrics
import numpy as np
# Summary statistics of the soft model's test predictions.
# The columns are labelled a_op / a_to here although sclf was fitted on the
# t_op / t_to columns.
pdf = pd.DataFrame(soft_pred, columns=['a_op','a_to'])
print(pdf.describe())
# Same summary for the hard model. `pdf` is rebound, so after this cell it
# holds the hard predictions.
pdf = pd.DataFrame(hard_pred, columns=['a_op','a_to'])
print(pdf.describe())

In [None]:
# ROC AUC of each model's test predictions against the a_op / a_to columns:
# soft model first, then hard model.
print(
    sklearn.metrics.roc_auc_score(
        y_true=ytest.loc[:, ['a_op', 'a_to']],
        y_score=soft_pred,
    )
)
print(
    sklearn.metrics.roc_auc_score(
        y_true=ytest.loc[:, ['a_op', 'a_to']],
        y_score=hard_pred,
    )
)

In [None]:
# Binarise the soft predictions with the same "> 0.5" rule that the filtering
# query uses to relate t_op / t_to to a_op / a_to. Rounding is not safe here:
# the regressor output is unbounded, so np.around can yield values outside
# {0, 1} (e.g. 1.6 -> 2.0), which breaks the binary report and bin2cat.
# The original dtype is kept so downstream frames look the same.
pred_bin = (soft_pred > 0.5).astype(soft_pred.dtype)

import pprint
# Per-label precision / recall / F1 of the thresholded soft predictions,
# returned as a nested dict and pretty-printed.
pprint.pprint(sklearn.metrics.classification_report(ytest[['a_op','a_to']],pred_bin,output_dict=True))

In [None]:
# Per-label precision / recall / F1 of the hard model's predictions,
# returned as a nested dict and pretty-printed.
pprint.pprint(
    sklearn.metrics.classification_report(
        y_true=ytest[['a_op', 'a_to']],
        y_pred=hard_pred,
        output_dict=True,
    )
)

In [None]:
# Soft model: joint distribution of the two binarised outputs (as fractions
# of all test rows), then one category code per row via bin2cat.
pdf = pd.DataFrame(pred_bin, columns=['a_op','a_to'])
print(pd.crosstab(pdf.a_op,pdf.a_to,normalize=True))
soft_pred_cat = pdf.apply(bin2cat,axis=1)
# Hard model: same two steps. `pdf` is rebound here, so the statement order
# matters and `pdf` holds the hard predictions once this cell has run.
pdf = pd.DataFrame(hard_pred, columns=['a_op','a_to'])
print(pd.crosstab(pdf.a_op,pdf.a_to,normalize=True))
hard_pred_cat = pdf.apply(bin2cat,axis=1)

In [None]:
# Category codes produced by bin2cat:
#    0 - NAH
#    1 - NTA
#    2 - YTA
#    3 - ESH

# pred_cat is built from whatever `pdf` currently holds and is not used by
# the report below.
pred_cat = pdf.apply(bin2cat,axis=1)
# classification_report returns a multi-line string; print it so the table
# renders with real line breaks instead of as an escaped string repr.
print(sklearn.metrics.classification_report(ytest_cat,hard_pred_cat))

In [None]:
# Balanced accuracy on the category codes: hard model first, then soft model.
print(
    sklearn.metrics.balanced_accuracy_score(y_true=ytest_cat, y_pred=hard_pred_cat)
)
print(
    sklearn.metrics.balanced_accuracy_score(y_true=ytest_cat, y_pred=soft_pred_cat)
)