In [1]:
import pandas as pd
import numpy as np

In [2]:
SEED = 2020

In [3]:
corpus = pd.read_csv('data/final-corpus.csv')

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
train, test = train_test_split(corpus, test_size=0.2)

The corpus contains several comments from YouTube videos, and each comment may have one or more relevant aspects that it discusses. Each aspect is defined to also have a target.

In the CSV, each row consists of a target, and all aspects related to that target. Thus, the comments are repeated across rows.

In [6]:
corpus.head(20).comment

0     yes one thing if he is president of usa have b...
1     yes one thing if he is president of usa have b...
2     please safe this great nation and vote no to l...
3     elizabeth warren is so on point , she even has...
4     i hate it when candidates try to pretend to be...
5     i hate it when candidates try to pretend to be...
6     what presidents actually have control over is ...
7     warren is smelling corporate money , see how t...
8     i love her < 3 ive been feelin the bern since ...
9     i love her < 3 ive been feelin the bern since ...
10    you would think anyone claiming to be native a...
11                           `` persist '' go elizabeth
12    bernie served in the house from 1991 until the...
13    but she would n't let indians go to college as...
14    every democrat i 've met so far has some type ...
15    while warren and other democrats are f ' n aro...
16                                          crazy women
17    her credibility on her claims to native he

Thus, we can evaluate the model in several different ways 

- Can we predict any/all of the targets that the comment contains?
- Can we predict any/all of the aspects the comment talks about?
- Can we predict any/all of the sub-aspects the comment talks about?
- Can we predict (aspect, target) pairs?
- Can we predict (sub-aspect, target) pairs?

#### Implementing the evaluation subroutine

In [7]:
import itertools as it

def flatten(nested):
    return list(it.chain.from_iterable(nested))

In [8]:
def match_all(predictions, references):
    return set(predictions) == set(references)

def match_any(predictions, references):
    p = predictions
    r = references
    i = p.intersection(r)
    return len(i) > 0

In [9]:
def custom_accuracy(predictions, gold_standards, must_match_all):
    """
    Calculates accuracy based on "any matches"/"all matches" rule.
    """
    scoring_function = match_all if must_match_all else match_any
    return np.mean([
        scoring_function(pred, golds)
        for pred, golds in zip(predictions, gold_standards)
    ])

In [10]:
def pos_neg_counts_comment(prediction, reference):
    tp = 0; fp = 0; fn = 0
    for p in prediction:
        if p not in reference:
            fp += 1
        else:
            tp += 1
    for r in reference:
        if r not in prediction:
            fn += 1
    return tp, fp, fn

def pos_neg_counts(predictions, references):
    tp = 0; fp = 0; fn = 0
    for i, (p, r) in enumerate(zip(predictions, references)):
        _tp, _fp, _fn = pos_neg_counts_comment(p, r)
        tp += _tp
        fp += _fp
        fn += _fn
    return tp, fp, fn

def precision(tp, fp, fn):
    return tp/(tp + fp)

def recall(tp, fp, fn):
    return tp/(fp + fn)

def f1(p, r):
    num = 2*p*r
    den = p+r
    return num/den

def fdr(tp, fp, fn):
    return fp/(fp+tp)

In [11]:
def evaluate(predictions, gold_standards, should_print=False):
    """
    predictions: iterable of iterables of predictions
    gold_standards: iterable of iterables of gold standard values
    """
    any_acc = custom_accuracy(predictions, gold_standards, must_match_all=False)
    all_acc = custom_accuracy(predictions, gold_standards, must_match_all=True)
    tp, fp, fn = pos_neg_counts(predictions, gold_standards)
    prec = precision(tp, fp, fn)
    rec = recall(fp, tp, fn)
    fdr_score = fdr(tp, fp, fn) 
    f1_score = f1(prec, rec)

    if should_print:
        print('"Match any" accuracy: {}'.format(round(any_acc, 3)))
        print('"Match all" acc: {}'.format(round(all_acc, 3)))
        print('Precision: {}'.format(round(prec, 3)))
        print('Recall: {}'.format(round(rec, 3)))
        print('F1 score: {}'.format(round(f1_score, 3)))
        print('FDR: {}'.format(round(fdr_score, 3)))
    return any_acc, all_acc

### Preliminaries: Constructing the gold standard

In [12]:
def construct_gold_standard_exp1(data):
    out = []
    for _, target_series in data.groupby('comment_id').target:
        out.append(set(target_series))
    return out

In [13]:
def construct_predictions(y_pred, comment_id):
    df = pd.DataFrame()
    df['comment_id'] = comment_id
    df['y_pred'] = y_pred
    out= []
    for _, pred_series in df.groupby('comment_id').y_pred:
        out.append(set(pred_series))
    return out

In [14]:
exp1_gold_train = construct_gold_standard_exp1(train)
exp1_gold_test = construct_gold_standard_exp1(test)

### Preliminaries: Constructing BoW Features

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
cv = CountVectorizer()

In [17]:
X_train = cv.fit_transform(train.comment)
X_test = cv.transform(test.comment)

y_train = train.target
y_test = test.target

### Experiment 1: Can we identify the target

Here we could try a simple logistic regression at first

In [18]:
from sklearn.linear_model import LogisticRegression

#### Logistic Regression

In [19]:
logreg = LogisticRegression(max_iter=1000)

In [20]:
logreg.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
y_pred_train = logreg.predict(X_train)

In [22]:
y_pred_test = logreg.predict(X_test)

In [23]:
exp1_pred_train = construct_predictions(y_pred_train, train.comment_id)

In [24]:
exp1_pred_test = construct_predictions(y_pred_test, test.comment_id)

In [25]:
print('Training set:')
_ = evaluate(exp1_pred_train, exp1_gold_train, should_print=True)

Training set:
"Match any" accuracy: 0.998
"Match all" acc: 0.943
Precision: 0.998
Recall: 0.002
F1 score: 0.005
FDR: 0.002


In [26]:
print('Test set:')
_ = evaluate(exp1_pred_test, exp1_gold_test, should_print=True)

Test set:
"Match any" accuracy: 0.593
"Match all" acc: 0.574
Precision: 0.593
Recall: 0.396
F1 score: 0.475
FDR: 0.407


#### RandomForest

In [27]:
from sklearn.preprocessing import MultiLabelBinarizer

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
mlb = MultiLabelBinarizer()

In [30]:
mlb.fit(exp1_gold_train + exp1_gold_test)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [31]:
X_train_unique = cv.transform(train.comment.unique())
X_test_unique = cv.transform(test.comment.unique())

In [32]:
y_train_unique = mlb.transform(exp1_gold_train)
y_test_unique = mlb.transform(exp1_gold_test)

In [33]:
exp1_rf_y_train = mlb.fit

In [34]:
rf = RandomForestClassifier()

In [35]:
rf.fit(X_train_unique, y_train_unique)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [36]:
y_pred_unique_train = rf.predict(X_train_unique)
y_pred_unique_test = rf.predict(X_test_unique)

In [37]:
exp1_rf_pred_train = [set(t) for t in mlb.inverse_transform(y_pred_unique_train)]
exp1_rf_pred_test = [set(t) for t in mlb.inverse_transform(y_pred_unique_test)]

In [38]:
print('Training set:')
_ = evaluate(exp1_rf_pred_train, exp1_gold_train, should_print=True)

Training set:
"Match any" accuracy: 0.995
"Match all" acc: 0.995
Precision: 1.0
Recall: 0.0
F1 score: 0.0
FDR: 0.0


In [39]:
print('Training set:')
_ = evaluate(exp1_rf_pred_test, exp1_gold_test, should_print=True)

Training set:
"Match any" accuracy: 0.13
"Match all" acc: 0.111
Precision: 0.333
Recall: 0.252
F1 score: 0.287
FDR: 0.667


Why is this performing so horribly?

In [40]:
exp1_rf_pred_test

[set(),
 set(),
 set(),
 set(),
 {'yang'},
 {'warren'},
 {'sanders'},
 {'biden'},
 set(),
 {'biden'},
 set(),
 set(),
 set(),
 set(),
 {'sanders', 'warren'},
 {'yang'},
 {'warren'},
 set(),
 set(),
 set(),
 set(),
 set(),
 {'biden'},
 {'warren'},
 {'biden'},
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 {'biden'},
 set(),
 {'biden'},
 set(),
 {'biden'},
 set(),
 set(),
 {'warren'},
 set(),
 set(),
 set(),
 {'warren'},
 {'buttigieg'},
 set(),
 set(),
 {'biden'},
 set(),
 set(),
 set(),
 set(),
 {'warren'},
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 {'yang'},
 {'biden'},
 set(),
 {'yang'},
 set(),
 set(),
 {'biden'},
 set(),
 set(),
 {'biden'},
 {'biden'},
 {'warren'},
 set(),
 {'biden'},
 set(),
 set(),
 {'yang'},
 {'buttigieg'},
 {'biden'},
 {'buttigieg'},
 {'biden'},
 {'biden'},
 set(),
 set(),
 set(),
 set(),
 {'biden'},
 set(),
 set(),
 set(),
 set(),
 {'biden', 'yang'},
 set(),
 set(),
 set(),
 {'warren'},
 set(),
 set(),
 set(),
 se

A-ha, a lot of the predictions are empty!

### Experiment 1.5: Can we jointly predict (target, general) ?

In [41]:
def construct_gold_standard_exp15(data):
    out= []
    for cid, row in train.groupby('comment_id'):
        tgts = row.target.tolist()
        gens = row.general.tolist()
        out.append({f"{tgt}_{gen}" for tgt, gen in zip(tgts, gens)})
    return out

In [42]:
exp15_gold_train = construct_gold_standard_exp15(train)
exp15_gold_test = construct_gold_standard_exp15(test)

#### Logistic Regression

In [43]:
logreg = LogisticRegression(max_iter=1000)

In [44]:
y_train_exp15 = train.apply(lambda row: f"{row.target}_{row.general}", axis=1)

In [45]:
y_test_exp15 = test.apply(lambda row: f"{row.target}_{row.general}", axis=1)

In [46]:
logreg.fit(X_train, y=y_train_exp15)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
y_pred_train_exp15 = logreg.predict(X_train)

In [48]:
y_pred_test_exp15 = logreg.predict(X_test)

In [49]:
exp15_pred_train = construct_predictions(y_pred_train_exp15, train.comment_id)

In [50]:
exp15_pred_test = construct_predictions(y_pred_test_exp15, test.comment_id)

In [51]:
print('Training set:')
_ = evaluate(exp15_pred_train, exp15_gold_train, should_print=True)

Training set:
"Match any" accuracy: 0.988
"Match all" acc: 0.933
Precision: 0.988
Recall: 0.012
F1 score: 0.023
FDR: 0.012


In [52]:
print('Test set:')
_ = evaluate(exp15_pred_test, exp15_gold_test, should_print=True)

Test set:
"Match any" accuracy: 0.12
"Match all" acc: 0.111
Precision: 0.12
Recall: 0.826
F1 score: 0.21
FDR: 0.88


Well this clearly isn't learning anything!

In [53]:
for comm, pred, gold in zip(test.comment.unique(), exp15_pred_test, exp15_gold_test):
    print(f'Predicted: {pred}\nActual: {gold}\n')

Predicted: {'sanders_positive'}
Actual: {'sanders_positive'}

Predicted: {'warren_negative'}
Actual: {'warren_negative'}

Predicted: {'warren_negative'}
Actual: {'warren_negative'}

Predicted: {'yang_positive'}
Actual: {'warren_positive'}

Predicted: {'warren_negative'}
Actual: {'warren_negative'}

Predicted: {'sanders_positive'}
Actual: {'sanders_positive', 'warren_positive'}

Predicted: {'warren_negative'}
Actual: {'warren_positive'}

Predicted: {'warren_negative'}
Actual: {'sanders_positive'}

Predicted: {'warren_negative'}
Actual: {'warren_negative'}

Predicted: {'warren_negative'}
Actual: {'warren_negative'}

Predicted: {'warren_negative'}
Actual: {'warren_negative'}

Predicted: {'warren_negative'}
Actual: {'warren_negative'}

Predicted: {'sanders_positive'}
Actual: {'warren_negative'}

Predicted: {'warren_positive'}
Actual: {'warren_negative'}

Predicted: {'warren_positive'}
Actual: {'warren_negative'}

Predicted: {'biden_negative'}
Actual: {'warren_negative'}

Predicted: {'warre

#### RandomForest

In [54]:
rf = RandomForestClassifier()

In [55]:
mlb = MultiLabelBinarizer()

In [56]:
mlb.fit(exp15_gold_train + exp15_gold_test)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [57]:
y_train_unique_exp15 = mlb.transform(exp15_gold_train)
y_test_unique_exp15 = mlb.transform(exp15_gold_test)

In [58]:
rf.fit(X_train_unique, y_train_unique_exp15)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [59]:
y_pred_unique_train_exp15 = rf.predict(X_train_unique)
y_pred_unique_test_exp15 = rf.predict(X_test_unique)

In [60]:
exp15_rf_pred_train = [set(t) for t in mlb.inverse_transform(y_pred_unique_train_exp15)]
exp15_rf_pred_test = [set(t) for t in mlb.inverse_transform(y_pred_unique_test_exp15)]

In [61]:
print('Training set:')
_ = evaluate(exp15_rf_pred_train, exp15_gold_train, should_print=True)

Training set:
"Match any" accuracy: 0.995
"Match all" acc: 0.995
Precision: 1.0
Recall: 0.0
F1 score: 0.0
FDR: 0.0


In [62]:
print('Test set:')
_ = evaluate(exp15_rf_pred_test, exp15_gold_test, should_print=True)

Test set:
"Match any" accuracy: 0.028
"Match all" acc: 0.019
Precision: 0.097
Recall: 0.243
F1 score: 0.138
FDR: 0.903


As we can see, yet again a gross overfit!

### Experiment 2: Can we predict the aspects and their polarity

In [63]:
POLICY_COLUMNS = ['policy_race', 'appeal_white', 'policy_unspecified', 'policy_other',
       'policy_international', 'policy_healthcare', 'policy_economy',
       'campaign_prospects', 'policy_education', 'policy_lgbt', 'appeal_old',
       'appeal_african', 'appeal_democrat', 'appeal_other', 'appeal_young',
       'appeal_unspecified', 'appeal_female', 'appeal_asian',
       'appeal_hispanic']

In [64]:
def construct_gold_standard_exp2(data):
    out = []
    for cid, comment_df in train.fillna('').groupby('comment_id'):
        asp = []
        for col in POLICY_COLUMNS:
            pol = comment_df[col].tolist()
            for p in pol:
                if p != "":
                    asp.append(f"{col}_{p}")
        if len(asp) == 0:
            asp.append('general_only')
        out.append(asp)
    return out

In [65]:
exp2_gold_train = construct_gold_standard_exp2(train)

In [66]:
exp2_gold_test = construct_gold_standard_exp2(test)

#### Logistic Regression

Won't work here, since we truly have multi-label data that can't be converted properly!

#### Random Forest Classifier

In [67]:
rf = RandomForestClassifier()

In [68]:
mlb = MultiLabelBinarizer()

In [69]:
mlb.fit(exp2_gold_train + exp2_gold_test)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [70]:
y_train_unique_exp2 = mlb.transform(exp2_gold_train)
y_test_unique_exp2 = mlb.transform(exp2_gold_test)

In [71]:
rf.fit(X_train_unique, y_train_unique_exp2)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [72]:
y_pred_unique_train_exp2 = rf.predict(X_train_unique)
y_pred_unique_test_exp2 = rf.predict(X_test_unique)

In [73]:
exp2_rf_pred_train = [set(t) for t in mlb.inverse_transform(y_pred_unique_train_exp2)]
exp2_rf_pred_test = [set(t) for t in mlb.inverse_transform(y_pred_unique_test_exp2)]

In [74]:
print('Training set:')
_ = evaluate(exp2_rf_pred_train, exp2_gold_train, should_print=True)

Training set:
"Match any" accuracy: 0.998
"Match all" acc: 0.998
Precision: 1.0
Recall: 0.0
F1 score: 0.0
FDR: 0.0


In [75]:
print('Test set:')
_ = evaluate(exp2_rf_pred_test, exp2_gold_test, should_print=True)

Test set:
"Match any" accuracy: 0.694
"Match all" acc: 0.694
Precision: 0.682
Recall: 0.304
F1 score: 0.421
FDR: 0.318


### Experiment 3: Can we predict (candidate, aspect) pairs

In [78]:
def construct_gold_standard_exp3(data):
    out = []
    for cid, comment_df in train.fillna('').groupby('comment_id'):
        asp = []
        for _, row in comment_df.iterrows():
            tgt = row['target']
            pols = [row[c] for c in POLICY_COLUMNS]
            if all([p == "" for p in pols]):
                asp.append((tgt, 'general_only'))
            else:
                for c in POLICY_COLUMNS:
                    pol = row[c]
                    if pol != "":
                        asp.append((tgt, f"{c}_{pol}"))
        out.append(asp)
    return out

In [79]:
exp3_gold_train = construct_gold_standard_exp3(train)

In [80]:
exp3_gold_test = construct_gold_standard_exp3(test)

#### RandomForest Classifier

In [81]:
mlb = MultiLabelBinarizer()

In [82]:
mlb.fit(exp3_gold_train + exp3_gold_test)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [83]:
y_train_unique_exp3 = mlb.transform(exp3_gold_train)
y_test_unique_exp3 = mlb.transform(exp3_gold_test)

In [84]:
rf.fit(X_train_unique, y_train_unique_exp3)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [85]:
y_pred_unique_train_exp3 = rf.predict(X_train_unique)
y_pred_unique_test_exp3 = rf.predict(X_test_unique)

In [86]:
exp3_rf_pred_train = [set(t) for t in mlb.inverse_transform(y_pred_unique_train_exp3)]
exp3_rf_pred_test = [set(t) for t in mlb.inverse_transform(y_pred_unique_test_exp3)]

In [87]:
print('Training set:')
_ = evaluate(exp3_rf_pred_train, exp3_gold_train, should_print=True)

Training set:
"Match any" accuracy: 0.998
"Match all" acc: 0.998
Precision: 0.998
Recall: 0.002
F1 score: 0.004
FDR: 0.002


In [88]:
print('Test set:')
_ = evaluate(exp3_rf_pred_test, exp3_gold_test, should_print=True)

Test set:
"Match any" accuracy: 0.037
"Match all" acc: 0.028
Precision: 0.121
Recall: 0.234
F1 score: 0.16
FDR: 0.879


In line with previous findings, a crazy overfit!