In [1]:
import ast
import json
import os
from multiprocessing import Pool

import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm

## Load the eMFD lexicon

In [2]:
# Load the eMFD lexicon, indexed by word. As the preview below shows, each row
# carries five `*_p` columns and five `*_sent` columns, one per moral foundation.
emfd = pd.read_csv("data/eMFD_wordlist.csv", index_col="word")
# Display a single row to confirm the column layout.
emfd.head(1)

Unnamed: 0_level_0,care_p,fairness_p,loyalty_p,authority_p,sanctity_p,care_sent,fairness_sent,loyalty_sent,authority_sent,sanctity_sent
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
brought,0.18,0.114286,0.08,0.096552,0.053333,-0.235404,-0.310015,-0.099783,-0.402207,-0.13255


## Load the data

In [3]:
# Load the sentence-level data; the first CSV column becomes the index.
df = pd.read_csv("data/sentence_mf_counts.csv", index_col=0)
# The "tokens" column is stored as text; ast.literal_eval turns each cell back into
# the Python literal it spells (presumably a list of token strings -- confirm against the CSV).
df["tokens"] = df["tokens"].map(ast.literal_eval)
df.head(1)

Unnamed: 0,sentence,authority,care,fairness,loyalty,sanctity,none,authority_seen,care_seen,fairness_seen,...,care_label,care_train,fairness_label,fairness_train,loyalty_label,loyalty_train,sanctity_label,sanctity_train,none_label,none_train
0,"Betsy DeVos, President Trump's nominee for edu...",1,1,4,0,2,2,3,1,4,...,1,1,1,1,-1,-1,1,1,0,1


In [4]:
def get_data(foundation, portion="train", return_index=False):
    """
    Select the tokens and labels of one foundation for a given data split.

    Rows are chosen through the `{foundation}_train` column of the global `df`:
    a value of 1 marks a training row and 0 a test row. Rows holding any other
    value in that column are never returned.

    args:
        foundation: foundation name used as the column prefix (e.g. "care")
        portion: "train" for rows flagged 1, "test" for rows flagged 0;
            any other value returns the rows flagged 0 or 1 together
        return_index: when True, also return the index labels of the selected rows
    returns:
        (X, y) or (X, y, index), where X is the "tokens" Series restricted to the
        selected rows and y is a numpy array built from `{foundation}_label`
    """
    split_flag = df[f"{foundation}_train"]
    if portion == "train":
        selected = split_flag == 1
    elif portion == "test":
        selected = split_flag == 0
    else:
        # Every row that belongs to either split
        selected = (split_flag == 0) | (split_flag == 1)

    index = df[selected].index
    X = df["tokens"][index]
    y = df[f"{foundation}_label"][index].to_numpy()
    return (X, y, index) if return_index else (X, y)

## Score text

In [5]:
from construct_emfd_data import preprocess, nlp
def score(X):
    """
    Average the eMFD weights of all lexicon tokens with respect to each foundation.

    Tokens that are not in the index of the global `emfd` lexicon are ignored.
    A token that occurs several times contributes once per occurrence.

    args:
        X: either a string or a list of tokens; a string is first tokenized
            with `preprocess(X, nlp=nlp)`
    returns:
        A float pd.Series indexed by "authority", "care", "fairness", "loyalty"
        and "sanctity" holding the mean `*_p` weight of the matched tokens,
        or all zeros when no token is found in the lexicon.
    """
    foundations = ["authority_p", "care_p", "fairness_p", "loyalty_p", "sanctity_p"]
    names = [column.split("_")[0] for column in foundations]
    if isinstance(X, str):
        X = preprocess(X, nlp=nlp)

    matched = [tok for tok in X if tok in emfd.index]
    if not matched:
        # Float zeros keep the return dtype identical to the matched case
        return pd.Series(0.0, index=names)

    # One lookup for the whole sentence instead of one `.loc` call per token; this also
    # avoids accumulating floats into an integer Series, which newer pandas deprecates.
    scores = emfd.loc[matched, foundations].mean()
    scores.index = names
    return scores

## Predict on eMFD sentences

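eMFD scoring is purely lexicon-based and needs no training, so a sentence's scores are the same regardless of which split it belongs to. Every sentence is therefore scored once here.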

In [6]:
# Score every sentence in `df` once with the eMFD lexicon.
foundations = ["authority", "care", "fairness", "loyalty", "sanctity"]

# Collect one score Series per sentence and build the frame in a single step. This
# avoids writing floats row by row into an integer-initialised frame (slow, and a
# dtype upcast that newer pandas versions deprecate).
sentence_scores = [score(tokens) for tokens in tqdm(df["tokens"], dynamic_ncols=True)]
scores_df = pd.DataFrame(sentence_scores, columns=foundations)
# Rows were produced in the order of `df`, so its index can be reused directly.
scores_df.index = df.index
scores_df["tokens"] = df["tokens"]

100%|██████████| 34262/34262 [06:45<00:00, 84.47it/s] 


In [7]:
# Preview the per-sentence foundation scores alongside the tokens they were computed from.
scores_df.head(5)

Unnamed: 0,authority,care,fairness,loyalty,sanctity,tokens
0,0.112026,0.109768,0.140213,0.116041,0.09878,"[nominee, education, secretary, promised, enfo..."
1,0.103792,0.104476,0.125517,0.11524,0.10436,"[went, write, eager, bring, sense, urgency, en..."
2,0.109634,0.097379,0.126683,0.098084,0.084502,"[confirmation, hearing, point, suggested, stat..."
3,0.126447,0.117939,0.133769,0.100299,0.09603,"[disability, rights, advocates, upset, lack, u..."
4,0.104848,0.086508,0.104228,0.0989,0.087742,"[letter, emphasized, understands, federal, law..."


## Save predictions

In [8]:
foundations = ["authority", "care", "fairness", "loyalty", "sanctity"]
eval_dir = "data/eval_results"
# Create the output directory up front so the first dump cannot fail on a fresh checkout.
os.makedirs(eval_dir, exist_ok=True)

for foundation in foundations:
    _, y_train, train_index = get_data(foundation=foundation, portion="train", return_index=True)
    _, y_test, test_index = get_data(foundation=foundation, portion="test", return_index=True)

    # The decision threshold is the median training score; it is reused unchanged
    # on the test split so that no test information enters the prediction rule.
    y_score_train = scores_df[foundation][train_index]
    threshold = y_score_train.median()
    y_pred_train = (y_score_train > threshold).astype(int).tolist()
    y_score_test = scores_df[foundation][test_index]
    y_pred_test = (y_score_test > threshold).astype(int).tolist()

    train_eval = {
        "y_true": y_train.tolist(),
        "y_pred": y_pred_train,
        "y_score": y_score_train.tolist()
    }

    test_eval = {
        "y_true": y_test.tolist(),
        "y_pred": y_pred_test,
        "y_score": y_score_test.tolist()
    }

    # One JSON file per foundation and split: emfd_<foundation>_<train|test>.json
    for portion, results in (("train", train_eval), ("test", test_eval)):
        with open(f"{eval_dir}/emfd_{foundation}_{portion}.json", "w") as f:
            json.dump(results, f)