# Moral Foundations Reddit Corpus

In [1]:
import numpy as np
import pandas as pd
import spacy
import ast
from multiprocessing import Pool
from tqdm import tqdm
import json
import sklearn
import os
import swifter
# Load the spaCy English pipeline once; it is reused below for tokenisation
# (`preprocess`) and for document vectors (spaCy / GloVe embedding cells).
nlp = spacy.load("en_core_web_md")

Package pickle5 becomes unnecessary in Python 3.8 and above. Its presence may confuse libraries including Ray. Please uninstall the package.


## Read data

In [2]:
# Load the annotated corpus. As the preview shows, the columns include the
# text, the annotator id and that annotator's (comma-separable) annotation.
mfrc = pd.read_csv("data/MFRC/final_mfrc_data.csv")
mfrc.head(1)

Unnamed: 0,text,subreddit,bucket,annotator,annotation,confidence
0,That particular part of the debate is especial...,europe,French politics,annotator03,Non-Moral,Confident


- Make sentences unique
- Map moral domains to MFD (needs double-checking)

In [3]:
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Stop words = union of the NLTK, scikit-learn and spaCy English lists.
# Note that `stopwords` is rebound here from the NLTK corpus reader to the
# combined set, which is what `preprocess` looks tokens up in.
nltk_stopwords = stopwords.words('english')
stopwords = set(nltk_stopwords).union(ENGLISH_STOP_WORDS, STOP_WORDS)

def preprocess(texts, progress_bar=False):
    """
    Tokenize text with spaCy and keep only cleaned content words.

    For every document the tokens are filtered in this order:
      1. drop tokens that belong to a named entity,
      2. keep only noun / adjective / verb / adverb fine-grained POS tags,
      3. drop punctuation, clitics and other junk tokens (``no_chars``),
      4. drop stop words (matched against the module-level ``stopwords`` set),
      5. strip non-alphabetic characters from what is left,
      6. drop tokens shorter than 3 characters,
      7. lowercase.

    args:
        texts: a single string, or an iterable of strings
        progress_bar: show a tqdm progress bar while processing
    returns:
        a list of tokens if `texts` was a string, otherwise a list with one
        token list per input document
    """
    single_input = isinstance(texts, str)
    if single_input:
        texts = [texts]

    # Loop-invariant filters, built once per call instead of once per document.
    keep_pos = {'NN', 'NNS', 'JJ', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB'}
    no_chars = {"…", "'ve", "'s", "'ll", "'d", "\"", "'m", "'re", "–-", '‘', '’d',
                '’ll', '’m', '’re', '’s', '’ve', '“', ',,', ',', '(', ')', '.', '”', '\n\n',
                "@realDonaldTrump", "n't", '\xad'}

    tokens = []
    for doc in tqdm(nlp.pipe(texts, n_process=1),
                    desc="Processed", disable=not progress_bar,
                    dynamic_ncols=True, unit=" docs"):
        doc_tokens = []
        for tok in doc:
            # Remove entities
            if tok.ent_type_ != "":
                continue
            # Filter POS tags
            if tok.tag_ not in keep_pos:
                continue
            # Remove bad characters. The original built this list but then
            # repeated the POS filter instead of applying it.
            if tok.text in no_chars:
                continue
            # Remove stop words.
            # NOTE(review): matching happens before lowercasing, so a
            # capitalised stop word only disappears if an earlier filter
            # drops it -- confirm this is intended.
            if tok.text in stopwords:
                continue
            # Remove non-alphabetic characters from the token
            cleaned = "".join(ch for ch in tok.text if ch.isalpha())
            # Keep tokens with at least 3 characters
            if len(cleaned) < 3:
                continue
            # Lowercase tokens
            doc_tokens.append(cleaned.lower())
        tokens.append(doc_tokens)

    if single_input:
        return tokens[0]
    return tokens

In [4]:
# Build (once) and then load a table with one row per unique text: its tokens
# plus, per foundation, how many annotation labels mapped to that foundation.
if not os.path.isfile("data/corpora/MFRC_AITA_tokenized.csv"):
    unique_texts = mfrc.text.unique()
    label_cols = ["authority", "care", "fairness", "loyalty", "sanctity", "none"]

    # Preprocess and tokenize in one batched call so spaCy can pipe the
    # documents instead of being invoked once per text.
    mfrc_unique = pd.DataFrame({"tokens": preprocess(list(unique_texts), progress_bar=True)})

    key_map = {
        "Thin Morality": "none",
        "Non-Moral": "none",
        "Care": "care",
        "Purity": "sanctity",
        "Authority": "authority",
        "Loyalty": "loyalty",
        "Proportionality": "fairness",
        "Equality": "fairness"
    }

    # One row per (text, single label); an annotation may hold several
    # comma-separated labels.
    labels = (
        mfrc[["text"]]
        .assign(annotation=mfrc["annotation"].str.split(","))
        .explode("annotation")
    )
    labels["annotation"] = labels["annotation"].str.strip()
    labels["foundation"] = labels["annotation"].map(key_map)

    # Fail loudly on labels missing from key_map, as the original dict lookup did.
    unknown = labels.loc[labels["foundation"].isna(), "annotation"].unique()
    if len(unknown) > 0:
        raise KeyError(f"Unmapped annotation label(s): {list(unknown)}")

    # Count labels per text and foundation, aligned to the order of unique_texts.
    counts = (
        pd.crosstab(labels["text"], labels["foundation"])
        .reindex(index=unique_texts, columns=label_cols, fill_value=0)
    )
    for col in label_cols:
        mfrc_unique[col] = counts[col].to_numpy()

    # Save
    mfrc_unique["sentence"] = unique_texts
    os.makedirs("data/corpora", exist_ok=True)
    mfrc_unique.to_csv("data/corpora/MFRC_AITA_tokenized.csv")

# Always reload from disk so fresh and cached runs see identical data; the
# token lists are stored as their string repr and parsed back here.
mfrc_unique = pd.read_csv("data/corpora/MFRC_AITA_tokenized.csv", index_col=0)
mfrc_unique["tokens"] = mfrc_unique["tokens"].map(ast.literal_eval)

## Predict using eMFD scoring

In [5]:
# Load eMFD lexicon: one row per word (used as the index), with a `*_p` and a
# `*_sent` column per foundation as shown in the preview below.
emfd = pd.read_csv("emfd/data/eMFD_wordlist.csv", index_col="word")
emfd.head(1)

Unnamed: 0_level_0,care_p,fairness_p,loyalty_p,authority_p,sanctity_p,care_sent,fairness_sent,loyalty_sent,authority_sent,sanctity_sent
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
brought,0.18,0.114286,0.08,0.096552,0.053333,-0.235404,-0.310015,-0.099783,-0.402207,-0.13255


In [6]:
def score(X):
    """
    Average the eMFD weights of all tokens with respect to each foundation.

    Only tokens present in the `emfd` index contribute; the average is taken
    over those matched tokens (repeated tokens count once per occurrence).

    args:
        X: either a string (tokenized with `preprocess`) or a list of tokens
    returns:
        a float pd.Series indexed by foundation name ("authority", "care",
        "fairness", "loyalty", "sanctity"); all zeros when no token matches
    """
    foundations = ["authority_p", "care_p", "fairness_p", "loyalty_p", "sanctity_p"]
    names = [col.split("_")[0] for col in foundations]
    if isinstance(X, str):
        X = preprocess(X)

    matched = [tok for tok in X if tok in emfd.index]
    if not matched:
        # Float zeros so the dtype matches the non-empty case.
        return pd.Series(0.0, index=names)

    # A single label lookup for all matched tokens replaces the per-token
    # accumulation into an integer Series.
    scores = emfd.loc[matched, foundations].sum().astype(float) / len(matched)
    scores.index = names
    return scores

In [7]:
# Predict eMFD foundation scores for every unique MFRC text
foundations = ["authority", "care", "fairness", "loyalty", "sanctity"]
# Collect one score Series per text and build the frame in a single step, so
# the score columns are float from the start rather than being upcast by
# row-wise .loc writes into an integer frame.
score_rows = [score(toks) for toks in tqdm(mfrc_unique["tokens"], dynamic_ncols=True)]
scores_df = pd.DataFrame(score_rows, columns=foundations).astype(float)
scores_df.index = mfrc_unique.index
scores_df["tokens"] = mfrc_unique["tokens"]

100%|██████████| 17886/17886 [04:24<00:00, 67.74it/s]


In [8]:
# Preview the per-text foundation scores next to the tokens they came from
scores_df.head()

Unnamed: 0,authority,care,fairness,loyalty,sanctity,tokens
0,0.114044,0.094246,0.087567,0.102482,0.122693,"[particular, debate, especially, funny, explai..."
1,0.078366,0.098072,0.09454,0.086048,0.077889,"[pretty, lively, lingo, usually, deliberately,..."
2,0.0,0.0,0.0,0.0,0.0,"[closet, fascist, flamboyant, extroverted, fas..."
3,0.111523,0.093177,0.116527,0.09371,0.106972,"[unusual, situation, fillon, affair, influenci..."
4,0.127654,0.116325,0.105093,0.138541,0.129456,"[brand, conservatism, classical, right, wing, ..."


Save predictions

In [9]:
foundations = ["authority", "care", "fairness", "loyalty", "sanctity"]
# Make sure the output folder exists before writing the first result file.
os.makedirs("data/MFRC/eval_results", exist_ok=True)
for foundation in foundations:
    # A text is a positive example if at least one annotation mapped to the foundation.
    y_true = (mfrc_unique[foundation] > 0).astype(int).tolist()
    # Keep the scores as an array so the threshold comparison is explicitly
    # element-wise instead of relying on list-vs-NumPy-scalar comparison.
    y_score = scores_df[foundation].to_numpy(dtype=float)
    # Naive thresholding: predict positive when the score is above the median score
    y_pred = (y_score > np.median(y_score)).astype(int)

    evals = {
        "y_true": y_true,
        "y_pred": y_pred.tolist(),
        "y_score": y_score.tolist()
    }

    with open(f"data/MFRC/eval_results/emfd_{foundation}.json", "w") as f:
        json.dump(evals, f)

## Predict using logistic regression

Embed the dataset

In [10]:
import pickle
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
    TfidfTransformer
)
from scipy.sparse import save_npz, load_npz

In [11]:
# tfidf
# `sentences` is also used by the spaCy / GloVe / sentence-RoBERTa cells below,
# so it is defined outside the cache guard: otherwise those cells raise a
# NameError whenever tfidf.npz is already cached.
sentences = mfrc.text.unique()
if not os.path.isfile("data/MFRC/embeddings/tfidf.npz"):
    tokens = mfrc_unique.tokens

    # tfidf
    # NOTE(review): pickle.load executes arbitrary code; only load this file
    # from a trusted source.
    with open("emfd/data/embeddings/tfidf_vocab.pkl", "rb") as f:
        tfidf_vocab = pickle.load(f)
    # The documents are already token lists, so the tokenizer is the identity.
    # NOTE(review): with a fixed `vocabulary`, scikit-learn does not prune by
    # min_df / max_df, and fit_transform re-estimates IDF on this corpus --
    # confirm that matches how the classifiers were trained.
    tfidf_vec = TfidfVectorizer(tokenizer=lambda tokens: tokens, 
                                lowercase=False, stop_words=None,
                                min_df=3, 
                                max_df=0.99,
                                vocabulary=tfidf_vocab
                                )
    X_tfidf = tfidf_vec.fit_transform(tokens)
    os.makedirs("data/MFRC/embeddings", exist_ok=True)
    save_npz("data/MFRC/embeddings/tfidf.npz", X_tfidf)

In [12]:
# Spacy
if not os.path.isfile("data/MFRC/embeddings/spacy_300.npz"):
    # Define the inputs here rather than relying on a variable that an earlier
    # cell only creates when its own cache file is missing.
    sentences = mfrc.text.unique()
    # One 300-dimensional document vector per unique text
    X_spacy = np.zeros((len(sentences), 300), dtype=float)
    for i, doc in tqdm(enumerate(nlp.pipe(sentences, n_process=-1)), total=len(sentences)):
        X_spacy[i, :] = doc.vector
    os.makedirs("data/MFRC/embeddings", exist_ok=True)
    np.savez("data/MFRC/embeddings/spacy_300.npz", X_spacy)

In [13]:
# Glove
if not os.path.isfile("data/MFRC/embeddings/glove_twitter_200.npz"):
    from utils import make_one_concept
    from gensim.models import KeyedVectors
    # Define the inputs here rather than relying on a variable that an earlier
    # cell only creates when its own cache file is missing.
    sentences = mfrc.text.unique()
    # load the Stanford GloVe model
    glove_filename = "data/embeddings/glove.twitter.27B.200d"
    word2vec_output_file = glove_filename+'.word2vec'
    glove = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
    X_glove = np.zeros((len(sentences), 200))
    for i, doc in tqdm(enumerate(nlp.pipe(sentences, n_process=-1)), 
                       desc="Processed", disable=False, total=len(sentences),
                       dynamic_ncols=True, unit=" docs"):
        # Lowercased lemmas of every token; named `lemmas` so the notebook-level
        # `tokens` variable is not overwritten.
        lemmas = [tok.lemma_.lower() for tok in doc]
        X_glove[i] = make_one_concept(model=glove, word_list=lemmas,
                                    normalize=True)
    os.makedirs("data/MFRC/embeddings", exist_ok=True)
    np.savez("data/MFRC/embeddings/glove_twitter_200.npz", X_glove)
    # Free the word vectors once the embeddings are saved.
    del glove

In [14]:
# Sentence-RoBERTa
if not os.path.isfile("data/MFRC/embeddings/sentence_roberta.npz"):
    import torch
    from sentence_transformers import SentenceTransformer
    # Define the inputs here rather than relying on a variable that an earlier
    # cell only creates when its own cache file is missing.
    sentences = mfrc.text.unique()
    model = SentenceTransformer("stsb-roberta-large")
    # Use the GPU when one is available instead of failing on CPU-only machines.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    X_bert = model.encode(sentences)
    os.makedirs("data/MFRC/embeddings", exist_ok=True)
    np.savez("data/MFRC/embeddings/sentence_roberta.npz", X_bert)
    # Release the encoder once the embeddings are saved.
    del model

Evaluate

In [15]:
import pickle
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
    TfidfTransformer
)

# Vocabulary pickle that the TF-IDF embedding cell also loads
with open("emfd/data/embeddings/tfidf_vocab.pkl", "rb") as f:
    tfidf_vocab = pickle.load(f)

# Pre-computed MFRC embeddings, keyed by the short name that also appears in
# the classifier file names. The dense arrays were written with np.savez
# without a keyword, hence the "arr_0" key.
embs = dict(
    tfidf=load_npz("data/MFRC/embeddings/tfidf.npz"),
    spacy=np.load("data/MFRC/embeddings/spacy_300.npz")["arr_0"],
    bert=np.load("data/MFRC/embeddings/sentence_roberta.npz")["arr_0"],
    glove=np.load("data/MFRC/embeddings/glove_twitter_200.npz")["arr_0"],
)
def get_model_and_data(emb_name, foundation):
    """
    Fetch the pickled classifier for one embedding/foundation pair together
    with the matching MFRC features and binary labels.

    args:
        emb_name: key into `embs` (also part of the classifier file name)
        foundation: a foundation column of `mfrc_unique`, or "none"
    returns:
        (model, X, y) where X is `embs[emb_name]` and y is 1 for texts with at
        least one annotation for `foundation`; for "none", y is 1 for texts
        with no annotation for any of the five foundations
    """
    # Classifier: unpickle and let it use all cores
    pkl_path = f"emfd/data/sentence_classifiers/logreg_{emb_name}_{foundation}.pkl"
    with open(pkl_path, "rb") as fh:
        clf = pickle.load(fh)
    clf.set_params(n_jobs=-1)

    # Features
    features = embs[emb_name]

    # Labels
    if foundation != "none":
        labels = (mfrc_unique[foundation] > 0).astype(int)
    else:
        moral_cols = ["care", "authority", "fairness", "loyalty", "sanctity"]
        has_no_foundation = mfrc_unique[moral_cols].max(1) == 0
        labels = np.array(has_no_foundation, dtype=int)

    return clf, features, labels

In [16]:
# Run every (embedding, foundation) classifier on the MFRC data and store the
# labels, hard predictions and positive-class probabilities as JSON.
foundations = ["authority", "care", "fairness", "loyalty", "sanctity", "none"]
emb_names = ["tfidf", "spacy", "bert", "glove"]
for emb_name in emb_names:
    for foundation in foundations:
        model, X, y = get_model_and_data(emb_name=emb_name, foundation=foundation)
        model_name = f"logreg_{emb_name}_{foundation}"

        evals = {
            "y_true": y.tolist(),
            "y_pred": model.predict(X).tolist(),
            # Column 1 of predict_proba is used as the score
            "y_score": model.predict_proba(X)[:, 1].tolist()
        }

        with open(f"data/MFRC/eval_results/{model_name}.json", "w") as f:
            json.dump(evals, f)