# Moral Foundations Reddit Corpus

In [12]:
import numpy as np
import pandas as pd
import spacy
import ast
from multiprocessing import Pool
from tqdm import tqdm
import json
import sklearn
import os

# Load the medium English spaCy pipeline once; later cells reuse `nlp` for
# tokenization and for document vectors.
nlp = spacy.load("en_core_web_md")

## Read data

Ensure that the `final_mfrc_data.csv` file is in the `data` folder.

In [17]:
# Load the raw annotation table (columns include text, annotator and annotation,
# as the preview shows). The CSV's saved index arrives as an "Unnamed: 0" column;
# nothing below reads it.
mfrc = pd.read_csv("data/final_mfrc_data.csv")
mfrc.head(1)

Unnamed: 0,text,subreddit,bucket,annotator,annotation,confidence
0,That particular part of the debate is especial...,europe,French politics,annotator03,Non-Moral,Confident


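- Deduplicate the texts: keep one row per unique text and count, per foundation, how many annotations it received.
- Map the MFRC annotation labels to the MFD foundations (**TODO:** this mapping needs double-checking).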

In [None]:
# Build (or reload) `mfrc_unique`: one row per unique text, holding the number
# of annotations received for each foundation plus a filtered token list.
PREPROCESSED_PATH = "data/mfrc_preprocessed.csv"
# Flip to True to rebuild the cache, e.g. after editing `key_map` below.
REBUILD_PREPROCESSED = False

# MFRC annotation label -> foundation column of `mfrc_unique`.
key_map = {
    "Thin Morality": "none",
    "Non-Moral": "none",
    "Care": "care",
    "Purity": "sanctity",
    "Authority": "authority",
    "Loyalty": "loyalty",
    "Proportionality": "fairness",
    "Equality": "fairness"
}

if REBUILD_PREPROCESSED or not os.path.isfile(PREPROCESSED_PATH):
    mfrc_unique = pd.DataFrame()
    mfrc_unique["text"] = mfrc.text.unique()

    for column in ["authority", "care", "fairness", "loyalty", "sanctity", "none"]:
        mfrc_unique[column] = 0
    # Index by text so each annotation row can increment its text's counters.
    mfrc_unique.set_index("text", inplace=True)

    for _, row in tqdm(mfrc.iterrows(), dynamic_ncols=True,
                       total=len(mfrc)):
        text = row["text"]
        # One annotation cell may hold several comma-separated labels.
        for label in row["annotation"].split(","):
            mfrc_unique.loc[text, key_map[label.strip()]] += 1

    # Move the text back into a column and restore a positional index.
    mfrc_unique["text"] = mfrc_unique.index
    mfrc_unique.index = range(len(mfrc_unique))

    def _content_tokens(text):
        """Return the lowercased spaCy tokens of `text` that are alphabetic and at least 3 characters long."""
        normalized = (token.text.lower().strip() for token in nlp(text))
        return [word for word in normalized if word.isalpha() and len(word) >= 3]

    mfrc_unique["tokens"] = mfrc_unique["text"].map(_content_tokens)
    mfrc_unique.to_csv(PREPROCESSED_PATH)
else:
    # Reuse the cached table so a full re-run skips the slow loop and spaCy pass.
    mfrc_unique = pd.read_csv(PREPROCESSED_PATH, index_col=0)
    # The CSV stores each token list as its repr string; parse it back to a list.
    mfrc_unique["tokens"] = mfrc_unique["tokens"].map(ast.literal_eval)

100%|███████████████████████████████████████████████████████████████████| 61226/61226 [00:24<00:00, 2516.14it/s]


## Score using MFD

In [8]:
# NOTE: this rebinds `mfrc` from the raw annotation table to the preprocessed
# table written by the preprocessing cell above; every later use of `mfrc`
# refers to this preprocessed frame.
mfrc = pd.read_csv("data/mfrc_preprocessed.csv", index_col=0)

In [11]:
# Turn the pre-computed lexicon scores into binary predictions and write one
# JSON file of {y_true, y_pred, y_score} per (lexicon, foundation) pair.
lexicons = ["mfd", "mfd2", "emfd"]
scores = {}
for lexicon in lexicons:
    scores[lexicon] = pd.read_csv(f"data/mfrc_scoring_results/mfrc_{lexicon}.csv", index_col=0)
foundations = ["authority", "care", "fairness", "loyalty", "sanctity"]
for lexicon in lexicons:
    # The score table depends only on the lexicon, so look it up once here.
    scores_df = scores[lexicon]
    for foundation in foundations:
        # A text is positive when at least one annotation named this foundation.
        # NOTE(review): assumes the rows of the score file line up with `mfrc` — confirm.
        y = (mfrc[foundation] > 0).astype(int)
        y_score = scores_df[foundation]
        # Predict positive for scores at or above the median score.
        y_pred = np.array(y_score >= y_score.median(), dtype=int)

        # Named `evals` (not `eval`) so the builtin is not shadowed.
        evals = {
            "y_true": y.tolist(),
            "y_pred": y_pred.tolist(),
            "y_score": y_score.tolist()
        }

        with open(f"data/mfrc_scoring_results/by_foundation/{lexicon}_{foundation}.json", "w") as f:
            json.dump(evals, f)

## Predict using logistic regression

Embed the dataset

In [10]:
import pickle
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
    TfidfTransformer
)
from scipy.sparse import save_npz, load_npz

In [11]:
# TF-IDF features, cached on disk as a sparse matrix.
if not os.path.isfile("data/MFRC/embeddings/tfidf.npz"):
    # `sentences` is also read by the embedding cells further down.
    sentences = mfrc.text.unique()
    tokens = mfrc_unique.tokens

    # NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
    with open("emfd/data/embeddings/tfidf_vocab.pkl", "rb") as f:
        tfidf_vocab = pickle.load(f)
    # The documents are already token lists, so the tokenizer is the identity.
    # token_pattern=None silences scikit-learn's warning that the default
    # pattern is unused when a custom tokenizer is given.
    # scikit-learn documents min_df/max_df as ignored when a vocabulary is
    # supplied; they are kept only to leave the configuration unchanged.
    tfidf_vec = TfidfVectorizer(tokenizer=lambda doc_tokens: doc_tokens,
                                token_pattern=None,
                                lowercase=False, stop_words=None,
                                min_df=3,
                                max_df=0.99,
                                vocabulary=tfidf_vocab
                                )
    # NOTE(review): fit_transform re-estimates the IDF weights on this corpus.
    X_tfidf = tfidf_vec.fit_transform(tokens)
    save_npz("data/MFRC/embeddings/tfidf.npz", X_tfidf)

In [12]:
# spaCy document vectors, cached on disk.
if not os.path.isfile("data/MFRC/embeddings/spacy_300.npz"):
    # Defined here rather than borrowed from the TF-IDF cell: that cell's body
    # is skipped once its own cache file exists, leaving `sentences` unset.
    sentences = mfrc.text.unique()
    X_spacy = np.zeros((len(sentences), 300), dtype=float)
    # `total` lets tqdm show real progress; enumerate() hides the length.
    for i, doc in tqdm(enumerate(nlp.pipe(sentences, n_process=-1)),
                       total=len(sentences)):
        X_spacy[i, :] = doc.vector
    np.savez("data/MFRC/embeddings/spacy_300.npz", X_spacy)

In [13]:
# GloVe (Twitter, 200-d) sentence vectors, cached on disk.
if not os.path.isfile("data/MFRC/embeddings/glove_twitter_200.npz"):
    from utils import make_one_concept
    from gensim.models import KeyedVectors
    # Defined here rather than borrowed from the TF-IDF cell: that cell's body
    # is skipped once its own cache file exists, leaving `sentences` unset.
    sentences = mfrc.text.unique()
    # load the Stanford GloVe model
    glove_filename = "data/embeddings/glove.twitter.27B.200d"
    word2vec_output_file = glove_filename+'.word2vec'
    glove = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
    X_glove = np.zeros((len(sentences), 200))
    for i, doc in tqdm(enumerate(nlp.pipe(sentences, n_process=-1)),
                       desc="Processed", disable=False,
                       dynamic_ncols=True, unit=" docs",
                       total=len(sentences)):
        # Named `lemmas` so the `tokens` series from the TF-IDF cell is not clobbered.
        lemmas = [tok.lemma_.lower() for tok in doc]
        X_glove[i] = make_one_concept(model=glove, word_list=lemmas,
                                    normalize=True)
    np.savez("data/MFRC/embeddings/glove_twitter_200.npz", X_glove)
    # Free the embedding model before the next cell runs.
    del glove

In [14]:
# Sentence-RoBERTa embeddings, cached on disk.
if not os.path.isfile("data/MFRC/embeddings/sentence_roberta.npz"):
    from sentence_transformers import SentenceTransformer
    # Defined here rather than borrowed from the TF-IDF cell: that cell's body
    # is skipped once its own cache file exists, leaving `sentences` unset.
    sentences = mfrc.text.unique()
    # No explicit .to("cuda"): per its documentation SentenceTransformer uses a
    # GPU when one is available and otherwise falls back to the CPU, so the
    # cell no longer crashes on machines without CUDA.
    encoder = SentenceTransformer("stsb-roberta-large")

    X_bert = encoder.encode(sentences)
    np.savez("data/MFRC/embeddings/sentence_roberta.npz", X_bert)
    # Free the transformer before the next cell runs.
    del encoder

Evaluate

In [15]:
import pickle
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
    TfidfTransformer
)

# Reload the TF-IDF vocabulary pickle.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
with open("emfd/data/embeddings/tfidf_vocab.pkl", "rb") as vocab_file:
    tfidf_vocab = pickle.load(vocab_file)

# Feature matrices keyed by embedding name. TF-IDF is stored as a scipy sparse
# matrix; the dense embeddings were written with np.savez, so each sits under
# the default "arr_0" key of its archive.
embs = {"tfidf": load_npz("data/MFRC/embeddings/tfidf.npz")}
for emb_key, npz_path in [
    ("spacy", "data/MFRC/embeddings/spacy_300.npz"),
    ("bert", "data/MFRC/embeddings/sentence_roberta.npz"),
    ("glove", "data/MFRC/embeddings/glove_twitter_200.npz"),
]:
    embs[emb_key] = np.load(npz_path)["arr_0"]
def get_model_and_data(emb_name, foundation):
    """Load the pickled classifier for one (embedding, foundation) pair with its MFRC data.

    Parameters
    ----------
    emb_name : str
        Key into the module-level `embs` dict.
    foundation : str
        A foundation column of `mfrc_unique`, or "none".

    Returns
    -------
    tuple
        `(model, X, y)`. `model` is the unpickled classifier with `n_jobs=-1`
        set, and `X` is `embs[emb_name]`. For "none", `y` is an int array that
        is 1 where no moral-foundation column has a count above zero. For any
        other foundation, `y` is an int Series that is 1 where that
        foundation's count is above zero.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    model_path = f"emfd/data/sentence_classifiers/logreg_{emb_name}_{foundation}.pkl"
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)
    model.set_params(n_jobs=-1)

    X = embs[emb_name]

    if foundation != "none":
        y = (mfrc_unique[foundation] > 0).astype(int)
        return model, X, y

    # "none" is positive exactly when every moral-foundation count is zero.
    moral_columns = ["care", "authority", "fairness", "loyalty", "sanctity"]
    has_no_foundation = mfrc_unique[moral_columns].max(axis=1) == 0
    y = np.array(has_no_foundation, dtype=int)
    return model, X, y

In [16]:
# Run every pre-trained classifier on its MFRC features and write the labels,
# hard predictions and positive-class probabilities to one JSON file per model.
foundations = ["authority", "care", "fairness", "loyalty", "sanctity", "none"]
emb_names = ["tfidf", "spacy", "bert", "glove"]
for emb_name, foundation in ((e, fd) for e in emb_names for fd in foundations):
    model_name = f"logreg_{emb_name}_{foundation}"
    model, X, y = get_model_and_data(emb_name=emb_name, foundation=foundation)

    y_true = y.tolist()
    # Column 1 of predict_proba is the probability of the positive class.
    y_score = model.predict_proba(X)[:, 1].tolist()
    y_pred = model.predict(X).tolist()
    evals = dict(y_true=y_true, y_pred=y_pred, y_score=y_score)

    with open(f"data/MFRC/eval_results/{model_name}.json", "w") as out_file:
        json.dump(evals, out_file)