In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, ttest_rel, ttest_1samp, chi2_contingency, spearmanr, pearsonr
import seaborn as sns
import pickle
import json
from collections import Counter
from itertools import chain
from tqdm import tqdm
tqdm.pandas()
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Root of the data tree. Every directory constant below ends with a trailing slash.
DATA_DIR = "../Data/"
PRE_LIWC_DIR = f"{DATA_DIR}all_subreddit_data_pre_liwc/"
LIWC_DIR = f"{DATA_DIR}liwc_data/"
TOXICITY_DIR = f"{DATA_DIR}toxicity_scores/"
TRAINING_DIR = f"{DATA_DIR}training_data/"
TESTING_DIR = f"{DATA_DIR}testing_data/"
PREDICTIONS_DIR = f"{DATA_DIR}regression_predictions/"
MATCHED_TESTING_AUTHORS_DIR = f"{DATA_DIR}matched_testing_authors/"
NO_MANO_PARENTS_DIR = f"{DATA_DIR}no_mano_parents_results/"

# Subreddits analysed throughout the notebook (14 in total).
all_subreddits = [
    'news', 'askreddit', 'worldnews', 'todayilearned', 'askmen', 'movies', 'technology',
    'politics', "adviceanimals", "videos", "pics", "funny", "wtf", "gaming",
]
# Combined LIWC output for all subreddits; the first CSV column is used as the index.
all_data_liwc = pd.read_csv(f"{LIWC_DIR}/all_subreddit_data_liwc_2024_14_subreddits.csv", index_col=0)

In [None]:
# Quick sanity check: (rows, columns) of the combined LIWC table.
all_data_liwc.shape

In [None]:
def prepare_for_liwc(subreddits):
    """Export the pre-LIWC pickles as CSV and build one combined, de-duplicated CSV.

    For each subreddit the pickle ``{subreddit}_all_subreddit_data_pre_liwc.pkl`` in
    ``PRE_LIWC_DIR`` is loaded, its ``body`` column is renamed to ``utterance``, and the
    frame is written as CSV into ``PRE_LIWC_DIR`` -- the location ``load_from_liwc``
    reads the per-subreddit CSV from. All frames are then concatenated, de-duplicated
    on ``id`` and written to ``LIWC_DIR``.

    Args:
        subreddits: iterable of subreddit names used to build the file names.

    Returns:
        None. Shapes are printed as a progress/sanity trace.
    """
    dfs = []
    for subreddit in subreddits:
        # NOTE: read_pickle can execute arbitrary code; only load pickles produced by this project.
        all_data = pd.read_pickle(f"{PRE_LIWC_DIR}/{subreddit}_all_subreddit_data_pre_liwc.pkl").rename(columns={"body": "utterance"})
        # Write next to the pickle (previously this went to the current working directory,
        # which is not where load_from_liwc looks for it).
        all_data.to_csv(f"{PRE_LIWC_DIR}/{subreddit}_all_subreddit_data_pre_liwc.csv")
        print(all_data.shape)
        dfs.append(all_data)

    mega = pd.concat(dfs)
    print(mega.shape)
    # The same post can appear in several per-subreddit frames; keep the first occurrence.
    mega = mega.drop_duplicates("id")
    print(mega.shape)
    mega.to_csv(f"{LIWC_DIR}/all_subreddit_data_pre_liwc_2024_14_subreddits.csv")

# prepare_for_liwc(all_subreddits)

In [None]:
def get_toxicity_dict():
    """Build one ``id -> toxicity`` mapping from all toxicity score files.

    The files in ``TOXICITY_DIR`` are read in the order listed; when an id occurs in
    more than one file, the value from the later file wins.

    Returns:
        dict mapping each ``id`` value to its ``toxicity`` value.
    """
    toxicity_files = (
        "hate_speech_march_5_toxicity.csv",
        "worldnews_til_toxicity.csv",
        "hate_speech_politics_technology_toxicity.csv",
        "hate_speech_all_used_data_toxicity.csv",
    )
    id_to_toxicity = {}
    for file_name in toxicity_files:
        scores = pd.read_csv(TOXICITY_DIR + file_name, index_col=0)
        id_to_toxicity.update(scores.set_index("id")['toxicity'].to_dict())
    return id_to_toxicity


def add_toxicity_values(df):
    """Add a ``toxicity`` column to ``df`` (in place) by looking up each row's ``id``.

    Ids without a score are first marked NaN so the amount of missing data can be
    reported (count, column shape, fraction), and are then set to 0.

    Args:
        df: DataFrame with an ``id`` column; it is modified in place.

    Returns:
        The same DataFrame object, now carrying ``toxicity``.
    """
    id_to_toxicity = get_toxicity_dict()
    df['toxicity'] = df['id'].progress_apply(lambda post_id: id_to_toxicity.get(post_id, np.nan))

    # Report how many posts have no toxicity score before the gaps are filled.
    n_missing = df['toxicity'].isna().sum()
    column_shape = df['toxicity'].shape
    print(n_missing)
    print(column_shape)
    print(n_missing / column_shape[0])

    df['toxicity'] = df['toxicity'].fillna(0)
    return df

In [None]:
def load_from_liwc(subreddit):
    """Load one subreddit's pre-LIWC CSV and join its LIWC features and toxicity.

    The leading columns of the global ``all_data_liwc`` are assumed to correspond
    positionally to the columns of the pre-LIWC CSV; every column after them is treated
    as a LIWC feature. NOTE: this renames the columns of the global ``all_data_liwc``
    in place.

    Args:
        subreddit: subreddit name used to build the CSV file name.

    Returns:
        (pre-LIWC frame, merged frame with LIWC columns and ``toxicity``).
    """
    pre_liwc = pd.read_csv(f"{PRE_LIWC_DIR}/{subreddit}_all_subreddit_data_pre_liwc.csv", index_col=0)
    original_columns = pre_liwc.columns.tolist()

    # Everything past the original columns is LIWC output.
    liwc_columns = all_data_liwc.columns[len(original_columns):].tolist()
    all_data_liwc.columns = original_columns + liwc_columns

    liwc_only = all_data_liwc[['id', *liwc_columns]]
    merged = pre_liwc.merge(liwc_only, on="id")
    # The merge must neither drop nor duplicate posts.
    assert merged.shape[0] == pre_liwc.shape[0]

    return pre_liwc, add_toxicity_values(merged)


def load_features(og_data, liwc_data):
    """Derive the syntactic / semantic / uncivil feature lists and write them to disk.

    Semantic features are all columns of ``liwc_data`` that are neither syntactic
    features, nor columns of ``og_data``, nor uncivil features. Top-level LIWC
    categories are then removed from both the syntactic and the semantic list. Each
    list is written tab-separated to ``DATA_DIR`` and its length is printed.

    The semantic features are emitted in the column order of ``liwc_data`` (rather than
    in arbitrary set order) so the written file is identical from run to run.

    Args:
        og_data: pre-LIWC DataFrame; its columns are excluded from the semantic features.
        liwc_data: DataFrame containing the LIWC feature columns.

    Returns:
        None.
    """
    functional_syntactic_features = ['WC', 'WPS', 'Sixltr', 'Dic', 'function', 'pronoun', 'ppron', 'i', 'we',
                                     'you', 'shehe', 'they', 'ipron', 'article', 'prep', 'auxverb', 'conj', 'negate',
                                     'interrog', 'number', 'quant', 'AllPunc', 'Period', 'Comma', 'Colon',
                                     'SemiC', 'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP',
                                     "ttr"]

    uncivil_features = ['valence', 'politeness', 'toxicity']

    top_level_features = ["AllPunc", "Dic", 'function', 'pronoun', 'ppron', 'affect', 'negemo', 'social', 'cogproc', 'percept', 'bio', 'time', 'drives', 'relativ', 'informal']
    top_level = set(top_level_features)

    # Columns that must not count as semantic features.
    non_semantic = set(functional_syntactic_features) | set(og_data.columns) | set(uncivil_features)
    # dict.fromkeys de-duplicates while preserving column order, which keeps the output deterministic.
    liwc_semantic_features = [
        column for column in dict.fromkeys(liwc_data.columns)
        if column not in non_semantic and column not in top_level
    ]
    functional_syntactic_features = [feature for feature in functional_syntactic_features if feature not in top_level]

    feature_files = [
        ("syntactic_features.txt", functional_syntactic_features),
        ("semantic_features.txt", liwc_semantic_features),
        ("uncivil_features.txt", uncivil_features),
    ]
    for file_name, features in feature_files:
        with open(f"{DATA_DIR}/{file_name}", "w") as f:
            f.write("\t".join(features))

    print(f"# Syntactic Features: {len(functional_syntactic_features)}")
    print(f"# Semantic Features: {len(liwc_semantic_features)}")
    print(f"# Uncivil Features: {len(uncivil_features)}")



def create_datasets(subreddit, liwc_data, to_save):
    """Split ``liwc_data`` into training and testing rows via its ``is_training`` flag.

    Args:
        subreddit: subreddit name, used only for the output file names.
        liwc_data: DataFrame with an ``is_training`` column (1 = training, 0 = testing).
        to_save: when truthy, both splits are written as CSV (without the index) to
            ``TRAINING_DIR`` and ``TESTING_DIR``.

    Returns:
        (training_data, testing_data)
    """
    training_flag = liwc_data['is_training']
    training_data = liwc_data[training_flag == 1]
    testing_data = liwc_data[training_flag == 0]

    if not to_save:
        return training_data, testing_data

    training_data.to_csv(f"{TRAINING_DIR}/{subreddit}_training_data.csv", index=False)
    testing_data.to_csv(f"{TESTING_DIR}/{subreddit}_testing_data.csv", index=False)
    return training_data, testing_data


def preprocess_for_r(subreddit_of_interest):
    """Run the full preprocessing for one subreddit and save its train/test CSVs.

    Loads the LIWC-augmented data, writes the feature-list files, then splits and saves
    the training and testing sets.

    Returns:
        (training_data, testing_data) as returned by ``create_datasets``.
    """
    original_data, liwc_data = load_from_liwc(subreddit_of_interest)
    load_features(original_data, liwc_data)
    datasets = create_datasets(subreddit_of_interest, liwc_data, to_save=True)
    return datasets


# for sub in all_subreddits:
#     print(sub)
#     preprocess_for_r(sub)

In [None]:
def load_predicted_data(data_name, style_type, test_data, include_parent_features=False):
    """Attach predicted probabilities to the test posts and pair each reply with its parent.

    Reads ``{data_name}_{style_type}_testing_data_predictions.csv`` from
    ``PREDICTIONS_DIR`` and copies its ``probs`` column onto a re-indexed copy of
    ``test_data``. Rows with ``is_parent == 0`` are replies; each one is matched to the
    ``is_parent == 1`` row whose ``id`` equals the reply's ``cropped_id``.

    Args:
        data_name: prefix of the predictions file name.
        style_type: feature-set tag in the predictions file name.
        test_data: testing DataFrame (not modified; a re-indexed copy is used).
        include_parent_features: when true, also copy the parent's ``female``, ``male``,
            ``toxicity``, ``politeness`` and ``valence`` into ``*_parent`` columns.

    Returns:
        (validation replies with ``label == 2``, remaining replies with ``label != 2``).
    """
    predictions_path = f"{PREDICTIONS_DIR}/{data_name}_{style_type}_testing_data_predictions.csv"
    predictions = pd.read_csv(predictions_path, index_col=0)
    # Shift the index down by one so it lines up with the 0-based index used below
    # (presumably the predictions were exported with 1-based row numbers -- TODO confirm).
    predictions.index = predictions.index - 1

    scored = test_data.reset_index(drop=True)
    scored['probs'] = predictions['probs']

    replies = scored[scored['is_parent'] == 0]
    parents = scored[scored['is_parent'] == 1].drop_duplicates("id")

    # Look up one parent row per reply, in reply order.
    parents = parents.set_index("id").loc[replies['cropped_id']]
    assert parents.shape[0] == replies.shape[0]

    # Give both frames the same 0..n-1 index so the column assignments below align row by row.
    parents = parents.reset_index()
    replies = replies.reset_index()

    replies['parent_author'] = parents['author']
    replies['parent_probs'] = parents['probs']
    replies['more_manospheric'] = replies['probs'] - replies['parent_probs']

    if include_parent_features:
        for feature in ('female', 'male', 'toxicity', 'politeness', 'valence'):
            replies[f"{feature}_parent"] = parents[feature]

    validation_replies = replies[replies['label'] == 2]
    test_replies = replies[replies['label'] != 2]
    return validation_replies, test_replies

In [None]:
def compute_cohens_d(arr_1, arr_2):
    """Return Cohen's d between two samples, rounded to two decimals.

    The effect size is the difference of the means divided by the square root of the
    average of the two variances (computed from ``np.std``).
    """
    mean_difference = np.mean(arr_1) - np.mean(arr_2)
    variance_1 = np.std(arr_1) ** 2
    variance_2 = np.std(arr_2) ** 2
    pooled_std = np.sqrt((variance_1 + variance_2) / 2)
    effect_size = mean_difference / pooled_std
    return np.round(effect_size, 2)

In [None]:
def aggregate_data(validation_data_liwc, test_data_liwc, matched_authors):
    """Average the numeric columns per author and split the test authors by group.

    Authors whose mean ``label`` is 1 form the manosphere group and those with 0 the
    baseline group. If the bot account ``autotldr`` is among the test authors it is
    removed from the manosphere and validation aggregates, and its matched author
    (``matched_authors['autotldr']``) is removed from the baseline aggregate; the
    remaining post/author counts are printed.

    Args:
        validation_data_liwc: post-level validation frame with ``author``, ``label``,
            ``id`` and ``probs`` columns.
        test_data_liwc: post-level test frame with the same columns.
        matched_authors: mapping from a manosphere author to the matched baseline author.

    Returns:
        (validation_agg, manosphere_agg, baseline_agg), each indexed by author.
    """
    # numeric_only=True: the post-level frames carry non-numeric columns such as `id`.
    # pandas >= 2.0 raises TypeError on them instead of silently dropping them.
    validation_agg = validation_data_liwc.groupby("author").mean(numeric_only=True)
    test_agg = test_data_liwc.groupby("author").mean(numeric_only=True)
    manosphere_agg = test_agg[test_agg['label'] == 1]
    baseline_agg = test_agg[test_agg['label'] == 0]

    if "autotldr" in test_agg.index:
        print("FOUND BOT")
        bot_match = matched_authors['autotldr']
        manosphere_agg = manosphere_agg[~manosphere_agg.index.isin(["autotldr"])]
        validation_agg = validation_agg[~validation_agg.index.isin(["autotldr"])]
        baseline_agg = baseline_agg[~baseline_agg.index.isin([bot_match])]

        # Post/author counts per label once the bot (and its match) are excluded.
        validation_without_bot = validation_data_liwc[~validation_data_liwc['author'].isin(["autotldr"])]
        print(validation_without_bot.groupby("label")[['author', 'id']].nunique())

        test_without_bot = test_data_liwc[~test_data_liwc['author'].isin(["autotldr", bot_match])]
        print(test_without_bot.groupby("label")[['author', 'id']].nunique())
        print(manosphere_agg.shape)

    # Mean predicted probability per group, as a quick sanity trace.
    print(baseline_agg['probs'].mean())
    print(manosphere_agg['probs'].mean())
    print(validation_agg['probs'].mean())

    return validation_agg, manosphere_agg, baseline_agg

In [None]:
def main(data_name, style_type, metadata, remove_manospheric_parents, remove_responses_to_manospheric_posts=False):
    """Run the full evaluation for one data set and return statistics, tests and data.

    Loads the testing CSV for ``data_name``, attaches the predictions, aggregates per
    author and computes descriptive statistics plus hypothesis tests.

    Args:
        data_name: file-name prefix of the testing data and the predictions.
        style_type: feature-set tag in the predictions file name.
        metadata: dict with ``subreddit``, ``subculture`` and ``matched_authors``.
        remove_manospheric_parents: accepted for call compatibility; not used here.
        remove_responses_to_manospheric_posts: forwarded to ``compute_statistics``,
            where it selects independent-samples instead of matched-pairs tests.
            ``main_wrapper`` passes this as a fifth positional argument, which the
            previous four-parameter signature rejected.

    Returns:
        (descriptive statistics DataFrame, hypothesis tests DataFrame,
        dict of the per-author ``baseline`` / ``manosphere`` / ``validation`` frames).
    """
    testing_data = pd.read_csv(f"{TESTING_DIR}/{data_name}_testing_data.csv")
    validation_data_liwc, test_data_liwc = load_predicted_data(data_name, style_type, testing_data)

    validation_data, incel_data, control_data = aggregate_data(validation_data_liwc, test_data_liwc, metadata['matched_authors'])
    stats, tests = compute_statistics(validation_data, incel_data, control_data, metadata, remove_responses_to_manospheric_posts)
    relevant_data = {"baseline": control_data, "manosphere": incel_data, "validation": validation_data}
    return pd.DataFrame(stats).T, pd.DataFrame(tests).T, relevant_data

def main_wrapper(subreddits, style_type, remove_manospheric_parents=False, remove_responses_to_manospheric_posts=False):
    """Run ``main`` for every subreddit and collect the results.

    Args:
        subreddits: iterable of subreddit names.
        style_type: feature-set tag forwarded to ``main``.
        remove_manospheric_parents: forwarded to ``main``.
        remove_responses_to_manospheric_posts: forwarded to ``main``.

    Returns:
        (list of statistics frames, list of test frames,
        dict mapping subreddit -> data dict returned by ``main``).
    """
    extended_subcultures = ["Manosphere"]
    all_stats, all_tests, all_data = [], [], {}

    for subreddit in subreddits:
        metadata = {
            'subreddit': subreddit,
            "matched_authors": pd.read_pickle(f"{MATCHED_TESTING_AUTHORS_DIR}/{subreddit}_matched_authors.pkl"),
        }

        for subculture in extended_subcultures:
            metadata['subculture'] = subculture
            # The Manosphere run uses the bare subreddit name as its file prefix.
            if subculture == "Manosphere":
                data_name = subreddit
            else:
                data_name = f"{subreddit}_{subculture}"

            stats, tests, data = main(data_name, style_type, metadata, remove_manospheric_parents, remove_responses_to_manospheric_posts)
            all_stats.append(stats)
            all_tests.append(tests)
            all_data[subreddit] = data

    return all_stats, all_tests, all_data




def _run_hypothesis_test(header, test_label, ttest_func, cohens_d_func, test_samples, effect_samples):
    """Run one t-test plus effect size, print the summary and return the result record."""
    print(header)
    print(test_label)
    tstat, p_val = ttest_func(*test_samples)
    cohens_d = cohens_d_func(*effect_samples)
    print(f"Statistic: {np.round(tstat, 2)}\np-value: {np.round(p_val, 6)}\nCohen's D: {np.round(cohens_d, 2)}")
    return {"statistic": tstat, "p_val": p_val, "cohens_d": cohens_d}


def compute_statistics(validation_data, incel_data, control_data, metadata, remove_responses_to_manospheric_posts=False):
    """Compute per-group descriptive statistics and three hypothesis tests.

    Args:
        validation_data: per-author aggregate of manospheric authors' validation posts.
        incel_data: per-author aggregate of manospheric authors' test posts.
        control_data: per-author aggregate of the baseline authors.
        metadata: dict with ``subreddit``, ``subculture`` and ``matched_authors``
            (manospheric author -> matched baseline author).
        remove_responses_to_manospheric_posts: when true, the groups are treated as
            unpaired (independent-samples t-test, Cohen's d between the groups).
            Otherwise (default) ``control_data`` is reordered so row i is the match of
            manospheric author i, and matched-pairs t-tests are used with Cohen's d
            computed on the pairwise differences.

    Returns:
        (descriptive_statistics, hypothesis_test_data): dicts keyed by
        ``(subreddit, subculture, group)`` and ``(subreddit, subculture, test)``.
    """
    subreddit = metadata['subreddit']
    subculture = metadata['subculture']
    matched_authors = metadata['matched_authors']

    if remove_responses_to_manospheric_posts:
        test_label = "Independent samples t-test"
        ttest_func = ttest_ind
        cohens_d_func = compute_cohens_d
    else:
        test_label = "Matched pairs t-test"
        ttest_func = ttest_rel
        n_pairs = len(incel_data['probs'])

        def cohens_d_func(x, y):
            # Effect size of the pairwise differences against an all-zero sample.
            return compute_cohens_d(x - y, np.zeros(n_pairs))

        # Order the baseline rows so that row i is the match of manospheric author i.
        control_data = control_data.loc[[matched_authors[author] for author in incel_data.index]]

    # Group-level descriptive statistics.
    descriptive_statistics = {}
    for df, title in zip([control_data, incel_data, validation_data], ['Baseline', "Test", "Validation"]):
        descriptive_statistics[(subreddit, subculture, title)] = {
            "num_users": len(df['probs']),
            "reply_mano_mean": np.mean(df['probs']),
            "reply_mano_std": np.std(df['probs']),
            "parent_mano_mean": np.mean(df['parent_probs']),
            "parent_mano_std": np.std(df['parent_probs']),
            "more_mano_mean": np.mean(df['more_manospheric']),
            "more_mano_std": np.std(df['more_manospheric']),
        }

    # Hypothesis tests: (result key, printed header, t-test inputs, effect-size inputs).
    # Tests 2 and 3 pass plain arrays to the effect size because the two groups are
    # indexed by different authors and must be paired by position, not by label.
    print("Hypothesis Tests")
    comparisons = [
        ("compare_to_validation",
         "\n Test 1: Manospheric Replies vs. On-Manosphere Replies",
         (incel_data['probs'], validation_data['probs']),
         (incel_data['probs'], validation_data['probs'])),
        ("compare_to_control",
         "\n Test 2: Manospheric Replies vs. Control Replies",
         (incel_data['probs'], control_data['probs']),
         (incel_data['probs'].to_numpy(), control_data['probs'].to_numpy())),
        ("compare_parents",
         "\n Test 3: Manospheric Parents vs. Control Parents",
         (incel_data['parent_probs'], control_data['parent_probs']),
         (incel_data['parent_probs'].to_numpy(), control_data['parent_probs'].to_numpy())),
    ]

    hypothesis_test_data = {}
    for test_name, header, test_samples, effect_samples in comparisons:
        hypothesis_test_data[(subreddit, subculture, test_name)] = _run_hypothesis_test(
            header, test_label, ttest_func, cohens_d_func, test_samples, effect_samples
        )

    return descriptive_statistics, hypothesis_test_data

In [None]:
# subreddit_of_interest = 'askmen'
# style_type = 'all_features_reduced'
# Run the whole evaluation for every subreddit with the 'emnlp_data' feature set.
# The results (all_stats, all_tests, all_mano_data) are used by the plotting and table cells below.
for style_type in ['emnlp_data']:
    all_stats, all_tests, all_mano_data = main_wrapper(all_subreddits, style_type)

In [None]:
# Side-by-side KDE plots (reply vs. parent manosphericness) for a single subreddit
def plot_subreddit_data(validation_data, incel_data, control_data, style_type, plot_metadata):
    """Plot reply and parent probability densities for one subreddit and save the figure.

    The left panel shows the ``probs`` densities of the baseline, validation and
    manospheric groups; the right panel shows the ``parent_probs`` densities of the
    baseline and manospheric groups. The figure is saved as
    ``{subreddit}_{subculture}ness_{style_type}.png`` and displayed.

    Args:
        validation_data, incel_data, control_data: per-author frames with ``probs``
            (and, for the last two, ``parent_probs``) columns.
        style_type: feature-set tag used in the output file name.
        plot_metadata: dict with ``subreddit``, ``subculture``, ``fontsize``,
            ``figsize_x`` and ``figsize_y``.
    """
    plt.rcParams.update({"font.size": 15})
    subreddit, subculture = plot_metadata['subreddit'], plot_metadata['subculture']
    fontsize = plot_metadata['fontsize']
    figure_size = (plot_metadata['figsize_x'], plot_metadata['figsize_y'])

    fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=figure_size)

    # Labels and line styles shared by both panels.
    baseline_label = f"Baseline on r/{subreddit}"
    outside_label = f"{subculture} on r/{subreddit}"
    baseline_style = dict(linestyle="dashed", legend=False, color="#f8766d")
    outside_style = dict(linestyle="solid", legend=False, color="green")

    # Left panel: replies.
    sns.kdeplot(control_data["probs"], label=baseline_label, ax=ax1, **baseline_style)
    sns.kdeplot(validation_data["probs"], label=f"{subculture} in {subculture}", ax=ax1, linestyle="dashdot", legend=False, color="#00bfc4")
    sns.kdeplot(incel_data["probs"], label=outside_label, ax=ax1, **outside_style)

    # Right panel: parents (no validation curve).
    sns.kdeplot(control_data["parent_probs"], label=baseline_label, ax=ax2, **baseline_style)
    sns.kdeplot(incel_data["parent_probs"], label=outside_label, ax=ax2, **outside_style)

    ax1.set_ylabel("Density", fontsize=fontsize, labelpad=20)
    for ax, panel in ((ax1, "Reply"), (ax2, "Parent")):
        ax.set_xlabel(f"{panel} {subculture}-ness", fontsize=fontsize, labelpad=20)
        ax.set_xlim(0.15, 0.85)
        ax.set_ylim(0, 12)
        ax.tick_params(axis='both', which='major', labelsize=fontsize)

    # The legend is built from the left panel, which carries all three curves.
    handles, labels = ax1.get_legend_handles_labels()
    fig.legend(handles, labels, loc='upper left', bbox_to_anchor=(0.2, 0.9), fancybox=True, shadow=True, fontsize=25)
    fig.suptitle(f"{subculture}-ness Distributions on \n r/{subreddit} (n={incel_data.shape[0]})", fontsize=fontsize+10, y=1.02)
    plt.savefig(f"{subreddit}_{subculture}ness_{style_type}.png", bbox_inches='tight')
    plt.show()

In [None]:
# 7 x 2 grid of per-subreddit KDE panels comparing baseline and manospheric authors
def plot_shifting_data(subreddit_data, group_type, plot_metadata):
    """Plot one KDE panel per subreddit on a 7 x 2 grid and save the figure.

    Args:
        subreddit_data: dict mapping subreddit -> dict with ``baseline``,
            ``manosphere`` and (for replies) ``validation`` frames.
        group_type: ``"reply"`` plots the ``probs`` column and adds the validation
            curve; ``"parent"`` plots ``parent_probs``.
        plot_metadata: dict with ``fontsize``, ``figsize_x`` and ``figsize_y``. An
            optional ``output_path`` overrides the default file ``{group_type}.png``.

    Raises:
        ValueError: if ``group_type`` is neither ``"reply"`` nor ``"parent"``.
    """
    # Validate before any figure is created; previously an unknown group_type only
    # failed later with an unbound-variable error.
    if group_type == "reply":
        field = "probs"
        axis_settings = dict(xlim=[0.15, 0.85], ylim=[0, 9], yticks=[0, 2, 4, 6, 8])
    elif group_type == 'parent':
        field = 'parent_probs'
        axis_settings = dict(xlim=[0.2, 0.8], ylim=[0, 15.5], yticks=[0, 2, 4, 6, 8, 10, 12, 14])
    else:
        raise ValueError(f"group_type must be 'reply' or 'parent', got {group_type!r}")

    fontsize = plot_metadata['fontsize']
    figsize_x = plot_metadata['figsize_x']
    figsize_y = plot_metadata['figsize_y']
    output_path = plot_metadata.get('output_path', f"{group_type}.png")
    plt.rcParams.update({"font.size": fontsize})

    # Display names used in the panel titles.
    sub_to_name = {
        "askreddit": "AskReddit",
        "news": "News",
        "worldnews": "WorldNews",
        "todayilearned": "TodayILearned",
        "askmen": "AskMen",
        "movies": "Movies",
        "politics": "Politics",
        "technology": "Technology",
        "adviceanimals": "AdviceAnimals",
        "wtf": "wtf",
        "videos": "Videos",
        "pics": "Pics",
        "funny": "Funny",
        "gaming": "Gaming",
    }

    fig, axes = plt.subplots(7, 2, sharey=True, sharex=True, figsize=(figsize_x, figsize_y))
    axes = axes.reshape(-1, )
    plt.setp(axes, **axis_settings)

    for subreddit, ax in zip(subreddit_data, axes):
        control_data = subreddit_data[subreddit]['baseline']
        incel_data = subreddit_data[subreddit]['manosphere']

        sns.kdeplot(control_data[field], label="Baseline", ax=ax, linestyle="dashed", legend=False, color="#cc0000")
        sns.kdeplot(incel_data[field], label="Mano (outside Mano)", ax=ax, linestyle="solid", legend=False, color="green")

        # The validation curve only exists for replies.
        if group_type == "reply":
            validation_data = subreddit_data[subreddit]['validation']
            sns.kdeplot(validation_data[field], label="Mano (on Mano)", ax=ax, linestyle="dashdot", legend=False, color="#1155cc")

        ax.set_title(f"r/{sub_to_name[subreddit]} (n={control_data.shape[0]})")
        ax.set(xlabel=None)
        ax.set(ylabel=None)

    fig.supxlabel(f'{group_type.title()} Manosphericness', fontsize=40, y=0.07)
    fig.supylabel('Density', fontsize=40, x=0.05)

    plt.savefig(output_path, bbox_inches='tight')
    plt.show()

In [None]:
# 5 x 3 grid of per-subreddit KDE panels with a shared legend in the unused last slot
def plot_shifting_data_three(subreddit_data, group_type, plot_metadata):
    """Plot one KDE panel per subreddit on a 5 x 3 grid, add a legend and save the figure.

    The last grid slot is removed from the figure and a figure-level legend is drawn
    with the handles of the last panel that was plotted.

    Args:
        subreddit_data: dict mapping subreddit -> dict with ``baseline``,
            ``manosphere`` and (for replies) ``validation`` frames.
        group_type: ``"reply"`` plots the ``probs`` column and adds the validation
            curve; ``"parent"`` plots ``parent_probs``.
        plot_metadata: dict with ``fontsize``, ``figsize_x`` and ``figsize_y``. An
            optional ``output_path`` overrides the default file ``{group_type}.png``,
            so that successive calls with the same ``group_type`` need not overwrite
            each other's output.

    Raises:
        ValueError: if ``group_type`` is neither ``"reply"`` nor ``"parent"``.
    """
    # Validate before any figure is created; previously an unknown group_type only
    # failed later with an unbound-variable error.
    if group_type == "reply":
        field = "probs"
        axis_settings = dict(xlim=[0.15, 0.85], ylim=[0, 9], yticks=[0, 2, 4, 6, 8])
    elif group_type == 'parent':
        field = 'parent_probs'
        axis_settings = dict(xlim=[0.2, 0.8], ylim=[0, 15.5], yticks=[0, 2, 4, 6, 8, 10, 12, 14])
    else:
        raise ValueError(f"group_type must be 'reply' or 'parent', got {group_type!r}")

    fontsize = plot_metadata['fontsize']
    figsize_x = plot_metadata['figsize_x']
    figsize_y = plot_metadata['figsize_y']
    output_path = plot_metadata.get('output_path', f"{group_type}.png")
    plt.rcParams.update({"font.size": fontsize})

    # Display names used in the panel titles.
    sub_to_name = {
        "askreddit": "AskReddit",
        "news": "News",
        "worldnews": "WorldNews",
        "todayilearned": "TodayILearned",
        "askmen": "AskMen",
        "movies": "Movies",
        "politics": "Politics",
        "technology": "Technology",
        "adviceanimals": "AdviceAnimals",
        "wtf": "wtf",
        "videos": "Videos",
        "pics": "Pics",
        "funny": "Funny",
        "gaming": "Gaming",
    }

    fig, axes = plt.subplots(5, 3, sharey=True, sharex=True, figsize=(figsize_x, figsize_y))
    axes = axes.reshape(-1, )
    plt.setp(axes, **axis_settings)

    for subreddit, ax in zip(subreddit_data, axes):
        control_data = subreddit_data[subreddit]['baseline']
        incel_data = subreddit_data[subreddit]['manosphere']

        sns.kdeplot(control_data[field], label="Baseline Authors", ax=ax, linestyle="dashed", legend=False, color="#cc0000")
        sns.kdeplot(incel_data[field], label="Manospheric Authors\n(Outside Manosphere)", ax=ax, linestyle="solid", legend=False, color="green")

        # The validation curve only exists for replies.
        if group_type == "reply":
            validation_data = subreddit_data[subreddit]['validation']
            sns.kdeplot(validation_data[field], label="Manospheric Authors\n(Inside Manosphere)", ax=ax, linestyle="dashdot", legend=False, color="#1155cc")

        ax.set_title(f"r/{sub_to_name[subreddit]} (n={control_data.shape[0]})")
        ax.set(xlabel=None)
        ax.set(ylabel=None)

    fig.supxlabel('Manosphericness', fontsize=30, y=0.07)
    fig.supylabel('Density', fontsize=30, x=0.07)

    # Legend entries come from the last plotted panel.
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc='upper right', bbox_to_anchor=(0.95, 0.24), fancybox=True, shadow=True, fontsize=20)
    # Remove the last grid slot from the figure.
    fig.delaxes(axes[-1])
    plt.savefig(output_path, bbox_inches='tight')
    plt.show()

In [None]:
# Figure settings for the grid over all subreddits in all_mano_data.
metadata = dict(fontsize=20, figsize_x=15, figsize_y=20)
plot_shifting_data_three(all_mano_data, "reply", metadata)

In [None]:
# Same plot restricted to three subreddits, using a much shorter figure.
metadata = dict(fontsize=20, figsize_x=15, figsize_y=4)
plot_shifting_data_three({key: all_mano_data[key] for key in ('worldnews', 'funny', 'askreddit')}, "reply", metadata)

In [None]:
# All training data: one DataFrame per subreddit, read from TRAINING_DIR.
subreddit_to_training_data = {}
for subreddit in all_subreddits:
    training_data = pd.read_csv(f"{TRAINING_DIR}/{subreddit}_training_data.csv")
    subreddit_to_training_data[subreddit] = training_data

In [None]:
# Per subreddit: number of distinct posts and authors in the manosphere (label 1)
# and baseline (label 0) training groups, formatted for a summary table.
subreddit_to_train_num = {}
for subreddit in all_subreddits:
    curr = subreddit_to_training_data[subreddit].groupby("label")[['author', 'id']].nunique()
    baseline_counts, mano_counts = curr.loc[0], curr.loc[1]
    num_baseline = f"{baseline_counts['id']} posts/{baseline_counts['author']} authors"
    num_mano = f"{mano_counts['id']} posts/{mano_counts['author']} authors"
    subreddit_to_train_num[subreddit] = {'M': num_mano, 'B': num_baseline}

In [None]:
# Training-set sizes as a table: one row per subreddit, columns 'M' and 'B'.
pd.DataFrame(subreddit_to_train_num).T

In [None]:
# All testing data: one DataFrame per subreddit, read from TESTING_DIR.
subreddit_to_test_data = {}
for subreddit in all_subreddits:
    testing_data = pd.read_csv(f"{TESTING_DIR}/{subreddit}_testing_data.csv")
    subreddit_to_test_data[subreddit] = testing_data

In [None]:
# Per subreddit: number of distinct posts and authors for the baseline (label 0),
# manosphere (label 1) and validation (label 2) groups of the testing data.
subreddit_to_test_num = {}
for subreddit in all_subreddits:
    curr = subreddit_to_test_data[subreddit].groupby("label")[['author', 'id']].nunique()
    baseline_counts, mano_counts, validation_counts = curr.loc[0], curr.loc[1], curr.loc[2]
    num_baseline = f"{baseline_counts['id']} posts/{baseline_counts['author']} authors"
    num_mano = f"{mano_counts['id']} posts/{mano_counts['author']} authors"
    num_validation = f"{validation_counts['id']} posts/{validation_counts['author']} authors"
    subreddit_to_test_num[subreddit] = {
        'M (in Manosphere)': num_validation,
        'M (on $S$)': num_mano,
        'B (on $S$)': num_baseline,
    }

In [None]:
# Testing-set sizes as a table: one row per subreddit, one column per group.
pd.DataFrame(subreddit_to_test_num).T

In [None]:
# Post-level predictions for every subreddit ('emnlp_data' feature set), kept in two
# dicts: validation replies (label == 2) and the remaining test replies.
subreddit_to_predicted_data = {}
subreddit_to_validation_data = {}
for subreddit in all_subreddits:
    validation_data, predicted_data = load_predicted_data(subreddit, "emnlp_data", subreddit_to_test_data[subreddit])
    subreddit_to_predicted_data[subreddit] = predicted_data
    subreddit_to_validation_data[subreddit] = validation_data

In [None]:
# Manual inspection: the 5th row among the 200 askreddit validation replies with the highest `probs`.
subreddit_to_validation_data['askreddit'].sort_values(by='probs').tail(200).iloc[4]#['utterance']

In [None]:
# Combined training sample across the 14 subreddits (rebinds `training_data` from the earlier loading loop).
training_data = pd.read_csv(TRAINING_DIR + 'full_sample_14_subreddits_training_data.csv')

In [None]:
# Mean of selected features per label in the combined training sample.
training_data.groupby("label")[['female', 'male', 'shehe', 'you', 'toxicity', 'Sixltr', 'politeness']].mean()

In [None]:
# For every subreddit: difference (label 1 minus label 0) in the mean of the
# 'female', 'male' and 'shehe' features of the test replies.
sub_to_gender_counts = {}
for subreddit in all_subreddits:
    curr = subreddit_to_predicted_data[subreddit].groupby("label")[['female', 'male', 'shehe']].mean()
    diffs = curr.loc[1] - curr.loc[0]
#     curr = pd.DataFrame(pd.concat({subreddit: diffs}, names=['Subreddit']))
    sub_to_gender_counts[subreddit] = diffs
# Round the per-subreddit differences to 3 decimals, then average them across subreddits.
pd.DataFrame(sub_to_gender_counts).round(3).mean(axis=1)

In [None]:
# Stack the per-subreddit descriptive statistics and name the three index levels.
descriptive_statistics = pd.concat(all_stats)
descriptive_statistics.index = descriptive_statistics.index.set_names(['subreddit', 'subculture', 'group'])
# Keep only the user counts and drop the 'group' index level.
# NOTE(review): drop_duplicates() compares row values, not index labels, so subreddits
# that happen to share the same num_users would be collapsed into one row -- confirm intended.
num_users = pd.DataFrame(descriptive_statistics['num_users'])
num_users = num_users.droplevel(2, axis=0).drop_duplicates()

# Stack the per-subreddit hypothesis tests and combine them with the user counts.
hypothesis_tests = pd.concat(all_tests)
hypothesis_tests.index = hypothesis_tests.index.set_names(['subreddit', 'subculture', 'test'])
hypothesis_tests =  hypothesis_tests.combine_first(num_users)

# Restrict to the rows whose 'subculture' level is "Manosphere".
global_results = hypothesis_tests[hypothesis_tests.index.get_level_values(1) == "Manosphere"]

# Test statistic, p-value and effect size of the manosphere-vs-control test, in all_subreddits order.
global_results.xs("compare_to_control", level=2)[['statistic', "p_val", "cohens_d"]].loc[all_subreddits]

# Feature Shifting

In [None]:
# Collect the test replies (label != 2) of every subreddit, this time including the
# parent's female/male/toxicity/politeness/valence values as `*_parent` columns.
all_test_data = []
for subreddit in all_subreddits:
    testing_data = pd.read_csv(f"{TESTING_DIR}/{subreddit}_testing_data.csv")
    validation_data_liwc, test_data_liwc = load_predicted_data(subreddit, 'emnlp_data', testing_data, include_parent_features=True)
    all_test_data.append(test_data_liwc)

# One frame with the replies of all subreddits stacked on top of each other.
all_test_data_df = pd.concat(all_test_data)

In [None]:
# Keep only the label plus the reply-level and parent-level features of interest.
all_test_data_df = all_test_data_df[['label', 'toxicity', 'politeness', 'valence', 'female', 'male', 'toxicity_parent', 'politeness_parent', 'valence_parent', 'female_parent', 'male_parent']]

In [None]:
# Z-score every feature column (everything after `label`): subtract the column mean
# and divide by the column's standard deviation as computed by pandas' `std`.
for feature in all_test_data_df.columns[1:]:
    col = all_test_data_df[feature]
    all_test_data_df[feature] = (col - col.mean())/col.std()

In [None]:
# Save the standardized feature table (index included) to TESTING_DIR.
all_test_data_df.to_csv(f"{TESTING_DIR}/feature_switching_emnlp_data.csv")