In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.path.append("../src/")
import json

import numpy as np
import pandas as pd
import math
from datasets import Dataset, list_metrics, load_metric

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from results_analysis.analyze_exp_results import ExpAnalyzer
from results_analysis.analyze_personalized_exp_results import PerPopulationExpAnalyzer
from runners.runner_utils import loss_to_perplexity

In [3]:
base_result_dir = "/data/ddmg/redditlanguagemodeling/results/"

In [4]:
data_df = pd.read_csv("/data/ddmg/redditlanguagemodeling/data/3_all_data_user_filtered_2021-06-21.csv")
data_df.drop(columns="Unnamed: 0", inplace=True)

In [5]:
train_df = data_df[data_df["data_split"] == "train"]
val_df = data_df[data_df["data_split"] == "val"]
test_df = data_df[data_df["data_split"] == "test"]
val_users = list(val_df.author.unique())

In [6]:
len(val_users)

50

In [7]:
eval_user_df = data_df[data_df["author"].isin(val_users)]

In [8]:
eval_user_df = eval_user_df.sort_values("author")

In [9]:
eval_user_df

Unnamed: 0,id,author,data_split,subreddit,text,created_utc
18212,gxoqd4,BRoccoli20,train,mentalhealth,I am a cold-hearted monster ÔwÔ My paternal gr...,1.591440e+09
18500,k16t8v,BRoccoli20,train,mentalhealth,"What do you think of this reaction? Yesterday,...",1.606359e+09
18367,ivhx0w,BRoccoli20,train,mentalhealth,i don’t know what to do... i just opened my em...,1.600475e+09
18341,iqi5vq,BRoccoli20,train,mentalhealth,"4:39 am I’m here, sat on my bed, on reddit...*...",1.599792e+09
18332,ifk0fx,BRoccoli20,train,mentalhealth,Why does the thought of going to school upset ...,1.598251e+09
...,...,...,...,...,...,...
22062,bj1djy,your_godammn_right,val,mentalhealth,Cancelled my Psychologist from a lack of conne...,1.556617e+09
22573,bjrdzo,your_godammn_right,test,mentalhealth,as it is Borderline Personality Disorder aware...,1.556776e+09
22572,bjrn1a,your_godammn_right,test,mentalhealth,My doctor said he does not know if i have BPD ...,1.556778e+09
22570,ad4gf6,your_godammn_right,test,mentalhealth,11 year old neighbour is pushing me to the poi...,1.546770e+09


In [10]:
user_count_df = eval_user_df.groupby("author").count()["id"].to_frame().rename(columns={"id": "count"})
for subreddit in eval_user_df["subreddit"].unique():
    user_count_df["{}_count".format(subreddit)] = 0

In [11]:
user_sr_count_df = eval_user_df.groupby(["author", "subreddit"]).count()["id"].to_frame()
user_sr_count_df

Unnamed: 0_level_0,Unnamed: 1_level_0,id
author,subreddit,Unnamed: 2_level_1
BRoccoli20,AskDocs,1
BRoccoli20,mentalhealth,35
Chad_arbc,investing,50
Creative310,AskDocs,9
Creative310,Bitcoin,25
...,...,...
svet_sedov,Bitcoin,81
wsace,Bitcoin,1
wsace,investing,33
your_godammn_right,AskDocs,1


In [12]:
for idx, row in user_count_df.iterrows():
    for subreddit in eval_user_df["subreddit"].unique():
        if (idx, subreddit) in user_sr_count_df.index:
            user_count_df.loc[idx]["{}_count".format(subreddit)] = user_sr_count_df.loc[(idx, subreddit)]["id"]

In [13]:
user_count_df

Unnamed: 0_level_0,count,mentalhealth_count,AskDocs_count,investing_count,Bitcoin_count,relationship_advice_count,relationships_count
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BRoccoli20,36,35,1,0,0,0,0
Chad_arbc,50,0,0,50,0,0,0
Creative310,35,0,9,0,25,1,0
Dotabjj,31,0,0,0,31,0,0
Eriflee,32,0,4,0,0,0,28
GaniB,32,0,0,32,0,0,0
HelloImLucas,34,15,19,0,0,0,0
IckyBelly,30,0,30,0,0,0,0
KarmaKingKong,110,0,0,88,22,0,0
Kennyv777,31,0,31,0,0,0,0


In [14]:
# collect results
result_dfs = []
# pre-trained model
rd1 = os.path.join(base_result_dir, "1_bert_distill_pretrained")
ea1 = ExpAnalyzer(rd1)
ea1.author_result_df["method"] = "pre-trained"
result_dfs.append(ea1.author_result_df)
#fine-tune all
rd_2b = os.path.join(base_result_dir, "2b_distilbert_finetuned_constant_lr")
ea_2b = ExpAnalyzer(rd_2b)
ea_2b.author_result_df["method"] = "fine-tune_all"
result_dfs.append(ea_2b.author_result_df)
#fine-tune user
rd_3a = os.path.join(base_result_dir, "3a_distilbert_finetuned_author_constant_lr")
ea_3a = PerPopulationExpAnalyzer(rd_3a, "author")
test_df_3a = ea_3a.data_split_perf_df[ea_3a.data_split_perf_df["data_split"] == "test"]
test_df_3a["method"] = "fine-tune_user"
test_df_3a.drop(columns="data_split", inplace=True)
result_dfs.append(test_df_3a)
#fine-tune subreddit
rd_4b = os.path.join(base_result_dir, "4b_distilbert_finetuned_subreddit_sample_eval")
ea_4b = PerPopulationExpAnalyzer(rd_4b, "subreddit")
merged_df_4b = ea_4b.sample_losses_df.merge(test_df, on="id", how="inner")
user_results_4b = merged_df_4b.groupby("author").mean()["loss"].to_frame()
user_results_4b["perplexity"] = user_results_4b["loss"].apply(lambda x: loss_to_perplexity(x))
user_results_4b["method"] = "fine-tune_subreddit"
result_dfs.append(user_results_4b.reset_index())
# fine-tune all (2 epochs) + user (1 epoch)
rd_6_21 = os.path.join(base_result_dir, "6_distilbert_finetuned_all_author", "2_1_epochs_lr_5e-5")
ea_6_21 = PerPopulationExpAnalyzer(rd_6_21, "author")
test_df_6_21 = ea_6_21.data_split_perf_df[ea_6_21.data_split_perf_df["data_split"] == "test"]
test_df_6_21["method"] = "fine-tune_all_user"
test_df_6_21.drop(columns="data_split", inplace=True)
result_dfs.append(test_df_6_21)
result_df = pd.concat(result_dfs)

Found results for 50 sub-populations


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_3a["method"] = "fine-tune_user"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Found results for 6 sub-populations
Found results for 50 sub-populations


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_6_21["method"] = "fine-tune_all_user"


In [15]:
result_df = result_df.sort_values(["author", "perplexity"])
result_df = result_df.reset_index(drop=True)

In [16]:
idx = result_df.groupby("author")['perplexity'].transform(min) == result_df['perplexity']
best_method_df = result_df[idx]
best_method_df = best_method_df.set_index("author")
best_method_df

Unnamed: 0_level_0,loss,perplexity,method
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BRoccoli20,1.992776,7.335873,fine-tune_all
Chad_arbc,2.170573,8.7633,fine-tune_subreddit
Creative310,2.161885,8.687502,fine-tune_subreddit
Dotabjj,3.281342,26.611456,fine-tune_all_user
Eriflee,1.929266,6.884453,fine-tune_all
GaniB,1.782997,5.947657,fine-tune_all
HelloImLucas,2.221523,9.221367,fine-tune_all_user
IckyBelly,2.459537,11.699397,fine-tune_all
KarmaKingKong,2.015775,7.506542,fine-tune_all
Kennyv777,2.049708,7.765631,fine-tune_all


In [17]:
user_info_df = user_count_df.merge(best_method_df, left_index=True, right_index=True, how="inner")

In [18]:
user_info_df.rename(columns={"relationship_advice_count": "RA", "mentalhealth_count": "MH"})

Unnamed: 0_level_0,count,MH,AskDocs_count,investing_count,Bitcoin_count,RA,relationships_count,loss,perplexity,method
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BRoccoli20,36,35,1,0,0,0,0,1.992776,7.335873,fine-tune_all
Chad_arbc,50,0,0,50,0,0,0,2.170573,8.7633,fine-tune_subreddit
Creative310,35,0,9,0,25,1,0,2.161885,8.687502,fine-tune_subreddit
Dotabjj,31,0,0,0,31,0,0,3.281342,26.611456,fine-tune_all_user
Eriflee,32,0,4,0,0,0,28,1.929266,6.884453,fine-tune_all
GaniB,32,0,0,32,0,0,0,1.782997,5.947657,fine-tune_all
HelloImLucas,34,15,19,0,0,0,0,2.221523,9.221367,fine-tune_all_user
IckyBelly,30,0,30,0,0,0,0,2.459537,11.699397,fine-tune_all
KarmaKingKong,110,0,0,88,22,0,0,2.015775,7.506542,fine-tune_all
Kennyv777,31,0,31,0,0,0,0,2.049708,7.765631,fine-tune_all


In [19]:
from scipy.stats import entropy

In [20]:
sr_cols = ["{}_count".format(sr) for sr in eval_user_df["subreddit"].unique()]
user_info_df['sr_entropy'] = user_info_df.apply(lambda x: entropy(list(x[sr_cols])), axis=1)

In [21]:
user_info_df.rename(columns={"relationship_advice_count": "RA", "mentalhealth_count": "MH"})

Unnamed: 0_level_0,count,MH,AskDocs_count,investing_count,Bitcoin_count,RA,relationships_count,loss,perplexity,method,sr_entropy
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BRoccoli20,36,35,1,0,0,0,0,1.992776,7.335873,fine-tune_all,0.126931
Chad_arbc,50,0,0,50,0,0,0,2.170573,8.7633,fine-tune_subreddit,0.0
Creative310,35,0,9,0,25,1,0,2.161885,8.687502,fine-tune_subreddit,0.69115
Dotabjj,31,0,0,0,31,0,0,3.281342,26.611456,fine-tune_all_user,0.0
Eriflee,32,0,4,0,0,0,28,1.929266,6.884453,fine-tune_all,0.37677
GaniB,32,0,0,32,0,0,0,1.782997,5.947657,fine-tune_all,0.0
HelloImLucas,34,15,19,0,0,0,0,2.221523,9.221367,fine-tune_all_user,0.686211
IckyBelly,30,0,30,0,0,0,0,2.459537,11.699397,fine-tune_all,0.0
KarmaKingKong,110,0,0,88,22,0,0,2.015775,7.506542,fine-tune_all,0.500402
Kennyv777,31,0,31,0,0,0,0,2.049708,7.765631,fine-tune_all,0.0


In [22]:
from transformers import AutoTokenizer

In [23]:
tokenizer= AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)

In [24]:
np.unique(tokenizer(eval_user_df.iloc[0]['text'])['input_ids'][1:-1], return_counts=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors


(array([  999,  1006,  1007,  1008,  1009,  1010,  1011,  1012,  1013,
         1015,  1016,  1024,  1029,  1037,  1045,  1049,  1055,  1056,
         1061,  1521,  1523,  1524,  1996,  1997,  1998,  1999,  2000,
         2001,  2002,  2003,  2004,  2005,  2006,  2007,  2008,  2009,
         2010,  2011,  2012,  2014,  2015,  2016,  2017,  2021,  2023,
         2024,  2025,  2026,  2030,  2031,  2032,  2033,  2036,  2041,
         2042,  2043,  2044,  2048,  2054,  2055,  2056,  2058,  2061,
         2062,  2064,  2065,  2066,  2074,  2079,  2080,  2081,  2085,
         2086,  2106,  2108,  2113,  2123,  2125,  2128,  2129,  2130,
         2134,  2135,  2138,  2140,  2145,  2146,  2154,  2156,  2157,
         2172,  2175,  2185,  2191,  2200,  2205,  2215,  2216,  2220,
         2222,  2227,  2242,  2243,  2253,  2258,  2275,  2292,  2296,
         2332,  2347,  2348,  2360,  2364,  2371,  2391,  2412,  2424,
         2428,  2435,  2467,  2471,  2480,  2485,  2502,  2505,  2514,
      

In [25]:
user_info_df["unique_token_count"] = 0
user_info_df["token_entropy"] = 0

In [26]:
for idx, row in user_info_df.iterrows():
    author_df = eval_user_df[eval_user_df["author"] == idx]
    tokens = []
    for post in author_df["text"].values:
        post_tokens = tokenizer(post, truncation=True)['input_ids'][1:-1]
        tokens += post_tokens
    vals, counts = np.unique(tokens, return_counts=True)
    user_info_df.at[idx, "unique_token_count"] = len(vals)
    user_info_df.at[idx, "unique_token_count/post"] = len(vals) / len(author_df)
    user_info_df.at[idx, "token_entropy"] = entropy(counts)

In [27]:
user_info_df

Unnamed: 0_level_0,count,mentalhealth_count,AskDocs_count,investing_count,Bitcoin_count,relationship_advice_count,relationships_count,loss,perplexity,method,sr_entropy,unique_token_count,token_entropy,unique_token_count/post
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
BRoccoli20,36,35,1,0,0,0,0,1.992776,7.335873,fine-tune_all,0.126931,1790,5,49.722222
Chad_arbc,50,0,0,50,0,0,0,2.170573,8.7633,fine-tune_subreddit,0.0,993,5,19.86
Creative310,35,0,9,0,25,1,0,2.161885,8.687502,fine-tune_subreddit,0.69115,1119,5,31.971429
Dotabjj,31,0,0,0,31,0,0,3.281342,26.611456,fine-tune_all_user,0.0,720,5,23.225806
Eriflee,32,0,4,0,0,0,28,1.929266,6.884453,fine-tune_all,0.37677,2129,6,66.53125
GaniB,32,0,0,32,0,0,0,1.782997,5.947657,fine-tune_all,0.0,546,5,17.0625
HelloImLucas,34,15,19,0,0,0,0,2.221523,9.221367,fine-tune_all_user,0.686211,978,5,28.764706
IckyBelly,30,0,30,0,0,0,0,2.459537,11.699397,fine-tune_all,0.0,1809,6,60.3
KarmaKingKong,110,0,0,88,22,0,0,2.015775,7.506542,fine-tune_all,0.500402,1467,6,13.336364
Kennyv777,31,0,31,0,0,0,0,2.049708,7.765631,fine-tune_all,0.0,1415,5,45.645161


In [28]:
user_info_df.loc["retarded_investor"]

KeyError: 'retarded_investor'

In [87]:
def print_user_posts(user, count, seed=42):
    user_df = eval_user_df[eval_user_df["author"] == user]
    np.random.seed(seed)
    post_idxs = np.random.choice(len(user_df), count)
    for idx in post_idxs:
        print("Post {} from subreddit {}".format(idx, user_df.iloc[idx]["subreddit"]))
        print()
        print(user_df.iloc[idx]["text"])
        print("\n")

In [92]:
print_user_posts("bluejeans90210", 20)

Post 28 from subreddit relationships

She kissed my neck a month ago, will I creep her out by bringing it up? [36m, under 25 f]  \*\*TL;DR;\*\* : stripper kissed me a while ago and isn't interested in my money

&#x200B;

Ok  the first thing I want to clear up is that she's a stripper, but she  didn't want my money and spent a lot of time holding hands and hugging  when she could have been soliciting other clients.  The kiss was hugely  unexpected and I didn't reciprocate because I didn't know if she  intended a neck kiss to be intimate (maybe she sloppily missed) and the  club has a no kissing policy.

I  asked her friend about whether she might be into me and reminded me she  lives in a different state (I wasn't getting at a relationship) and  that I should just ask her in future if I'm unsure about reciprocating.   Since this was a month ago, I guess I shouldn't bring it up and just  see what happens this time around?  I'm afraid of making it seem like a  big deal and it might be cre