##### Imports

In [None]:
from scripts.data_preparation import *

Progress tracking

In [2]:
# track progress for pipeline processing
tqdm.pandas()

## Data Preparation

### Real data

In [3]:
# lightly processed real user data
df_pandora = pd.read_csv("../original_data/pandora/PANDORA_original.csv", encoding = "utf-8")

In [4]:
# original features
feats = df_pandora.columns.to_list()
print(f"Original features:\n{feats}\n")

# size, features and sample
print(f"Shape:\n{df_pandora.shape}\n")
df_pandora.head(1)

Original features:
['author', 'author_flair_text', 'body', 'downs', 'created_utc', 'subreddit_id', 'link_id', 'parent_id', 'score', 'controversiality', 'gilded', 'id', 'subreddit', 'ups', 'word_count', 'word_count_quoteless', 'gender', 'age', 'openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']

Shape:
(2733505, 23)



Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,gilded,id,subreddit,ups,word_count,word_count_quoteless,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism
0,MetricExpansion,ENTP,Those stats come from the test. [Echoing the c...,0.0,1474429000.0,t5_2s90r,t3_53plrw,t3_53plrw,6.0,0.0,0.0,d7vkyrf,mbti,6.0,151.0,149,m,23.0,70.0,15.0,15.0,30.0,50.0


Empty comments check and removal

In [5]:
# checking for empty comments
print("Check for empty comments")
print(len(df_pandora[df_pandora["body"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print("Check for NaN comments")
print(len(df_pandora[df_pandora["body"].isna()]))

# drop NaN comments
df_pandora = df_pandora[df_pandora["body"].notna()].reset_index(drop = True)

Check for empty comments
0
Check for NaN comments
0


Data types check and casting

In [6]:
# unique dtypes for each column
for c in df_pandora.columns:
    print(f"Unique dtypes {df_pandora.loc[:, c].apply(type).value_counts()}\n")
    
# casting dtypes
df_pandora["author_flair_text"] = df_pandora["author_flair_text"].astype(str)
df_pandora["score"] = df_pandora["score"].astype(float)
df_pandora["word_count"] = df_pandora["word_count"].astype(int)

Unique dtypes author
<class 'str'>    2733505
Name: count, dtype: int64

Unique dtypes author_flair_text
<class 'float'>    1522709
<class 'str'>      1210796
Name: count, dtype: int64

Unique dtypes body
<class 'str'>    2733505
Name: count, dtype: int64

Unique dtypes downs
<class 'float'>    2733505
Name: count, dtype: int64

Unique dtypes created_utc
<class 'float'>    2733505
Name: count, dtype: int64

Unique dtypes subreddit_id
<class 'str'>    2733505
Name: count, dtype: int64

Unique dtypes link_id
<class 'str'>    2733505
Name: count, dtype: int64

Unique dtypes parent_id
<class 'str'>    2733505
Name: count, dtype: int64

Unique dtypes score
<class 'float'>    2733505
Name: count, dtype: int64

Unique dtypes controversiality
<class 'float'>    2733505
Name: count, dtype: int64

Unique dtypes gilded
<class 'float'>    2733505
Name: count, dtype: int64

Unique dtypes id
<class 'str'>    2733505
Name: count, dtype: int64

Unique dtypes subreddit
<class 'str'>    2733505
Name: co

Comments features exploration

In [7]:
analyze_distinct(df_pandora)

Number of distinct values:
author: 1362
author_flair_text: 10891
body: 2468372
downs: 2
created_utc: 2407143
subreddit_id: 15161
link_id: 1208468
parent_id: 2307121
score: 1932
controversiality: 2
gilded: 4
id: 2432399
subreddit: 15161
ups: 1119
word_count: 1683
word_count_quoteless: 1524
gender: 3
age: 41
openness: 109
conscientiousness: 107
extraversion: 109
agreeableness: 108
neuroticism: 109

Distinct values:
downs:
[ 0. nan]
controversiality:
[0. 1.]
gilded:
[0. 1. 2. 3.]
gender:
['m' 'f' 't']


Irrelevant features removal
- a posteriori, we don't consider neither: parent_id, score, id and subreddit, so we can remove them

In [None]:
# features to delete
to_del = ["author_flair_text", "downs", "subreddit_id", "link_id", "controversiality", "gilded", "ups", "word_count", "word_count_quoteless"]

# drop irrelevant columns
df = df_pandora.drop(columns = to_del)
df.head(1)

Unnamed: 0,author,body,created_utc,parent_id,score,id,subreddit,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism
0,MetricExpansion,Those stats come from the test. [Echoing the c...,1474429000.0,t3_53plrw,6.0,d7vkyrf,mbti,m,23.0,70.0,15.0,15.0,30.0,50.0


Time conversion

In [9]:
# convert time from utc to date and time of day features
convert_utc_time(df, "created_utc")

### Simulated data (ex-ante moderations)
- we analyze comments focusing on before moderation content, which we need

In [25]:
# lightly processed simulated (ex-ante) data
df_sim = pd.read_csv("../original_data/simulator/exante/SIMULATOR_exante_original.csv", encoding = "utf-8")

In [26]:
# original features
feats = df_sim.columns.to_list()
print(f"Original features:\n{feats}\n")

# size, features and sample
print(f"Shape:\n{df_sim.shape}\n")
df_sim.head(1)

Original features:
['username', 'comment_id', 'thread_id', 'root_id', 'parent_id', 'node_id', 'b_content', 'a_content_ofsa', 'a_content_neut', 'a_content_emp', 'a_content_pres', 'age', 'gender', 'race', 'income', 'education', 'sex_orientation', 'political_leaning', 'religion', 'agreeableness', 'openness', 'conscientiousness', 'extraversion', 'neuroticism', 'simulate_seed']

Shape:
(3135, 25)



Unnamed: 0,username,comment_id,thread_id,root_id,parent_id,node_id,b_content,a_content_ofsa,a_content_neut,a_content_emp,a_content_pres,age,gender,race,income,education,sex_orientation,political_leaning,religion,agreeableness,openness,conscientiousness,extraversion,neuroticism,simulate_seed
0,joylukclub,2,1,1,1.0,2,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,21,female,white,low,high school,heterosexual,republican,atheist,low,medium,very high,very low,very high,5


Empty comments check and removal

In [27]:
# checking for empty comments
print("Check for empty comments")
print(len(df_sim[df_sim["b_content"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print("Check for NaN comments")
print(len(df_sim[df_sim["b_content"].isna()]))

# drop NaN comments
df_sim = df_sim[df_sim["b_content"].notna()].reset_index(drop = True)

Check for empty comments
0
Check for NaN comments
0


Data types check

In [28]:
# unique dtypes for each column
for c in df_sim.columns:
    print(f"Unique dtypes {df_sim.loc[:, c].apply(type).value_counts()}\n")

Unique dtypes username
<class 'str'>    3135
Name: count, dtype: int64

Unique dtypes comment_id
<class 'int'>    3135
Name: count, dtype: int64

Unique dtypes thread_id
<class 'int'>    3135
Name: count, dtype: int64

Unique dtypes root_id
<class 'int'>    3135
Name: count, dtype: int64

Unique dtypes parent_id
<class 'float'>    3135
Name: count, dtype: int64

Unique dtypes node_id
<class 'int'>    3135
Name: count, dtype: int64

Unique dtypes b_content
<class 'str'>    3135
Name: count, dtype: int64

Unique dtypes a_content_ofsa
<class 'str'>      2675
<class 'float'>     460
Name: count, dtype: int64

Unique dtypes a_content_neut
<class 'str'>      2686
<class 'float'>     449
Name: count, dtype: int64

Unique dtypes a_content_emp
<class 'str'>      2728
<class 'float'>     407
Name: count, dtype: int64

Unique dtypes a_content_pres
<class 'str'>      2710
<class 'float'>     425
Name: count, dtype: int64

Unique dtypes age
<class 'int'>    3135
Name: count, dtype: int64

Unique dt

Comments features exploration

In [29]:
analyze_distinct(df_sim)

Number of distinct values:
username: 30
comment_id: 3135
thread_id: 785
root_id: 1181
parent_id: 817
node_id: 1455
b_content: 3135
a_content_ofsa: 2587
a_content_neut: 2628
a_content_emp: 2632
a_content_pres: 2607
age: 17
gender: 2
race: 4
income: 3
education: 3
sex_orientation: 2
political_leaning: 2
religion: 3
agreeableness: 5
openness: 5
conscientiousness: 4
extraversion: 5
neuroticism: 5
simulate_seed: 3

Distinct values:
age:
[21 18 24 20 39 23 22 37 29 40 51 19 32 31 35 30 28]
gender:
['female' 'male']
race:
['white' 'asian' 'latino' 'black']
income:
['low' 'medium' 'high']
education:
['high school' 'middle school' 'university']
sex_orientation:
['heterosexual' 'homosexual']
political_leaning:
['republican' 'democratic']
religion:
['atheist' 'catholic' 'protestant']
agreeableness:
['low' 'very low' 'very high' 'medium' 'high']
openness:
['medium' 'very low' 'low' 'high' 'very high']
conscientiousness:
['very high' 'very low' 'medium' 'low']
extraversion:
['very low' 'very high' 

### Data formatting and standardization
- the goal is to process the simulated data to make it as similar as possible to the real users'.

In [None]:
# real data viz
df.head(1)

In [30]:
# simulated data viz
df_sim.head(1)

Unnamed: 0,username,comment_id,thread_id,root_id,parent_id,node_id,b_content,a_content_ofsa,a_content_neut,a_content_emp,a_content_pres,age,gender,race,income,education,sex_orientation,political_leaning,religion,agreeableness,openness,conscientiousness,extraversion,neuroticism,simulate_seed
0,joylukclub,2,1,1,1.0,2,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,21,female,white,low,high school,heterosexual,republican,atheist,low,medium,very high,very low,very high,5


Features renaming

In [31]:
# rename features according to real data (as much as possible)
df_sim = df_sim.rename(columns = {"username": "author"})

In [32]:
df_sim.head(1)

Unnamed: 0,author,comment_id,thread_id,root_id,parent_id,node_id,b_content,a_content_ofsa,a_content_neut,a_content_emp,a_content_pres,age,gender,race,income,education,sex_orientation,political_leaning,religion,agreeableness,openness,conscientiousness,extraversion,neuroticism,simulate_seed
0,joylukclub,2,1,1,1.0,2,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,21,female,white,low,high school,heterosexual,republican,atheist,low,medium,very high,very low,very high,5


Features rearrangement

In [None]:
df = df[["author", "body", "gender", "age", "openness", "conscientiousness", 
         "extraversion", "agreeableness", "neuroticism", "score", "subreddit",
         "id", "parent_id", "date", "time_of_day"]]

In [33]:
df_sim = df_sim[["author", "comment_id", "b_content", "a_content_ofsa", "a_content_neut", "a_content_emp", "a_content_pres", 
                 "gender", "age", "openness", "conscientiousness", "extraversion", "agreeableness", 
                 "neuroticism", "thread_id", "node_id", "parent_id", "root_id", "race", "income", "education", 
                 "sex_orientation", "political_leaning", "religion", "simulate_seed"]]

In [38]:
df.head(1)

Unnamed: 0,author,body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,score,subreddit,id,parent_id,date,time_of_day
0,MetricExpansion,Those stats come from the test. [Echoing the c...,m,23.0,70.0,15.0,15.0,30.0,50.0,6.0,mbti,d7vkyrf,t3_53plrw,2016-09-21,03:41:38


In [34]:
df_sim.head(1)

Unnamed: 0,author,comment_id,b_content,a_content_ofsa,a_content_neut,a_content_emp,a_content_pres,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,thread_id,node_id,parent_id,root_id,race,income,education,sex_orientation,political_leaning,religion,simulate_seed
0,joylukclub,2,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,female,21,medium,very high,very low,low,very high,1,2,1.0,1,white,low,high school,heterosexual,republican,atheist,5


Data representation

In [35]:
# homogeneous values across datasets (gender)
df_sim["gender"] = df_sim["gender"].apply(lambda x: "f" if x == "female" else "m")

##### Discretization of OCEAN features
- since the simulated data has OCEAN features discretized, we do the same in the real users' data

In [23]:
# bins
bins = [0, 20, 40, 60, 80, 100]
# labels
labels = ["very low", "low", "medium", "high", "very high"]

# discretization of OCEAN
ocean = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

for c in ocean:
    df[c] = pd.cut(df[c], bins = bins, labels = labels, right = True, include_lowest = True)

In [24]:
df.head(1)

Unnamed: 0,author,body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,score,subreddit,id,parent_id,date,time_of_day
0,MetricExpansion,Those stats come from the test. [Echoing the c...,m,23.0,high,very low,very low,low,medium,6.0,mbti,d7vkyrf,t3_53plrw,2016-09-21,03:41:38


### Before moderation dataset
- we only consider the comments before moderation
- we remove information about all a_contents (for each moderation)
- we remove duplicates for b_content
- we remove missing values if any

In [36]:
# before moderation dataset
df_bef = df_sim.drop(columns = ["a_content_ofsa", "a_content_neut", "a_content_emp", "a_content_pres"])
# rename the b_content into body consistently to the real data
df_bef.rename(columns = {"b_content": "body"}, inplace = True)

print(f"Shape:\n{df_bef.shape}")
df_bef.head(1)

Shape:
(3135, 21)


Unnamed: 0,author,comment_id,body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,thread_id,node_id,parent_id,root_id,race,income,education,sex_orientation,political_leaning,religion,simulate_seed
0,joylukclub,2,Since I strongly lean towards the republican s...,f,21,medium,very high,very low,low,very high,1,2,1.0,1,white,low,high school,heterosexual,republican,atheist,5


In [37]:
# remove duplicates and missing values of the content before moderation
process_dataset(df_bef, "body")

Shape after duplicates removal:
(3135, 21)
Check for empty comments
0
Check for NaN comments
0
Shape:
(3135, 21)



### After moderation datasets
- we only consider the comments after moderation, by type of moderation
- we remove information about the b_content, focusing on after moderation comments
- we remove duplicates for a_content, but based on the type of moderation
- we remove missing values if any

In [38]:
# after moderation datasets
df_ofsa = df_sim.drop(columns = ["b_content", "a_content_neut", "a_content_emp", "a_content_pres"])
df_neut = df_sim.drop(columns = ["b_content", "a_content_ofsa", "a_content_emp", "a_content_pres"])
df_emp = df_sim.drop(columns = ["b_content", "a_content_neut", "a_content_ofsa", "a_content_pres"])
df_pres = df_sim.drop(columns = ["b_content", "a_content_neut", "a_content_emp", "a_content_ofsa"])

# rename the a_content into body consistently to the real data
df_ofsa.rename(columns = {"a_content_ofsa": "body"}, inplace = True)
df_neut.rename(columns = {"a_content_neut": "body"}, inplace = True)
df_emp.rename(columns = {"a_content_emp": "body"}, inplace = True)
df_pres.rename(columns = {"a_content_pres": "body"}, inplace = True)

print(f"Shape ofsa:\n{df_ofsa.shape}")
print(f"Shape neut:\n{df_neut.shape}")
print(f"Shape emp:\n{df_emp.shape}")
print(f"Shape pres:\n{df_pres.shape}")
df_ofsa.head(1)

Shape ofsa:
(3135, 21)
Shape neut:
(3135, 21)
Shape emp:
(3135, 21)
Shape pres:
(3135, 21)


Unnamed: 0,author,comment_id,body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,thread_id,node_id,parent_id,root_id,race,income,education,sex_orientation,political_leaning,religion,simulate_seed
0,joylukclub,2,Since I strongly lean towards the republican s...,f,21,medium,very high,very low,low,very high,1,2,1.0,1,white,low,high school,heterosexual,republican,atheist,5


In [39]:
# processing datasets after moderations
print("ofsa")
process_dataset(df_ofsa, "body")

print("neutral")
process_dataset(df_neut, "body")

print("empathizing")
process_dataset(df_emp, "body")

print("prescriptive")
process_dataset(df_pres, "body")

ofsa
Shape after duplicates removal:
(2587, 21)
Check for empty comments
0
Check for NaN comments
1
Shape:
(2586, 21)

neutral
Shape after duplicates removal:
(2628, 21)
Check for empty comments
0
Check for NaN comments
1
Shape:
(2627, 21)

empathizing
Shape after duplicates removal:
(2632, 21)
Check for empty comments
0
Check for NaN comments
1
Shape:
(2631, 21)

prescriptive
Shape after duplicates removal:
(2607, 21)
Check for empty comments
0
Check for NaN comments
1
Shape:
(2606, 21)



## Comments pre-processing

General cleansing of textual data.
- html, urls, mentions, hashtags, replacement symbols, contractions extension, extra spaces removal

In [None]:
# real comments
general_text_cleaning(df, "body")

In [42]:
# simulated comments
# before moderation
general_text_cleaning(df_bef, "body")

# after moderation
after_datasets = [df_ofsa, df_neut, df_emp, df_pres]

for d in after_datasets:
    general_text_cleaning(d, "body")

  0%|          | 0/3135 [00:00<?, ?it/s]

100%|██████████| 3135/3135 [00:00<00:00, 17827.32it/s]


Text cleaned.



100%|██████████| 2587/2587 [00:00<00:00, 20086.57it/s]


Text cleaned.



100%|██████████| 2628/2628 [00:00<00:00, 19962.42it/s]


Text cleaned.



100%|██████████| 2632/2632 [00:00<00:00, 18857.24it/s]


Text cleaned.



100%|██████████| 2607/2607 [00:00<00:00, 19073.70it/s]

Text cleaned.






Check empty real data

In [None]:
check_empty(df, "body")

Check empty simulated data

In [43]:
check_empty(df_bef, "body")
check_empty(df_ofsa, "body")
check_empty(df_neut, "body")
check_empty(df_emp, "body")
check_empty(df_pres, "body")

Empty:
0
Empty:
1
Empty:
1
Empty:
1
Empty:
1


##### Separate features for different pipelines

- One comment column will be preprocessed lightly to make it suitable for LLMs.
- The other is processed to extract transparent (e.g. linguistic, affective, toxicity-related) features.

In [None]:
# duplicate comments column, one for llm pipeline
# real
df.insert(loc = 1, column = "llm_body", value = df["body"])
# simulated
df_bef.insert(loc = 2, column = "llm_body", value = df_bef["body"])
df_ofsa.insert(loc = 2, column = "llm_body", value = df_ofsa["body"])
df_neut.insert(loc = 2, column = "llm_body", value = df_neut["body"])
df_emp.insert(loc = 2, column = "llm_body", value = df_emp["body"])
df_pres.insert(loc = 2, column = "llm_body", value = df_pres["body"])

# rename comments column for feature extraction pipeline
# real
df = df.rename(columns = {"body":"std_body"})
# simulated
df_bef.rename(columns = {"body":"std_body"}, inplace = True)
df_ofsa.rename(columns = {"body":"std_body"}, inplace = True)
df_neut.rename(columns = {"body":"std_body"}, inplace = True)
df_emp.rename(columns = {"body":"std_body"}, inplace = True)
df_pres.rename(columns = {"body":"std_body"}, inplace = True)

In [None]:
# shapes
print(f"Shape real data:\n{df.shape}\n")

print(f"Shape simulated data before mod:\n{df_bef.shape}\n")
print(f"Shape simulated data after ofsa:\n{df_ofsa.shape}\n")
print(f"Shape simulated data after neutral:\n{df_neut.shape}\n")
print(f"Shape simulated data after empathizing:\n{df_emp.shape}\n")
print(f"Shape simulated data after prescriptive:\n{df_pres.shape}\n")

Shape simulated data before mod:
(3135, 22)

Shape simulated data after ofsa:
(2587, 22)

Shape simulated data after neutral:
(2628, 22)

Shape simulated data after empathizing:
(2632, 22)

Shape simulated data after prescriptive:
(2607, 22)



In [32]:
df.head(1)

Unnamed: 0,author,llm_body,std_body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,score,subreddit,id,parent_id,date,time_of_day
0,MetricExpansion,Those stats come from the test. [Echoing the c...,Those stats come from the test. [Echoing the c...,m,23.0,high,very low,very low,low,medium,6.0,mbti,d7vkyrf,t3_53plrw,2016-09-21,03:41:38


In [48]:
df_bef.head(1)

Unnamed: 0,author,comment_id,llm_body,std_body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,thread_id,node_id,parent_id,root_id,race,income,education,sex_orientation,political_leaning,religion,simulate_seed
0,joylukclub,2,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,f,21,medium,very high,very low,low,very high,1,2,1.0,1,white,low,high school,heterosexual,republican,atheist,5


### Storing the datasets

In [None]:
# store full real and simulated datasets
df.to_csv("../original_data/pandora/PANDORA.csv", index = False, encoding = "utf-8") 
df_bef.to_csv("../original_data/simulator/exante/SIMULATOR_exante_bef.csv", index = False, encoding = "utf-8")
df_ofsa.to_csv("../original_data/simulator/exante/SIMULATOR_exante_ofsa.csv", index = False, encoding = "utf-8")
df_neut.to_csv("../original_data/simulator/exante/SIMULATOR_exante_neut.csv", index = False, encoding = "utf-8")
df_emp.to_csv("../original_data/simulator/exante/SIMULATOR_exante_emp.csv", index = False, encoding = "utf-8")
df_pres.to_csv("../original_data/simulator/exante/SIMULATOR_exante_pres.csv", index = False, encoding = "utf-8")