##### Imports

In [None]:
from scripts.feature_extraction import *
from scripts.plotting import *

Progress tracking

In [2]:
# track progress for pipeline processing
tqdm.pandas()

## Feature extraction pipeline (no toxicity)

### Real data

In [None]:
# load real users' data
df_real = pd.read_csv("../original_data/pandora/PANDORA.csv", encoding = "utf-8") 

In [4]:
# original features
print(f"Features:\n{df_real.columns.to_list()}")

# size, features and sample
print(f"Shape of dataset:\n{df_real.shape}\n")
df_real.head(1)

Features:
['author', 'llm_body', 'std_body', 'gender', 'age', 'openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism', 'score', 'subreddit', 'id', 'parent_id', 'date', 'time_of_day']
Shape of dataset:
(2722375, 16)



Unnamed: 0,author,llm_body,std_body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,score,subreddit,id,parent_id,date,time_of_day
0,MetricExpansion,Those stats come from the test. [Echoing the c...,Those stats come from the test. [Echoing the c...,m,23.0,high,very low,very low,low,medium,6.0,mbti,d7vkyrf,t3_53plrw,2016-09-21,03:41:38


##### Pipeline on real comments

In [5]:
# feature with comments' content before moderation
text_col = "std_body"

In [6]:
# pipeline for processing text and extracting features
pipeline = [extract_counts, extract_emoji_counts, process_emojis, fix_encoding, 
            extract_emotions, extract_pol_subj, extract_VAD, extract_readability, 
            clean_text, lowercase, extract_word_counts, process_stopwords, lemmatization]

In [None]:
# execute pipeline on the comments before moderation
apply_pipeline(df_real, text_col, pipeline)

100%|██████████| 2722375/2722375 [00:27<00:00, 97730.63it/s] 
100%|██████████| 2722375/2722375 [01:38<00:00, 27674.41it/s]
100%|██████████| 2722375/2722375 [09:59<00:00, 4537.61it/s] 


Counts of punctuation, sentences and uppercase words extracted.



100%|██████████| 2722375/2722375 [21:18<00:00, 2129.98it/s] 


Emoji and emoticon counts extracted.



100%|██████████| 2722375/2722375 [24:49<00:00, 1828.24it/s] 
100%|██████████| 2722375/2722375 [07:42<00:00, 5887.38it/s] 


Emojis processed.



100%|██████████| 2722375/2722375 [00:03<00:00, 837592.40it/s]


Text encoding fixed.



100%|██████████| 2722375/2722375 [28:38<00:00, 1583.99it/s] 


Emotions scores extracted.



100%|██████████| 2722375/2722375 [17:01<00:00, 2664.96it/s] 


Polarity and subjectivity extracted.



100%|██████████| 2722375/2722375 [40:10<00:00, 1129.46it/s] 


VAD extracted.



100%|██████████| 2722375/2722375 [22:38<00:00, 2004.31it/s] 


Readability scores extracted.



100%|██████████| 2722375/2722375 [01:10<00:00, 38427.43it/s]


Text cleaned.



100%|██████████| 2722375/2722375 [00:02<00:00, 995827.20it/s] 


Lowercasing done.



100%|██████████| 2722375/2722375 [05:17<00:00, 8582.76it/s] 
100%|██████████| 2722375/2722375 [05:27<00:00, 8306.18it/s] 
100%|██████████| 2722375/2722375 [1:06:24<00:00, 683.31it/s] 


Word counts retrieved.



100%|██████████| 2722375/2722375 [05:25<00:00, 8358.64it/s] 
100%|██████████| 2722375/2722375 [05:29<00:00, 8270.21it/s] 


Stopwords counted and removed.



100%|██████████| 2722375/2722375 [45:29<00:00, 997.49it/s]  


Lemmatization performed.


PIPELINE APPLIED.



In [8]:
# visualization
print(f"Shape:\n{df_real.shape}")
df_real.head(1)

Shape:
(2722375, 55)


Unnamed: 0,author,llm_body,std_body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,score,subreddit,id,parent_id,date,time_of_day,num_punct,num_sents,num_words_upp,num_emoji,num_emoji_pos,num_emoji_neg,emoji_unique,emoji_list,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy,polarity,subjectivity,valence,arousal,dominance,flesch,flesch_kincaid,fog,smog,ari,coleman_liau,dale_chall,linsear,difficult_words,num_words,num_words_unique,num_words_adj,num_words_noun,num_words_verb,num_words_lex,num_stopw
0,MetricExpansion,Those stats come from the test. [Echoing the c...,stats come test echo comment make related ques...,m,23.0,high,very low,very low,low,medium,6.0,mbti,d7vkyrf,t3_53plrw,2016-09-21,03:41:38,25,4,8,0,0,0,,,0.03,0.03,0.0,0.21,0.07,0.41,0.07,0.0,0.0,0.07,0.108117,0.524675,0.135211,0.087877,0.124749,45.59,13.2,14.64,14.3,14.7,11.09,8.62,12.4,30.0,160,94,19,37,31,87,85


##### Post-processing

Missing values

In [None]:
# missing values for the processed text
print(f"Missing values for {text_col}:")
print(len(df_real[df_real[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print(len(df_real[df_real[text_col].isna()]))

# checking starting text to see why they are empty (should be, OK)
df_real[df_real[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])].iloc[:10, 1]

Missing values for std_body:
11704
0


1388    to you and i both...
1429           So have at it
1683                    same
1684                    same
1685                    same
1824                      :D
1825                      :D
1826                      :D
1827                      :D
1828                      :D
Name: llm_body, dtype: object

In [10]:
# removing empty comments and resetting index
df_clean = df_real[df_real[text_col].notna()].reset_index(drop = True)
df_clean = df_clean[~df_clean[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])].reset_index(drop = True)

print(f"Size before missing values removal: {df_real.shape}")
print(f"Size after missing values removal: {df_clean.shape}")

Size before missing values removal: (2722375, 55)
Size after missing values removal: (2710671, 55)


Missing emojis set (decided to keep them as empty strings)

In [11]:
# checking for empty emoji list and set
print("Missing emojis:")
print(len(df_clean[df_clean["emoji_unique"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print(len(df_clean[df_clean["emoji_unique"].isna()]))

print("\nFound emojis:")
df_clean[~df_clean["emoji_unique"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]["emoji_unique"][:10]

Missing emojis:
2624815
0

Found emojis:


2         :D
19        :D
22       :'(
26        :)
27        :)
28        :)
31        :(
48        :)
84     :( :)
91        :D
Name: emoji_unique, dtype: object

Latin alphabet
- 339 comments were recognized as non-latin. Inspecting them, most were normal, so we decided to keep them

In [12]:
# apply function to the column
df_clean["is_latin"] = df_clean["llm_body"].progress_apply(is_latin)

# number of latin and non-latin comments
print(df_clean["is_latin"].value_counts())

# remove is_latin feature
df_clean.drop(columns = "is_latin", inplace = True)

100%|██████████| 2710671/2710671 [02:24<00:00, 18777.40it/s]


is_latin
True     2710332
False        339
Name: count, dtype: int64


In [None]:
# store the dataset
#df_clean.to_csv("../data/pandora/PANDORA_featextr.csv", index = False, encoding = "utf-8")

Reset kernel (RAM)

In [14]:
%reset -f

### Simulated data (ex-ante)

In [1]:
from scripts.feature_extraction import *
from scripts.plotting import *

In [2]:
# track progress for pipeline processing
tqdm.pandas()

In [3]:
# simulated data before moderation
df_bef = pd.read_csv("../original_data/simulator/exante/SIMULATOR_exante_bef.csv", encoding = "utf-8")
# ofsa
df_ofsa = pd.read_csv("../original_data/simulator/exante/SIMULATOR_exante_ofsa.csv", encoding = "utf-8")
# neutral
df_neut = pd.read_csv("../original_data/simulator/exante/SIMULATOR_exante_neut.csv", encoding = "utf-8")
# empathizing
df_emp = pd.read_csv("../original_data/simulator/exante/SIMULATOR_exante_emp.csv", encoding = "utf-8")
# prescriptive
df_pres = pd.read_csv("../original_data/simulator/exante/SIMULATOR_exante_pres.csv", encoding = "utf-8")

In [4]:
# drop NaN comments
df_bef.dropna(subset = ["std_body"], inplace = True)
df_ofsa.dropna(subset = ["std_body"], inplace = True)
df_neut.dropna(subset = ["std_body"], inplace = True)
df_emp.dropna(subset = ["std_body"], inplace = True)
df_pres.dropna(subset = ["std_body"], inplace = True)

In [5]:
# size, features and sample
print(f"Shape of dataset before mod:\n{df_bef.shape}\n")
print(f"Shape of dataset ofsa mod:\n{df_ofsa.shape}\n")
print(f"Shape of dataset neutral mod:\n{df_neut.shape}\n")
print(f"Shape of dataset empathizing mod:\n{df_emp.shape}\n")
print(f"Shape of dataset prescriptive mod:\n{df_pres.shape}\n")
df_bef.head(1)

Shape of dataset before mod:
(3135, 22)

Shape of dataset ofsa mod:
(2586, 22)

Shape of dataset neutral mod:
(2627, 22)

Shape of dataset empathizing mod:
(2631, 22)

Shape of dataset prescriptive mod:
(2606, 22)



Unnamed: 0,author,comment_id,llm_body,std_body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,thread_id,node_id,parent_id,root_id,race,income,education,sex_orientation,political_leaning,religion,simulate_seed
0,joylukclub,2,Since I strongly lean towards the republican s...,Since I strongly lean towards the republican s...,f,21,medium,very high,very low,low,very high,1,2,1.0,1,white,low,high school,heterosexual,republican,atheist,5


##### Pipeline on content before moderation

In [6]:
# feature with comments' content before moderation
text_col = "std_body"

In [7]:
# pipeline for processing text and extracting features
pipeline = [extract_counts, extract_emoji_counts, process_emojis, fix_encoding, 
            extract_emotions, extract_pol_subj, extract_VAD, extract_readability, 
            clean_text, lowercase, extract_word_counts, process_stopwords, lemmatization]

In [8]:
# execute pipeline on the comments before moderation
apply_pipeline(df_bef, text_col, pipeline)

100%|██████████| 3135/3135 [00:00<00:00, 51251.33it/s]
100%|██████████| 3135/3135 [00:00<00:00, 10888.18it/s]
100%|██████████| 3135/3135 [00:01<00:00, 2369.80it/s]


Counts of punctuation, sentences and uppercase words extracted.



100%|██████████| 3135/3135 [00:02<00:00, 1502.66it/s]


Emoji and emoticon counts extracted.



100%|██████████| 3135/3135 [00:02<00:00, 1226.09it/s]
100%|██████████| 3135/3135 [00:00<00:00, 3378.11it/s]


Emojis processed.



100%|██████████| 3135/3135 [00:00<00:00, 627434.42it/s]


Text encoding fixed.



100%|██████████| 3135/3135 [00:03<00:00, 1036.45it/s]


Emotions scores extracted.



100%|██████████| 3135/3135 [00:01<00:00, 1878.77it/s]


Polarity and subjectivity extracted.



100%|██████████| 3135/3135 [00:04<00:00, 665.27it/s]


VAD extracted.



100%|██████████| 3135/3135 [00:02<00:00, 1349.52it/s]


Readability scores extracted.



100%|██████████| 3135/3135 [00:00<00:00, 24936.36it/s]


Text cleaned.



100%|██████████| 3135/3135 [00:00<00:00, 778332.13it/s]


Lowercasing done.



100%|██████████| 3135/3135 [00:00<00:00, 5225.62it/s]
100%|██████████| 3135/3135 [00:00<00:00, 5185.35it/s]
100%|██████████| 3135/3135 [00:07<00:00, 408.08it/s]


Word counts retrieved.



100%|██████████| 3135/3135 [00:00<00:00, 5285.05it/s]
100%|██████████| 3135/3135 [00:00<00:00, 5318.23it/s]


Stopwords counted and removed.



100%|██████████| 3135/3135 [00:08<00:00, 384.61it/s]

Lemmatization performed.


PIPELINE APPLIED.






In [9]:
# visualization
print(f"Shape:\n{df_bef.shape}")
df_bef.head(1)

Shape:
(3135, 61)


Unnamed: 0,author,comment_id,llm_body,std_body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,thread_id,node_id,parent_id,root_id,race,income,education,sex_orientation,political_leaning,religion,simulate_seed,num_punct,num_sents,num_words_upp,num_emoji,num_emoji_pos,num_emoji_neg,emoji_unique,emoji_list,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy,polarity,subjectivity,valence,arousal,dominance,flesch,flesch_kincaid,fog,smog,ari,coleman_liau,dale_chall,linsear,difficult_words,num_words,num_words_unique,num_words_adj,num_words_noun,num_words_verb,num_words_lex,num_stopw
0,joylukclub,2,Since I strongly lean towards the republican s...,since strongly lean towards republican side wh...,f,21,medium,very high,very low,low,very high,1,2,1.0,1,white,low,high school,heterosexual,republican,atheist,5,7,2,0,0,0,0,,,0.07,0.07,0.0,0.07,0.07,0.2,0.27,0.07,0.07,0.07,-0.052381,0.554762,0.205373,0.168915,0.189136,45.59,13.2,17.26,0.0,14.4,10.62,10.16,17.25,15.0,52,45,8,14,6,28,22


##### Post-processing

Missing values

In [10]:
# missing values for the processed text
print(f"Missing values for {text_col}:")
print(len(df_bef[df_bef[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print(len(df_bef[df_bef[text_col].isna()]))

# checking starting text to see why they are empty (should be, OK)
df_bef[df_bef[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])].iloc[:10, 1]

Missing values for std_body:
4
0


7        10
164     233
721    1040
821    1201
Name: comment_id, dtype: int64

In [11]:
# removing empty comments and resetting index
df_clean = df_bef[df_bef[text_col].notna()].reset_index(drop = True)
df_clean = df_clean[~df_clean[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])].reset_index(drop = True)

print(f"Size before missing values removal: {df_bef.shape}")
print(f"Size after missing values removal: {df_clean.shape}")

Size before missing values removal: (3135, 61)
Size after missing values removal: (3131, 61)


Missing emojis set (decided to keep them as empty strings)

In [12]:
# checking for empty emoji list and set
print("Missing emojis:")
print(len(df_clean[df_clean["emoji_unique"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print(len(df_clean[df_clean["emoji_unique"].isna()]))

print("\nFound emojis:")
df_clean[~df_clean["emoji_unique"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]["emoji_unique"][:10]

Missing emojis:
3125
0

Found emojis:


295                                         :<
1085                                        :3
1361                   :white_flag: :rainbow: 
1509    :white_flag: :rainbow: :glowing_star: 
1610    :white_flag: :rainbow: :glowing_star: 
1661    :smiling_cat_with_heart-eyes: :crown: 
Name: emoji_unique, dtype: object

Latin alphabet

In [13]:
# apply function to the column
df_clean["is_latin"] = df_clean["llm_body"].progress_apply(is_latin)

# number of latin and non-latin comments
print(df_clean["is_latin"].value_counts())

# remove is_latin feature
df_clean.drop(columns = "is_latin", inplace = True)

100%|██████████| 3131/3131 [00:00<00:00, 10462.10it/s]

is_latin
True    3131
Name: count, dtype: int64





In [14]:
# store the dataset
df_clean.to_csv("../data/simulator/exante/before_mod/SIMULATOR_exante_bef_featextr.csv", index = False, encoding = "utf-8")

##### Pipeline on content after OFSA

In [15]:
# execute pipeline on the comments before moderation
apply_pipeline(df_ofsa, text_col, pipeline)

100%|██████████| 2586/2586 [00:00<00:00, 61869.88it/s]
100%|██████████| 2586/2586 [00:00<00:00, 15948.96it/s]
100%|██████████| 2586/2586 [00:00<00:00, 2843.04it/s]


Counts of punctuation, sentences and uppercase words extracted.



100%|██████████| 2586/2586 [00:01<00:00, 1793.54it/s]


Emoji and emoticon counts extracted.



100%|██████████| 2586/2586 [00:02<00:00, 1249.26it/s]
100%|██████████| 2586/2586 [00:00<00:00, 3934.52it/s]


Emojis processed.



100%|██████████| 2586/2586 [00:00<00:00, 727755.65it/s]


Text encoding fixed.



100%|██████████| 2586/2586 [00:02<00:00, 1199.28it/s]


Emotions scores extracted.



100%|██████████| 2586/2586 [00:01<00:00, 2260.89it/s]


Polarity and subjectivity extracted.



100%|██████████| 2586/2586 [00:03<00:00, 736.87it/s]


VAD extracted.



100%|██████████| 2586/2586 [00:01<00:00, 1552.03it/s]


Readability scores extracted.



100%|██████████| 2586/2586 [00:00<00:00, 28118.97it/s]


Text cleaned.



100%|██████████| 2586/2586 [00:00<00:00, 576663.84it/s]


Lowercasing done.



100%|██████████| 2586/2586 [00:00<00:00, 5710.29it/s]
100%|██████████| 2586/2586 [00:00<00:00, 6042.24it/s]
100%|██████████| 2586/2586 [00:05<00:00, 489.12it/s]


Word counts retrieved.



100%|██████████| 2586/2586 [00:00<00:00, 6036.09it/s]
100%|██████████| 2586/2586 [00:00<00:00, 5920.19it/s]


Stopwords counted and removed.



100%|██████████| 2586/2586 [00:03<00:00, 671.85it/s]

Lemmatization performed.


PIPELINE APPLIED.






In [16]:
# visualization
print(f"Shape:\n{df_ofsa.shape}")
df_ofsa.head(1)

Shape:
(2586, 61)


Unnamed: 0,author,comment_id,llm_body,std_body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,thread_id,node_id,parent_id,root_id,race,income,education,sex_orientation,political_leaning,religion,simulate_seed,num_punct,num_sents,num_words_upp,num_emoji,num_emoji_pos,num_emoji_neg,emoji_unique,emoji_list,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy,polarity,subjectivity,valence,arousal,dominance,flesch,flesch_kincaid,fog,smog,ari,coleman_liau,dale_chall,linsear,difficult_words,num_words,num_words_unique,num_words_adj,num_words_noun,num_words_verb,num_words_lex,num_stopw
0,joylukclub,2,Since I strongly lean towards the republican s...,since strongly lean towards republican side wh...,f,21,medium,very high,very low,low,very high,1,2,1.0,1,white,low,high school,heterosexual,republican,atheist,5,7,2,0,0,0,0,,,0.07,0.07,0.0,0.07,0.07,0.2,0.27,0.07,0.07,0.07,-0.052381,0.554762,0.205373,0.168915,0.189136,45.59,13.2,17.26,0.0,14.4,10.62,10.16,17.25,15.0,52,45,8,14,6,28,22


##### Post-processing

Missing values

In [17]:
# missing values for the processed text
print(f"Missing values for {text_col}:")
print(len(df_ofsa[df_ofsa[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print(len(df_ofsa[df_ofsa[text_col].isna()]))

# checking starting text to see why they are empty (should be, OK)
df_ofsa[df_ofsa[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])].iloc[:10, 1]

Missing values for std_body:
7
0


7         10
155      233
209      318
626     1040
707     1201
726     1231
1381    2342
Name: comment_id, dtype: int64

In [18]:
# removing empty comments and resetting index
df_clean = df_ofsa[df_ofsa[text_col].notna()].reset_index(drop = True)
df_clean = df_clean[~df_clean[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])].reset_index(drop = True)

print(f"Size before missing values removal: {df_ofsa.shape}")
print(f"Size after missing values removal: {df_clean.shape}")

Size before missing values removal: (2586, 61)
Size after missing values removal: (2579, 61)


Missing emojis set (decided to keep them as empty strings)

In [19]:
# checking for empty emoji list and set
print("Missing emojis:")
print(len(df_clean[df_clean["emoji_unique"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print(len(df_clean[df_clean["emoji_unique"].isna()]))

print("\nFound emojis:")
df_clean[~df_clean["emoji_unique"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]["emoji_unique"][:10]

Missing emojis:
2569
0

Found emojis:


287                                                   :-)
468                                                    :<
469                                                    :]
909                                                    :3
1277               :white_flag: :rainbow: :glowing_star: 
1360               :white_flag: :rainbow: :glowing_star: 
1388             :zany_face: :face_vomiting: :hamburger: 
1400    :face_with_symbols_on_mouth: :face_vomiting: :...
2065                          :winking_face_with_tongue: 
2574                                                   :D
Name: emoji_unique, dtype: object

Latin alphabet

In [20]:
# apply function to the column
df_clean["is_latin"] = df_clean["llm_body"].progress_apply(is_latin)

# number of latin and non-latin comments
print(df_clean["is_latin"].value_counts())

# remove is_latin feature
df_clean.drop(columns = "is_latin", inplace = True)

100%|██████████| 2579/2579 [00:00<00:00, 12232.76it/s]

is_latin
True    2579
Name: count, dtype: int64





In [21]:
# store the dataset
df_clean.to_csv("../data/simulator/exante/after_mod/SIMULATOR_exante_ofsa_featextr.csv", index = False, encoding = "utf-8")

##### Pipeline on content after Neutral

In [22]:
# execute pipeline on the comments before moderation
apply_pipeline(df_neut, text_col, pipeline)

100%|██████████| 2627/2627 [00:00<00:00, 60364.08it/s]
100%|██████████| 2627/2627 [00:00<00:00, 15955.06it/s]
100%|██████████| 2627/2627 [00:00<00:00, 2796.84it/s]


Counts of punctuation, sentences and uppercase words extracted.



100%|██████████| 2627/2627 [00:01<00:00, 1803.22it/s]


Emoji and emoticon counts extracted.



100%|██████████| 2627/2627 [00:02<00:00, 1252.54it/s]
100%|██████████| 2627/2627 [00:00<00:00, 3828.58it/s]


Emojis processed.



100%|██████████| 2627/2627 [00:00<00:00, 873993.54it/s]


Text encoding fixed.



100%|██████████| 2627/2627 [00:02<00:00, 1159.55it/s]


Emotions scores extracted.



100%|██████████| 2627/2627 [00:01<00:00, 2238.94it/s]


Polarity and subjectivity extracted.



100%|██████████| 2627/2627 [00:03<00:00, 750.64it/s]


VAD extracted.



100%|██████████| 2627/2627 [00:01<00:00, 1448.87it/s]


Readability scores extracted.



100%|██████████| 2627/2627 [00:00<00:00, 27313.45it/s]


Text cleaned.



100%|██████████| 2627/2627 [00:00<00:00, 949701.48it/s]


Lowercasing done.



100%|██████████| 2627/2627 [00:00<00:00, 6152.43it/s]
100%|██████████| 2627/2627 [00:00<00:00, 5908.30it/s]
100%|██████████| 2627/2627 [00:05<00:00, 477.28it/s]


Word counts retrieved.



100%|██████████| 2627/2627 [00:00<00:00, 5944.95it/s]
100%|██████████| 2627/2627 [00:00<00:00, 5694.73it/s]


Stopwords counted and removed.



100%|██████████| 2627/2627 [00:03<00:00, 664.76it/s]

Lemmatization performed.


PIPELINE APPLIED.






In [23]:
# visualization
print(f"Shape:\n{df_neut.shape}")
df_neut.head(1)

Shape:
(2627, 61)


Unnamed: 0,author,comment_id,llm_body,std_body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,thread_id,node_id,parent_id,root_id,race,income,education,sex_orientation,political_leaning,religion,simulate_seed,num_punct,num_sents,num_words_upp,num_emoji,num_emoji_pos,num_emoji_neg,emoji_unique,emoji_list,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy,polarity,subjectivity,valence,arousal,dominance,flesch,flesch_kincaid,fog,smog,ari,coleman_liau,dale_chall,linsear,difficult_words,num_words,num_words_unique,num_words_adj,num_words_noun,num_words_verb,num_words_lex,num_stopw
0,joylukclub,2,Since I strongly lean towards the republican s...,since strongly lean towards republican side wh...,f,21,medium,very high,very low,low,very high,1,2,1.0,1,white,low,high school,heterosexual,republican,atheist,5,7,2,0,0,0,0,,,0.07,0.07,0.0,0.07,0.07,0.2,0.27,0.07,0.07,0.07,-0.052381,0.554762,0.205373,0.168915,0.189136,45.59,13.2,17.26,0.0,14.4,10.62,10.16,17.25,15.0,52,45,8,14,6,28,22


##### Post-processing

Missing values

In [24]:
# missing values for the processed text
print(f"Missing values for {text_col}:")
print(len(df_neut[df_neut[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print(len(df_neut[df_neut[text_col].isna()]))

# checking starting text to see why they are empty (should be, OK)
df_neut[df_neut[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])].iloc[:10, 1]

Missing values for std_body:
7
0


7        10
161     233
213     318
344     544
629    1031
634    1040
713    1201
Name: comment_id, dtype: int64

In [25]:
# removing empty comments and resetting index
df_clean = df_neut[df_neut[text_col].notna()].reset_index(drop = True)
df_clean = df_clean[~df_clean[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])].reset_index(drop = True)

print(f"Size before missing values removal: {df_neut.shape}")
print(f"Size after missing values removal: {df_clean.shape}")

Size before missing values removal: (2627, 61)
Size after missing values removal: (2620, 61)


Missing emojis set (decided to keep them as empty strings)

In [26]:
# checking for empty emoji list and set
print("Missing emojis:")
print(len(df_clean[df_clean["emoji_unique"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print(len(df_clean[df_clean["emoji_unique"].isna()]))

print("\nFound emojis:")
df_clean[~df_clean["emoji_unique"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]["emoji_unique"][:10]

Missing emojis:
2610
0

Found emojis:


157                                                    8)
474                                                    :<
530                        :rainbow: :heart_suit: :rose: 
656                                                    :[
920                                                    :3
1235                                              :fire: 
1294               :white_flag: :rainbow: :glowing_star: 
1375               :white_flag: :rainbow: :glowing_star: 
2129    :flexed_biceps_dark_skin_tone: :woman_lifting_...
2212                                :woman_raising_hand: 
Name: emoji_unique, dtype: object

Latin alphabet

In [27]:
# apply function to the column
df_clean["is_latin"] = df_clean["llm_body"].progress_apply(is_latin)

# number of latin and non-latin comments
print(df_clean["is_latin"].value_counts())

# remove is_latin feature
df_clean.drop(columns = "is_latin", inplace = True)

100%|██████████| 2620/2620 [00:00<00:00, 12300.49it/s]

is_latin
True    2620
Name: count, dtype: int64





In [28]:
# store the dataset
df_clean.to_csv("../data/simulator/exante/after_mod/SIMULATOR_exante_neut_featextr.csv", index = False, encoding = "utf-8")

##### Pipeline on content after Empathizing

In [29]:
# execute pipeline on the comments before moderation
apply_pipeline(df_emp, text_col, pipeline)

100%|██████████| 2631/2631 [00:00<00:00, 57383.62it/s]
100%|██████████| 2631/2631 [00:00<00:00, 15471.71it/s]
100%|██████████| 2631/2631 [00:00<00:00, 2662.14it/s]


Counts of punctuation, sentences and uppercase words extracted.



100%|██████████| 2631/2631 [00:01<00:00, 1679.30it/s]


Emoji and emoticon counts extracted.



100%|██████████| 2631/2631 [00:01<00:00, 1317.05it/s]
100%|██████████| 2631/2631 [00:00<00:00, 3519.38it/s]


Emojis processed.



100%|██████████| 2631/2631 [00:00<00:00, 657640.87it/s]


Text encoding fixed.



100%|██████████| 2631/2631 [00:02<00:00, 1056.53it/s]


Emotions scores extracted.



100%|██████████| 2631/2631 [00:01<00:00, 2173.57it/s]


Polarity and subjectivity extracted.



100%|██████████| 2631/2631 [00:03<00:00, 698.14it/s]


VAD extracted.



100%|██████████| 2631/2631 [00:01<00:00, 1420.62it/s]


Readability scores extracted.



100%|██████████| 2631/2631 [00:00<00:00, 27005.53it/s]


Text cleaned.



100%|██████████| 2631/2631 [00:00<00:00, 736466.49it/s]


Lowercasing done.



100%|██████████| 2631/2631 [00:00<00:00, 5798.43it/s]
100%|██████████| 2631/2631 [00:00<00:00, 5604.07it/s]
100%|██████████| 2631/2631 [00:05<00:00, 450.69it/s]


Word counts retrieved.



100%|██████████| 2631/2631 [00:00<00:00, 5608.74it/s]
100%|██████████| 2631/2631 [00:00<00:00, 5646.66it/s]


Stopwords counted and removed.



100%|██████████| 2631/2631 [00:04<00:00, 607.06it/s]

Lemmatization performed.


PIPELINE APPLIED.






In [30]:
# visualization
print(f"Shape:\n{df_emp.shape}")
df_emp.head(1)

Shape:
(2631, 61)


Unnamed: 0,author,comment_id,llm_body,std_body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,thread_id,node_id,parent_id,root_id,race,income,education,sex_orientation,political_leaning,religion,simulate_seed,num_punct,num_sents,num_words_upp,num_emoji,num_emoji_pos,num_emoji_neg,emoji_unique,emoji_list,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy,polarity,subjectivity,valence,arousal,dominance,flesch,flesch_kincaid,fog,smog,ari,coleman_liau,dale_chall,linsear,difficult_words,num_words,num_words_unique,num_words_adj,num_words_noun,num_words_verb,num_words_lex,num_stopw
0,joylukclub,2,Since I strongly lean towards the republican s...,since strongly lean towards republican side wh...,f,21,medium,very high,very low,low,very high,1,2,1.0,1,white,low,high school,heterosexual,republican,atheist,5,7,2,0,0,0,0,,,0.07,0.07,0.0,0.07,0.07,0.2,0.27,0.07,0.07,0.07,-0.052381,0.554762,0.205373,0.168915,0.189136,45.59,13.2,17.26,0.0,14.4,10.62,10.16,17.25,15.0,52,45,8,14,6,28,22


##### Post-processing

Missing values

In [31]:
# missing values for the processed text
print(f"Missing values for {text_col}:")
print(len(df_emp[df_emp[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print(len(df_emp[df_emp[text_col].isna()]))

# checking starting text to see why they are empty (should be, OK)
df_emp[df_emp[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])].iloc[:10, 1]

Missing values for std_body:
7
0


7        10
158     233
244     366
336     522
628    1031
633    1040
719    1201
Name: comment_id, dtype: int64

In [32]:
# removing empty comments and resetting index
df_clean = df_emp[df_emp[text_col].notna()].reset_index(drop = True)
df_clean = df_clean[~df_clean[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])].reset_index(drop = True)

print(f"Size before missing values removal: {df_emp.shape}")
print(f"Size after missing values removal: {df_clean.shape}")

Size before missing values removal: (2631, 61)
Size after missing values removal: (2624, 61)


Missing emojis set (decided to keep them as empty strings)

In [33]:
# checking for empty emoji list and set
print("Missing emojis:")
print(len(df_clean[df_clean["emoji_unique"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print(len(df_clean[df_clean["emoji_unique"].isna()]))

print("\nFound emojis:")
df_clean[~df_clean["emoji_unique"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]["emoji_unique"][:10]

Missing emojis:
2617
0

Found emojis:


379     :wine_glass: :globe_showing_Americas: :kiss_ma...
476                                                    :<
929                                                    :3
1262                                                   :]
1297     :musical_note: :fire: :smiling_face_with_horns: 
1307               :white_flag: :rainbow: :glowing_star: 
1387               :white_flag: :rainbow: :glowing_star: 
Name: emoji_unique, dtype: object

Latin alphabet

In [34]:
# apply function to the column
df_clean["is_latin"] = df_clean["llm_body"].progress_apply(is_latin)

# number of latin and non-latin comments
print(df_clean["is_latin"].value_counts())

# remove is_latin feature
df_clean.drop(columns = "is_latin", inplace = True)

100%|██████████| 2624/2624 [00:00<00:00, 10670.03it/s]

is_latin
True    2624
Name: count, dtype: int64





In [35]:
# store the dataset
df_clean.to_csv("../data/simulator/exante/after_mod/SIMULATOR_exante_emp_featextr.csv", index = False, encoding = "utf-8")

##### Pipeline on content after Prescriptive

In [36]:
# execute pipeline on the comments before moderation
apply_pipeline(df_pres, text_col, pipeline)

100%|██████████| 2606/2606 [00:00<00:00, 61378.56it/s]
100%|██████████| 2606/2606 [00:00<00:00, 15162.01it/s]
100%|██████████| 2606/2606 [00:00<00:00, 2742.71it/s]


Counts of punctuation, sentences and uppercase words extracted.



100%|██████████| 2606/2606 [00:01<00:00, 1726.97it/s]


Emoji and emoticon counts extracted.



100%|██████████| 2606/2606 [00:02<00:00, 1250.77it/s]
100%|██████████| 2606/2606 [00:00<00:00, 3751.34it/s]


Emojis processed.



100%|██████████| 2606/2606 [00:00<00:00, 843066.43it/s]


Text encoding fixed.



100%|██████████| 2606/2606 [00:02<00:00, 1152.11it/s]


Emotions scores extracted.



100%|██████████| 2606/2606 [00:01<00:00, 2240.45it/s]


Polarity and subjectivity extracted.



100%|██████████| 2606/2606 [00:03<00:00, 755.27it/s]


VAD extracted.



100%|██████████| 2606/2606 [00:01<00:00, 1488.34it/s]


Readability scores extracted.



100%|██████████| 2606/2606 [00:00<00:00, 20306.49it/s]


Text cleaned.



100%|██████████| 2606/2606 [00:00<00:00, 649881.46it/s]


Lowercasing done.



100%|██████████| 2606/2606 [00:00<00:00, 5885.86it/s]
100%|██████████| 2606/2606 [00:00<00:00, 5774.12it/s]
100%|██████████| 2606/2606 [00:05<00:00, 467.36it/s]


Word counts retrieved.



100%|██████████| 2606/2606 [00:00<00:00, 5451.91it/s]
100%|██████████| 2606/2606 [00:00<00:00, 5727.77it/s]


Stopwords counted and removed.



100%|██████████| 2606/2606 [00:04<00:00, 636.01it/s]

Lemmatization performed.


PIPELINE APPLIED.






In [37]:
# visualization
print(f"Shape:\n{df_pres.shape}")
df_pres.head(1)

Shape:
(2606, 61)


Unnamed: 0,author,comment_id,llm_body,std_body,gender,age,openness,conscientiousness,extraversion,agreeableness,neuroticism,thread_id,node_id,parent_id,root_id,race,income,education,sex_orientation,political_leaning,religion,simulate_seed,num_punct,num_sents,num_words_upp,num_emoji,num_emoji_pos,num_emoji_neg,emoji_unique,emoji_list,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy,polarity,subjectivity,valence,arousal,dominance,flesch,flesch_kincaid,fog,smog,ari,coleman_liau,dale_chall,linsear,difficult_words,num_words,num_words_unique,num_words_adj,num_words_noun,num_words_verb,num_words_lex,num_stopw
0,joylukclub,2,Since I strongly lean towards the republican s...,since strongly lean towards republican side wh...,f,21,medium,very high,very low,low,very high,1,2,1.0,1,white,low,high school,heterosexual,republican,atheist,5,7,2,0,0,0,0,,,0.07,0.07,0.0,0.07,0.07,0.2,0.27,0.07,0.07,0.07,-0.052381,0.554762,0.205373,0.168915,0.189136,45.59,13.2,17.26,0.0,14.4,10.62,10.16,17.25,15.0,52,45,8,14,6,28,22


##### Post-processing

Missing values

In [38]:
# missing values for the processed text
print(f"Missing values for {text_col}:")
print(len(df_pres[df_pres[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print(len(df_pres[df_pres[text_col].isna()]))

# checking starting text to see why they are empty (should be, OK)
df_pres[df_pres[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])].iloc[:10, 1]

Missing values for std_body:
7
0


7         10
154      233
238      366
239      369
633     1040
711     1201
1930    3301
Name: comment_id, dtype: int64

In [39]:
# removing empty comments and resetting index
df_clean = df_pres[df_pres[text_col].notna()].reset_index(drop = True)
df_clean = df_clean[~df_clean[text_col].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])].reset_index(drop = True)

print(f"Size before missing values removal: {df_pres.shape}")
print(f"Size after missing values removal: {df_clean.shape}")

Size before missing values removal: (2606, 61)
Size after missing values removal: (2599, 61)


Missing emojis set (decided to keep them as empty strings)

In [40]:
# checking for empty emoji list and set
print("Missing emojis:")
print(len(df_clean[df_clean["emoji_unique"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]))
print(len(df_clean[df_clean["emoji_unique"].isna()]))

print("\nFound emojis:")
df_clean[~df_clean["emoji_unique"].isin(["", " ", "NaN", "None", "NULL", "null", "NA"])]["emoji_unique"][:10]

Missing emojis:
2591
0

Found emojis:


499                               :thumbs_up: 
921                                         :3
1292    :white_flag: :rainbow: :glowing_star: 
1368    :white_flag: :rainbow: :glowing_star: 
1794                                        :<
2061                                        :<
2077                      :sign_of_the_horns: 
2383                                        :<
Name: emoji_unique, dtype: object

Latin alphabet

In [41]:
# apply function to the column
df_clean["is_latin"] = df_clean["llm_body"].progress_apply(is_latin)

# number of latin and non-latin comments
print(df_clean["is_latin"].value_counts())

# remove is_latin feature
df_clean.drop(columns = "is_latin", inplace = True)

100%|██████████| 2599/2599 [00:00<00:00, 11284.35it/s]

is_latin
True    2599
Name: count, dtype: int64





In [42]:
# store the dataset
df_clean.to_csv("../data/simulator/exante/after_mod/SIMULATOR_exante_pres_featextr.csv", index = False, encoding = "utf-8")