Notes for Jacopo:


<br>
This script loads labeled data from a Parquet file, filters threads based on the number of comments, and calculates percentiles for comment creation times within each thread.<br>


In [None]:
import sys
module_path = '/Users/jacoponudo/Documents/thesis/src/PRO'
sys.path.append(module_path)
from PRO_package.functions import *
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import os

In [None]:
# Base directory of the thesis project; input and output paths are built from it.
root = '/Users/jacoponudo/Documents/thesis/'
# Platform whose labeled comments are processed; it also appears in the file names.
social_media_name = "youtube"
# Name of the column that identifies the thread a comment belongs to.
thread_identifier = "root_submission"

In [None]:
# Folder holding the data of the selected platform.
platform_data_dir = os.path.join(root, "data", social_media_name)
# Full path of the unified, labeled Parquet file for that platform.
input_filename = os.path.join(
    platform_data_dir,
    f"{social_media_name}_labeled_data_unified.parquet",
)
# Read the whole file into a DataFrame.
data = pd.read_parquet(input_filename)

## Stage 1 - Add variables ####


<br>
Before starting the analysis, it is essential to add some useful derived variables. The list of variables is:<br>
    <br>
* thread_lifetime <br>
* thread_birth<br>
* temporal_distance_birth<br>
* number_of_comments<br>
* number_of_users<br>
* number_of_comments_user_in_thread<br>
* sequence_number_comment_user_thread<br>
* inter arrival time user<br>
* inter arrival time user-thread<br>
* inter arrival time thread<br>


Filter, keeping only the threads with more than 50 comments.

In [None]:
# A thread is kept only when it has MORE than this many comments
# (strict inequality).
MIN_COMMENTS_PER_THREAD = 50

# Number of comments (rows) per thread identifier.
comment_counts = data[thread_identifier].value_counts()
# Identifiers of the threads that pass the size threshold.
large_threads = comment_counts[comment_counts > MIN_COMMENTS_PER_THREAD].index
# .copy() detaches the filtered frame from the unfiltered one, so that adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
data = data[data[thread_identifier].isin(large_threads)].copy()

In [None]:
# One group per thread: all rows sharing the same thread identifier.
threads = data.groupby(by=thread_identifier)

In [None]:
# Enrich every comment with thread-level and user-in-thread variables.
# Threads are processed one at a time; the derived columns are written back
# into `data`, aligned on the original row index.
# The `with` block guarantees the progress bar is closed even if a thread fails.
with tqdm(total=len(threads), desc="Add variables...") as progress_bar:
    for _, thread in threads:
        # Work on a new, time-sorted frame so the grouped slice of `data` is
        # never mutated. Sorting uses the parsed timestamps rather than the
        # raw `created_at` values, so the order is chronological even when
        # `created_at` is stored as text.
        group = thread.assign(
            time=pd.to_datetime(thread['created_at'])
        ).sort_values(by='time')

        # Thread-level timing: first/last comment and the hours between them.
        thread_birth = group['time'].min()
        thread_end = group['time'].max()
        group['thread_birth'] = thread_birth
        group['temporal_distance_birth_h'] = (
            (group['time'] - thread_birth).dt.total_seconds() / 3600
        ).round(1)
        group['thread_lifetime_h'] = round(
            (thread_end - thread_birth).total_seconds() / 3600, 1
        )

        # Thread-level sizes (a missing user counts as one distinct value).
        group['number_of_comments'] = len(group)
        group['number_of_users'] = group['user'].nunique(dropna=False)
        # count_unique_words is not defined in this notebook; presumably it
        # comes from the PRO_package.functions star import.
        group['unique_words_count'] = group['text'].apply(count_unique_words)

        # The helper receives the thread with every column added so far; only
        # its 'percentile' column is used.
        percentiles = add_percentile_column(group)['percentile']

        # Per-user activity inside this thread.
        user_counts = group.groupby('user')['user'].count()
        group['number_of_comments_by_user_in_thread'] = group['user'].map(user_counts)
        # 1-based position of each comment among its author's comments in
        # this thread, following the chronological order established above.
        group['user_appearances'] = group.groupby('user').cumcount() + 1

        # Output column in `data` -> values computed for this thread.
        thread_columns = {
            'percentile': percentiles,
            'sequential_number_of_comment_by_user_in_thread': group['user_appearances'],
            'number_of_comments_by_user_in_thread': group['number_of_comments_by_user_in_thread'],
            'thread_birth': group['thread_birth'],
            'temporal_distance_birth_h': group['temporal_distance_birth_h'],
            'thread_lifetime_h': group['thread_lifetime_h'],
            'number_of_users': group['number_of_users'],
            'unique_words_count': group['unique_words_count'],
            'number_of_comments': group['number_of_comments'],
        }
        # .loc aligns on index labels, so the sorted order of `group` does not
        # matter when writing back.
        for column, values in thread_columns.items():
            data.loc[group.index, column] = values

        progress_bar.update(1)

In [None]:
# Label every comment with the value detect_language returns for its text.
comment_texts = data['text']
data['language'] = comment_texts.apply(detect_language)

In [None]:
# calculate_unique_word_ratio returns a pair; only its first element is used
# here, as a per-user lookup for the new column.
word_ratio_result = calculate_unique_word_ratio(data)
unique_word_ratios, total_words = word_ratio_result
comment_authors = data['user']
data['unique_word_user'] = comment_authors.map(unique_word_ratios)

In [None]:
# Parse the timestamps BEFORE sorting, so rows are ordered chronologically
# rather than by the raw representation `created_at` was loaded with.
data['created_at'] = pd.to_datetime(data['created_at'])
data = data.sort_values(by=['user', 'created_at'])
# Hours elapsed since the same user's previous comment; a user's first
# comment has no predecessor and gets NaN.
data['temporal_distance_from_previous_comment_h'] = (
    data.groupby('user')['created_at'].diff().dt.total_seconds() / 3600
)

In [None]:
# Cast the thread identifier column to integers.
thread_ids = data['root_submission']
data['root_submission'] = thread_ids.astype('int')

In [None]:
# Parse `created_at` into pandas datetimes.
parsed_created_at = pd.to_datetime(data['created_at'])
data['created_at'] = parsed_created_at

In [None]:
# Inter-arrival time per user: seconds between consecutive comments written
# by the same user, regardless of thread.
data = data.sort_values(by=['user', 'created_at'])
gaps_per_user = data.groupby('user')['created_at'].diff()
data['IAT_user'] = gaps_per_user.dt.total_seconds()

In [None]:
# Inter-arrival time per (user, thread): seconds between consecutive comments
# written by the same user inside the same thread.
user_thread_keys = ['user', 'root_submission']
data = data.sort_values(by=user_thread_keys + ['created_at'])
gaps_per_user_thread = data.groupby(user_thread_keys)['created_at'].diff()
data['IAT_user_thread'] = gaps_per_user_thread.dt.total_seconds()

In [None]:
# Inter-arrival time per thread: seconds between consecutive comments in the
# same thread, whoever wrote them.
data = data.sort_values(by=['root_submission', 'created_at'])
gaps_per_thread = data.groupby('root_submission')['created_at'].diff()
data['IAT_thread'] = gaps_per_thread.dt.total_seconds()

In [None]:
# Columns kept in the exported dataset, in output order: the original fields
# first, then the variables derived in this notebook.
output_columns = [
    'comment_id',
    'text',
    'video_id',
    'user',
    'upvotes',
    'downvotes',
    'depth',
    'root_submission',
    'topic',
    'toxicity_score',
    'created_at',
    'social',
    'percentile',
    'sequential_number_of_comment_by_user_in_thread',
    'number_of_comments_by_user_in_thread',
    'thread_birth',
    'temporal_distance_birth_h',
    'thread_lifetime_h',
    'number_of_users',
    'unique_words_count',
    'number_of_comments',
    'language',
    'unique_word_user',
    'temporal_distance_from_previous_comment_h',
    'IAT_user',
    'IAT_user_thread',
    'IAT_thread',
]
data = data[output_columns]

In [None]:
# Build the output path with os.path.join (as done for the input file) so it
# does not depend on `root` ending with a path separator.
output_dir = os.path.join(root, 'src', 'PRO', 'output')
# Create the folder if it is missing, so the save cannot fail on a missing
# directory after the whole computation has run.
os.makedirs(output_dir, exist_ok=True)
output_filename = os.path.join(output_dir, f"{social_media_name}_processed.csv")
# Write the processed dataset without the DataFrame index.
data.to_csv(output_filename, index=False)