In [4]:
import pandas as pd
import os
from os.path import join
import pickle as pkl
import numpy as np
import shutil
np.random.seed(636) 

In [5]:
src = '/raid5pool/tank/luehring/german_newsguard_tweets/discussions/'
dst = '../data/'

if not os.path.exists(dst):
    os.makedirs(dst)

In [6]:
with open(join(src, "../dtypes_config.pickle"), "rb") as file:
    DTYPES = pkl.load(file)
    
with open(join(dst, "dtypes_config.pickle"), "wb") as file:
    pkl.dump(DTYPES, file)

In [7]:
authors = pd.read_csv(join(src, 
                      "../german_newsguard_tweets_inference.csv.gz"),
                 usecols=["author_id"],
                 compression="gzip",
                 dtype=DTYPES)

In [8]:
# extract unique author ids
unique_ids = authors['author_id'].unique()

# generate a random number for each unique author_id
random_ids = np.random.randint(1e6, 1e7, size=len(unique_ids))  

# create a mapping
id_map = dict(zip(unique_ids, random_ids))

In [9]:
replies = pd.read_csv(join(src, 'discussions_replies.csv.gz'),
                          dtype=DTYPES)\
                          .drop(columns=[
                                'domain',
                                'Score',
                                'enthusiasm',
                                'out'])
replies.columns

Index(['conversation_id', 'id', 'author_id', 'created_at', 'text',
       'reply_count', 'retweet_count', 'quote_count', 'like_count',
       'author.followers_count', 'author.following_count',
       'author.tweet_count', 'Rating', 'anger', 'fear', 'disgust', 'sadness',
       'joy', 'pride', 'hope', 'type', 'status', 'Orientation', 'step'],
      dtype='object')

In [10]:
replies['author_id'] = replies['author_id'].map(id_map)
replies.to_csv(join(dst, 'discussions_replies.csv.gz'), 
               compression='gzip',
               index=False)

In [11]:
replies.head(5)

Unnamed: 0,conversation_id,id,author_id,created_at,text,reply_count,retweet_count,quote_count,like_count,author.followers_count,...,fear,disgust,sadness,joy,pride,hope,type,status,Orientation,step
0,1329656998665875456,1329656998665875456,9886687,2020-11-20 05:25:06+00:00,Die Senatsverwaltung für Gesundheit (Berlin)ha...,1.0,0.0,0.0,0.0,2482.0,...,0.000651,0.000588,0.000619,0.001609,0.000714,0.001604,starter,complete,Neutral,domain
1,1314839315390791680,1315345060393820160,4020666,2020-10-11 17:34:34+00:00,@JohannkuPeter Heute gab es noch gar kein Wied...,0.0,0.0,0.0,0.0,28.0,...,0.000524,0.000639,0.000698,0.001513,0.000732,0.000496,reply,complete,,conversation
2,1314839315390791680,1314839315390791680,2992778,2020-10-10 08:04:55+00:00,"Diese Kinder erleben die Hölle auf Erden, dami...",2.0,0.0,0.0,0.0,347.0,...,0.003225,0.148451,0.066498,0.002586,0.000809,0.000494,starter,complete,Neutral,domain
3,1344383527215894528,1344393284244955144,4351154,2020-12-30 21:21:51+00:00,@mariomayl @FrauNicoleHB Fortsetzung folgt???\...,0.0,0.0,0.0,1.0,335.0,...,0.002064,0.000634,0.00056,0.000798,0.000559,0.000647,reply,complete,Neutral,domain
4,1344383527215894528,1344900437028327425,4757373,2021-01-01 06:57:05+00:00,@mariomayl Von den armen Kindern die unbegleit...,0.0,0.0,0.0,0.0,379.0,...,0.010429,0.05009,0.924006,0.006543,0.003962,0.004546,reply,complete,,conversation


In [12]:
replies.columns

Index(['conversation_id', 'id', 'author_id', 'created_at', 'text',
       'reply_count', 'retweet_count', 'quote_count', 'like_count',
       'author.followers_count', 'author.following_count',
       'author.tweet_count', 'Rating', 'anger', 'fear', 'disgust', 'sadness',
       'joy', 'pride', 'hope', 'type', 'status', 'Orientation', 'step'],
      dtype='object')

In [13]:
replies_matched = pd.read_csv(join(src, 'matched_replies_mahalanobis_log_bias.csv'),
                          dtype=DTYPES)\
                          .drop(columns=[
                                        'Unnamed: 0', 
                                        'domain', 
                                        'Score'])
replies_matched.columns

Index(['conversation_id', 'author_id', 'created_at', 'author.followers_count',
       'author.following_count', 'author.tweet_count', 'Rating', 'anger',
       'fear', 'disgust', 'sadness', 'joy', 'pride', 'hope', 'Orientation',
       'word_count', 'word_count_log', 'author.tweet_count_log',
       'author.followers_count_log', 'author.following_count_log', 'anger_log',
       'fear_log', 'disgust_log', 'sadness_log', 'joy_log', 'pride_log',
       'hope_log', 'anger_avg', 'fear_avg', 'disgust_avg', 'sadness_avg',
       'joy_avg', 'pride_avg', 'hope_avg', 'author.tweet_count_avg',
       'author.tweet_count_avg_log', 'time_diff', 'time_diff_log', 'weights',
       'subclass'],
      dtype='object')

In [14]:
replies_matched['author_id'] = replies_matched['author_id'].map(id_map)

replies_matched.to_csv(join(dst, 'matched_replies_mahalanobis.csv'), index=False)

In [15]:
first_replies = pd.read_csv(join(src, 
                                 'matched_replies_first_mahalanobis_log_bias.csv'),
                          dtype=DTYPES)\
                          .drop(columns=[
                              'Unnamed: 0',
                              'domain', 'Score'])    
first_replies.columns

Index(['conversation_id', 'author_id', 'created_at', 'author.followers_count',
       'author.following_count', 'author.tweet_count', 'Rating', 'anger',
       'fear', 'disgust', 'sadness', 'joy', 'pride', 'hope', 'Orientation',
       'word_count', 'word_count_log', 'author.tweet_count_log',
       'author.followers_count_log', 'author.following_count_log', 'anger_log',
       'fear_log', 'disgust_log', 'sadness_log', 'joy_log', 'pride_log',
       'hope_log', 'id', 'first_author_id', 'anger_first', 'fear_first',
       'disgust_first', 'sadness_first', 'joy_first', 'pride_first',
       'hope_first', 'author.tweet_count_first',
       'author.tweet_count_first_log', 'weights', 'subclass'],
      dtype='object')

In [None]:
first_replies['author_id'] = first_replies['author_id'].map(id_map)

first_replies.to_csv(join(dst, 'matched_first_replies_mahalanobis.csv'), index=False)

In [17]:
starters = pd.read_csv(join(src, 'discussions_starters.csv.gz'),
                          dtype=DTYPES)\
                          .drop(columns=[
                              'domain', 'text',
                              'enthusiasm', 'out',
                              'Score'])    
starters.columns

Index(['id', 'conversation_id', 'author_id', 'created_at', 'reply_count',
       'retweet_count', 'quote_count', 'like_count', 'author.followers_count',
       'author.following_count', 'author.tweet_count', 'Rating', 'anger',
       'fear', 'disgust', 'sadness', 'joy', 'pride', 'hope', 'type', 'status',
       'Orientation', 'step'],
      dtype='object')

In [18]:
starters['author_id'] = starters['author_id'].map(id_map)
starters.to_csv(join(dst, 'discussions_starters.csv.gz'), 
                compression='gzip',
                index=False)

In [19]:
matched_starters = pd.read_csv(join(src, 
                            'matched_starters_mahalanobis_log_bias.csv'),
                          dtype=DTYPES)\
                          .drop(columns=[
                                        'Unnamed: 0', 
                                        'domain', 
                                        'Score'])

matched_starters.columns

Index(['id', 'author_id', 'created_at', 'reply_count', 'retweet_count',
       'quote_count', 'like_count', 'author.followers_count',
       'author.following_count', 'author.tweet_count', 'Rating', 'anger',
       'fear', 'disgust', 'sadness', 'joy', 'pride', 'hope', 'Orientation',
       'word_count', 'word_count_log', 'author.tweet_count_log',
       'author.followers_count_log', 'author.following_count_log', 'anger_log',
       'fear_log', 'disgust_log', 'sadness_log', 'joy_log', 'pride_log',
       'hope_log', 'weights', 'subclass'],
      dtype='object')

In [20]:
matched_starters['author_id'] = matched_starters['author_id'].map(id_map)

starters.to_csv(join(dst, 'matched_starters_mahalanobis.csv'), index=False)

In [21]:
same_replies = pd.read_csv(
    join(src, "same_replies_aggregates_one.csv"), dtype=DTYPES)\
    .drop(columns=['domain', 'Score'])
same_replies.columns

Index(['conversation_id', 'same_author_id', 'created_at',
       'author.followers_count', 'author.following_count',
       'same_author.tweet_count', 'Rating', 'anger', 'fear', 'disgust',
       'sadness', 'joy', 'pride', 'hope', 'Orientation', 'word_count',
       'word_count_log', 'author.tweet_count_log',
       'author.followers_count_log', 'author.following_count_log', 'anger_log',
       'fear_log', 'disgust_log', 'sadness_log', 'joy_log', 'pride_log',
       'hope_log', 'same_id', 'first_author_id', 'anger_first', 'fear_first',
       'disgust_first', 'sadness_first', 'joy_first', 'pride_first',
       'hope_first', 'author.tweet_count_first',
       'author.tweet_count_first_log'],
      dtype='object')

In [22]:
same_replies['same_author_id'] = same_replies['same_author_id'].map(id_map)
same_replies.to_csv(join(dst, 'same_author_replies.csv'), index=False)

In [None]:
# zip the data/ directory as data.zip
shutil.make_archive('./data', 'zip', '.') 

'/home/luehring/emomis-discussion-analysis/data_processing/data.zip'