In [1]:
import pandas as pd
import os
from os.path import join
import pickle as pkl
import numpy as np
import shutil
# Seed NumPy's global RNG so the random author-id pseudonyms drawn later in this notebook are reproducible.
np.random.seed(636) 

In [2]:
# `src`: directory holding the original discussion data that is read below.
# `dst`: directory that receives the pseudonymised copies written by this notebook.
# NOTE(review): `src` is an absolute, machine-specific path; the notebook only
# runs on a host where this location exists.
src = '/raid5pool/tank/luehring/german_newsguard_tweets/discussions/'
dst = '../data/'

# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs(dst, exist_ok=True)

In [3]:
# Load the column -> dtype configuration used for every read_csv call below,
# and ship an identical copy alongside the exported data.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# this with a dtypes_config.pickle from a trusted location.
dtypes_in = join(src, "../dtypes_config.pickle")
dtypes_out = join(dst, "dtypes_config.pickle")

with open(dtypes_in, "rb") as fh_in:
    DTYPES = pkl.load(fh_in)

with open(dtypes_out, "wb") as fh_out:
    pkl.dump(DTYPES, fh_out)

In [4]:
# Read only the author_id column of the inference file; it is the basis for
# the pseudonym mapping built in the next cell.
authors_path = join(src, "../german_newsguard_tweets_inference.csv.gz")
authors = pd.read_csv(
    authors_path,
    dtype=DTYPES,
    compression="gzip",
    usecols=["author_id"],
)

In [5]:
# extract unique author ids
unique_ids = authors['author_id'].unique()

# Pseudonyms are 7-digit integers drawn from [ID_LOW, ID_HIGH).
ID_LOW, ID_HIGH = 1_000_000, 10_000_000
if len(unique_ids) > ID_HIGH - ID_LOW:
    raise ValueError(
        f"{len(unique_ids)} unique authors do not fit into the pseudonym "
        f"range [{ID_LOW}, {ID_HIGH}); widen the range."
    )

# Draw WITHOUT replacement: sampling with replacement (np.random.randint)
# can hand the same pseudonym to different authors, silently merging them
# in the published data. Uses the global RNG, so np.random.seed still applies.
random_ids = ID_LOW + np.random.choice(ID_HIGH - ID_LOW,
                                       size=len(unique_ids),
                                       replace=False)

# create a mapping (original author_id -> pseudonym)
id_map = dict(zip(unique_ids, random_ids))

# the mapping must be one-to-one
assert len(set(id_map.values())) == len(id_map), "pseudonym collision"

In [6]:
# Load all replies and remove the columns that are not exported.
replies = (
    pd.read_csv(join(src, 'discussions_replies.csv.gz'), dtype=DTYPES)
    .drop(columns=['domain', 'Score', 'text', 'enthusiasm', 'out'])
)
replies.columns

Index(['conversation_id', 'id', 'author_id', 'created_at', 'reply_count',
       'retweet_count', 'quote_count', 'like_count', 'author.followers_count',
       'author.following_count', 'author.tweet_count', 'Rating', 'anger',
       'fear', 'disgust', 'sadness', 'joy', 'pride', 'hope', 'type', 'status',
       'Orientation', 'step'],
      dtype='object')

In [7]:
# Swap the author ids for their pseudonyms, then write the gzip-compressed export.
replies_out = join(dst, 'discussions_replies.csv.gz')
replies['author_id'] = replies['author_id'].map(id_map)
replies.to_csv(replies_out, index=False, compression='gzip')

In [8]:
# Load the aggregated replies and remove the columns that are not exported.
replies_agg = (
    pd.read_csv(join(src, 'discussions_replies_aggregates.csv'), dtype=DTYPES)
    .drop(columns=['domain', 'Score'])
)
replies_agg.columns

Index(['conversation_id', 'author_id', 'created_at', 'author.followers_count',
       'author.following_count', 'author.tweet_count', 'Rating', 'anger',
       'fear', 'disgust', 'sadness', 'joy', 'pride', 'hope', 'Orientation',
       'word_count', 'word_count_log', 'author.tweet_count_log',
       'author.followers_count_log', 'author.following_count_log', 'anger_log',
       'fear_log', 'disgust_log', 'sadness_log', 'joy_log', 'pride_log',
       'hope_log', 'anger_avg', 'fear_avg', 'disgust_avg', 'sadness_avg',
       'joy_avg', 'pride_avg', 'hope_avg', 'author.tweet_count_avg',
       'author.tweet_count_avg_log', 'time_diff', 'time_diff_log'],
      dtype='object')

In [9]:
# Swap the author ids for their pseudonyms, then write the export.
replies_agg_out = join(dst, 'discussions_replies_aggregates.csv')
replies_agg['author_id'] = replies_agg['author_id'].map(id_map)
replies_agg.to_csv(replies_agg_out, index=False)

In [10]:
# Load the matched replies; 'Unnamed: 0' is dropped together with the
# columns that are not exported.
replies_matched = (
    pd.read_csv(join(src, 'matched_replies_mahalanobis_log_bias.csv'),
                dtype=DTYPES)
    .drop(columns=['Unnamed: 0', 'domain', 'Score'])
)
replies_matched.columns

Index(['conversation_id', 'author_id', 'created_at', 'author.followers_count',
       'author.following_count', 'author.tweet_count', 'Rating', 'anger',
       'fear', 'disgust', 'sadness', 'joy', 'pride', 'hope', 'Orientation',
       'word_count', 'word_count_log', 'author.tweet_count_log',
       'author.followers_count_log', 'author.following_count_log', 'anger_log',
       'fear_log', 'disgust_log', 'sadness_log', 'joy_log', 'pride_log',
       'hope_log', 'anger_avg', 'fear_avg', 'disgust_avg', 'sadness_avg',
       'joy_avg', 'pride_avg', 'hope_avg', 'author.tweet_count_avg',
       'author.tweet_count_avg_log', 'time_diff', 'time_diff_log', 'weights',
       'subclass'],
      dtype='object')

In [11]:
# Swap the author ids for their pseudonyms, then write the export.
replies_matched_out = join(dst, 'matched_replies_mahalanobis.csv')
replies_matched['author_id'] = replies_matched['author_id'].map(id_map)
replies_matched.to_csv(replies_matched_out, index=False)

In [12]:
# Load the matched first replies; 'Unnamed: 0' is dropped together with the
# columns that are not exported.
first_replies = (
    pd.read_csv(join(src, 'matched_replies_first_mahalanobis_log_bias.csv'),
                dtype=DTYPES)
    .drop(columns=['Unnamed: 0', 'domain', 'Score'])
)
first_replies.columns

Index(['conversation_id', 'author_id', 'created_at', 'author.followers_count',
       'author.following_count', 'author.tweet_count', 'Rating', 'anger',
       'fear', 'disgust', 'sadness', 'joy', 'pride', 'hope', 'Orientation',
       'word_count', 'word_count_log', 'author.tweet_count_log',
       'author.followers_count_log', 'author.following_count_log', 'anger_log',
       'fear_log', 'disgust_log', 'sadness_log', 'joy_log', 'pride_log',
       'hope_log', 'id', 'first_author_id', 'anger_first', 'fear_first',
       'disgust_first', 'sadness_first', 'joy_first', 'pride_first',
       'hope_first', 'author.tweet_count_first',
       'author.tweet_count_first_log', 'weights', 'subclass'],
      dtype='object')

In [13]:
# Both id columns carry author ids, so both are swapped for pseudonyms
# before the export is written.
for id_col in ('author_id', 'first_author_id'):
    first_replies[id_col] = first_replies[id_col].map(id_map)

first_replies_out = join(dst, 'matched_first_replies_mahalanobis.csv')
first_replies.to_csv(first_replies_out, index=False)

In [14]:
# Load all discussion starters and remove the columns that are not exported.
starters = (
    pd.read_csv(join(src, 'discussions_starters.csv.gz'), dtype=DTYPES)
    .drop(columns=['domain', 'text', 'enthusiasm', 'out', 'Score'])
)
starters.columns

Index(['id', 'conversation_id', 'author_id', 'created_at', 'reply_count',
       'retweet_count', 'quote_count', 'like_count', 'author.followers_count',
       'author.following_count', 'author.tweet_count', 'Rating', 'anger',
       'fear', 'disgust', 'sadness', 'joy', 'pride', 'hope', 'type', 'status',
       'Orientation', 'step'],
      dtype='object')

In [15]:
# Swap the author ids for their pseudonyms, then write the gzip-compressed export.
starters_out = join(dst, 'discussions_starters.csv.gz')
starters['author_id'] = starters['author_id'].map(id_map)
starters.to_csv(starters_out, index=False, compression='gzip')

In [16]:
# Load the aggregated starters and remove the columns that are not exported.
starters_agg = (
    pd.read_csv(join(src, 'discussions_starters_aggregates.csv'), dtype=DTYPES)
    .drop(columns=['domain', 'Score'])
)
starters_agg.columns

Index(['id', 'author_id', 'created_at', 'reply_count', 'retweet_count',
       'quote_count', 'like_count', 'author.followers_count',
       'author.following_count', 'author.tweet_count', 'Rating', 'anger',
       'fear', 'disgust', 'sadness', 'joy', 'pride', 'hope', 'Orientation',
       'word_count', 'word_count_log', 'author.tweet_count_log',
       'author.followers_count_log', 'author.following_count_log', 'anger_log',
       'fear_log', 'disgust_log', 'sadness_log', 'joy_log', 'pride_log',
       'hope_log'],
      dtype='object')

In [17]:
# Swap the author ids for their pseudonyms, then write the export.
starters_agg['author_id'] = starters_agg['author_id'].map(id_map)
# Write the aggregated frame itself: the previous code exported `starters`
# here, so the aggregates file held the wrong table.
starters_agg.to_csv(join(dst, 'discussions_starters_aggregates.csv'),
                    index=False)

In [18]:
# Load the matched starters; 'Unnamed: 0' is dropped together with the
# columns that are not exported.
matched_starters = (
    pd.read_csv(join(src, 'matched_starters_mahalanobis_log_bias.csv'),
                dtype=DTYPES)
    .drop(columns=['Unnamed: 0', 'domain', 'Score'])
)

matched_starters.columns

Index(['id', 'author_id', 'created_at', 'reply_count', 'retweet_count',
       'quote_count', 'like_count', 'author.followers_count',
       'author.following_count', 'author.tweet_count', 'Rating', 'anger',
       'fear', 'disgust', 'sadness', 'joy', 'pride', 'hope', 'Orientation',
       'word_count', 'word_count_log', 'author.tweet_count_log',
       'author.followers_count_log', 'author.following_count_log', 'anger_log',
       'fear_log', 'disgust_log', 'sadness_log', 'joy_log', 'pride_log',
       'hope_log', 'weights', 'subclass'],
      dtype='object')

In [19]:
# Swap the author ids for their pseudonyms, then write the export.
matched_starters['author_id'] = matched_starters['author_id'].map(id_map)

# Write the matched frame itself: the previous code exported `starters`
# here, so the matched file held the wrong table.
matched_starters.to_csv(join(dst, 'matched_starters_mahalanobis.csv'), index=False)

In [20]:
# Load the same-author reply aggregates and remove the columns that are not exported.
same_replies = (
    pd.read_csv(join(src, "same_replies_aggregates_one.csv"), dtype=DTYPES)
    .drop(columns=['domain', 'Score'])
)
same_replies.columns

Index(['conversation_id', 'same_author_id', 'created_at',
       'author.followers_count', 'author.following_count',
       'same_author.tweet_count', 'Rating', 'anger', 'fear', 'disgust',
       'sadness', 'joy', 'pride', 'hope', 'Orientation', 'word_count',
       'word_count_log', 'author.tweet_count_log',
       'author.followers_count_log', 'author.following_count_log', 'anger_log',
       'fear_log', 'disgust_log', 'sadness_log', 'joy_log', 'pride_log',
       'hope_log', 'same_id', 'first_author_id', 'anger_first', 'fear_first',
       'disgust_first', 'sadness_first', 'joy_first', 'pride_first',
       'hope_first', 'author.tweet_count_first',
       'author.tweet_count_first_log'],
      dtype='object')

In [21]:
# This frame has two author-id columns. Both must be swapped for pseudonyms:
# previously only 'same_author_id' was mapped, so 'first_author_id' was
# exported with the original ids.
for id_col in ('same_author_id', 'first_author_id'):
    same_replies[id_col] = same_replies[id_col].map(id_map)
same_replies.to_csv(join(dst, 'same_author_replies.csv'), index=False)

In [23]:
# zip the data/ directory as data.zip
# base_dir='data' restricts the archive to the data/ folder. With only
# root_dir='../' the whole parent directory was archived, not just the
# exported files.
shutil.make_archive('../data', 'zip', root_dir='../', base_dir='data')

'/home/luehring/emomis-discussion-analysis/data.zip'