In [1]:
import pandas as pd
from os.path import join
import matplotlib.pyplot as plt
import pickle

In [None]:
src = "/data/german_newsguard_tweets/"
with open(join(src, "dtypes_config_pickle"), "rb") as file:
    DTYPES = pickle.load(file)

In [None]:
df = pd.read_csv(join(src, 
                      "german_newsguard_tweets.csv.gz"),
                 compression="gzip",
                          dtype=DTYPES)

len(df)

100000

In [14]:
df.columns

Index(['id', 'conversation_id', 'referenced_tweets.replied_to.id', 'author_id',
       'retweeted_user_id', 'impression_count', 'reply_count', 'retweet_count',
       'quote_count', 'like_count', 'author.followers_count',
       'author.following_count', 'author.listed_count', 'author.tweet_count',
       'author.verified_type', 'round', 'Rating', 'Rating_Date', 'anger',
       'fear', 'disgust', 'sadness', 'joy', 'enthusiasm', 'pride', 'hope',
       'type', 'status'],
      dtype='object')

In [None]:
# add validated orientation ratings
orientation = pd.read_csv("../inference/orientation_majority.csv",
                          usecols=["Domain", "Orientation"],
                          dtype=str)\
                    .rename(columns={"Domain": "domain"})

orientation

Unnamed: 0,domain,Orientation
0,2020news.de,Neutral
1,20min.ch,Neutral
2,aachener-nachrichten.de,Neutral
3,aachener-zeitung.de,Neutral
4,abendblatt.de,Neutral
...,...,...
319,ze.tt,Left
320,zeit.de,Left
321,zentrum-der-gesundheit.de,Neutral
322,zuerst.de,Right


In [5]:
# merge with df based on domain
df = df.merge(orientation, on="domain", how="left")

## Remove missing emotion scores

In [None]:
#remove the rows with missing values in emotions
df_na = df.dropna(subset=["anger", "fear", "disgust", "sadness",
                        "joy", "pride", "hope"])
len(df_na)

## Identify tweets containing NewsGuard domain

In [None]:
domains = pd.read_csv(join(src, 
                      "domain_tweets.csv.gz"),
                 compression="gzip",
                 usecols=["id"])
domains = set(domains["id"])

In [8]:
#create an empty column with default value "conversation"
df_na["step"] = "conversation"

In [9]:
#change value to "domain" if id is in domains set
df_na.loc[df_na["id"].isin(domains), "step"] = "domain"

In [10]:
df_na["step"].value_counts()

conversation    26054835
domain          12653563
Name: step, dtype: int64

## Keep conversations with NG starters

In [11]:
#subset only tweets that have type == starter and step == domain
df_starters = df_na[(df_na["type"] == "starter") & (df_na["step"] == "domain")]
len(df_starters)

10673952

In [12]:
# remove duplicated tweets
df_starters = df_starters.drop_duplicates(subset="id")
len(df_starters)

10579843

In [13]:
# remove tweets with missing value in Rating
df_starters = df_starters.dropna(subset=["Rating"])
len(df_starters)

9906260

In [None]:
# save
df_starters.to_csv(join(src, "discussions/discussions_starters.csv.gz"),
                     compression="gzip",
                     index=False)

In [15]:
#merge again to only keep the relevant conversations
df_merged = df_starters[["conversation_id"]]\
                    .merge(df_na, 
                            on="conversation_id",
                            how="left")
len(df_merged)

21041405

In [16]:
df_merged["status"].value_counts()

complete      19498936
incomplete     1542469
Name: status, dtype: int64

## Exclude incomplete discussions

In [17]:
#keep only complete discussions
df_complete = df_merged[df_merged["status"] == "complete"]
print(f'Removed {len(df_merged) - len(df_complete)} rows with incomplete discussions')
del df_merged

Removed 1542469 rows with incomplete discussions


In [19]:
print(f'Number of conversations: {df_complete["conversation_id"].nunique()}')
print(f'Number of tweets: {len(df_complete)}')

Number of conversations: 8624622
Number of tweets: 19498936


In [20]:
df_complete["Rating"]\
    .value_counts(normalize=True)

T    0.934525
N    0.065475
Name: Rating, dtype: float64

In [None]:
df_complete.to_csv(join(src, 
                            "discussions/discussions_complete.csv.gz"),
                            compression="gzip",
                            index=False)

## Remove discussions without responses

In [22]:
#remove columns with type == starter AND reply == 0
df_discussions = df_complete[~((df_complete["type"] == "starter") & 
                             (df_complete["reply_count"] == 0))]
print(f'Removed {len(df_complete) - len(df_discussions)} rows with no replies.')
del df_complete

Removed 7747467 rows with no replies.


In [23]:
print(f'Number of conversations: {df_discussions["conversation_id"].nunique()}')
print(f'Number of tweets: {len(df_discussions)}')

Number of conversations: 914066
Number of tweets: 11751469


In [24]:
df_discussions["type"].value_counts()

reply      10831242
starter      920227
Name: type, dtype: int64

In [25]:
df_discussions["Rating"]\
    .value_counts(normalize=True)

T    0.924594
N    0.075406
Name: Rating, dtype: float64

In [None]:
#save as csv in discussions directory
df_discussions.to_csv(join(src, 
                            "discussions/discussions_replies.csv.gz"),
                            compression="gzip",
                            index=False)