## Data Preprocessing for Network Analysis

In [1]:
import re
import json
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

from itertools import chain
from collections import Counter

In [2]:
## Register tqdm with pandas; later cells call `progress_apply`, which this enables.
tqdm.pandas()

In [3]:
## Cap the number of rows pandas shows when a DataFrame is displayed.
pd.options.display.max_rows = 10

In [4]:
def read_pickle(path):
    """Load and return the object stored in the pickle file at `path`.

    The file is opened in binary mode and closed again before returning.
    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(path, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload

In [6]:
## Load the two parsed tweet tables from their pickle files.
## NOTE(review): `d_6725` is read from parsed_7003.pkl -- the variable name and the file name
## disagree; confirm which one is intended.
d_199 = read_pickle("../data/supports/parsed_199.pkl")
d_6725 = read_pickle("../data/supports/parsed_7003.pkl")

In [None]:
## 43 screen_names overlap between the two tables (author's note; the count is not checked here)
d_6725[d_6725.screen_name.isin(d_199.screen_name.unique())].screen_name.unique()

In [None]:
## Stack both tables row-wise; sort=False leaves the columns unsorted.
d_tweets = pd.concat([d_199, d_6725], axis = 0, sort=False)

In [None]:
## remove duplicates: keep only the first row for each id_tweet
d_tweets.drop_duplicates(subset=["id_tweet"], keep='first', inplace=True)

In [None]:
## Renumber the rows from 0 and discard the old index.
d_tweets.reset_index(drop=True, inplace=True)

In [None]:
d_tweets.columns

In [None]:
## Columns to keep, in order: the tweet itself, the quoted tweet (quote_*), then the reply target (in_reply_to_*).
column_ordered = ['screen_name', 'id_tweet', 'full_text', 'hashtags', 'user_mentions', 'created_at',
                 'quote_is_quote_status', 'quote_screen_name', 'quote_id_tweet', 'quote_full_text',
                  'quote_hashtags', 'quote_user_mentions', 'quote_created_at', 
                  "in_reply_to_status_id_str", "in_reply_to_user_id_str", "in_reply_to_screen_name"]

In [None]:
## Restrict d_tweets to those columns, in that order.
d_tweets = d_tweets[column_ordered]

In [None]:
d_tweets

In [None]:
## Keep tweets that mention at least one user or that quote another user's tweet.
## The quote test uses notna() so both sides of `|` are boolean masks; OR-ing a boolean mask with
## the raw quote_screen_name column (presumably strings or NaN -- `interaction` checks for str)
## is not a well-defined boolean operation.
## .copy() makes the selection an independent frame, so the "interaction_list" column added to it
## later is not written onto a slice of d_tweets (avoids SettingWithCopyWarning).
d_user_mention_quote = d_tweets[
    (d_tweets.user_mentions.apply(len) > 0) | d_tweets.quote_screen_name.notna()
].copy()

In [None]:
def interaction(row):
    """Build the "source - target" labels for one tweet.

    Reads three attributes from `row`:
      * screen_name: the author of the tweet (source of every label),
      * user_mentions: an iterable of mappings, each with a "screen_name" key,
      * quote_screen_name: the quoted user's name; ignored unless it is a str.

    Returns a list with one "author - mentioned" string per mention, in order,
    followed by "author - quoted" when a quoted screen name is present.
    """
    author = row.screen_name
    mentions = row.user_mentions
    quoted = row.quote_screen_name

    labels = [" - ".join((author, mention["screen_name"])) for mention in mentions]

    # Missing quotes are not strings (e.g. NaN), so only a real name adds a label.
    if isinstance(quoted, str):
        labels.append(" - ".join((author, quoted)))

    return labels

In [None]:
d_user_mention_quote.shape

In [None]:
d_user_mention_quote

In [None]:
## Call `interaction` on every row (axis = 1) with a tqdm progress bar and store the returned
## list of "source - target" labels per tweet.
## NOTE(review): d_user_mention_quote is a filtered selection of d_tweets; unless that selection
## was copied, this assignment may trigger pandas' SettingWithCopyWarning.
d_user_mention_quote.loc[:, "interaction_list"] = d_user_mention_quote.progress_apply(interaction, axis = 1)

In [None]:
## Pull the per-tweet lists out of the frame as a plain Python list (one inner list per tweet).
list_of_interactions = d_user_mention_quote.loc[:, "interaction_list"].to_list()

In [None]:
## Count how often each "source - target" label occurs across all tweets.
## chain.from_iterable walks the inner lists lazily, so no flattened copy is built and the outer
## list is not unpacked into one huge argument list.
interactions_count = Counter(chain.from_iterable(list_of_interactions))

In [None]:
## One row per distinct "source - target" label, with the number of times it was counted.
## keys() and values() of the same Counter iterate in matching order, so the columns line up.
d_connection = pd.DataFrame(data = {"connection": list(interactions_count.keys()), 
                                    "frequency": list(interactions_count.values())})

In [None]:
d_connection.shape

In [None]:
d_connection.head()

In [None]:
## Save the table as CSV without the index column.
d_connection.to_csv("../data/supports/connection.csv", index=False)

## Preparing data for tweet network

In [None]:
## Keep tweets that are a reply to another tweet or that quote another tweet.
## .copy() makes the selection an independent frame, so the "interaction_list" column added to it
## later is not written onto a slice of d_tweets (avoids SettingWithCopyWarning).
d_tweet_connection = d_tweets[
    d_tweets.in_reply_to_status_id_str.notna() | d_tweets.quote_id_tweet.notna()
].copy()

In [None]:
def tweet_interaction(df):
    """Describe the reply and quote links that start at a single tweet.

    Despite its name, `df` is one record read by attribute: id_tweet,
    in_reply_to_status_id_str, quote_id_tweet, and, only when the matching
    link exists, screen_name plus in_reply_to_screen_name or
    quote_screen_name.

    Returns a list of dicts, reply first and quote second, each with:
      * "id_tweets": (this tweet's id, target tweet's id),
      * "screen_name": (this tweet's author, target author),
      * "type": "in_reply" or "in_quote".
    A link is emitted only when its target id is a str, so missing ids
    (e.g. NaN) yield nothing.
    """
    source_id = df.id_tweet
    reply_target = df.in_reply_to_status_id_str
    quote_target = df.quote_id_tweet

    # (target id, attribute holding the target's screen name, link type)
    candidates = (
        (reply_target, "in_reply_to_screen_name", "in_reply"),
        (quote_target, "quote_screen_name", "in_quote"),
    )

    links = []
    for target_id, name_attr, kind in candidates:
        if not isinstance(target_id, str):
            continue
        links.append({
            "id_tweets": (source_id, target_id),
            "screen_name": (df.screen_name, getattr(df, name_attr)),
            "type": kind,
        })

    return links

In [None]:
## Call `tweet_interaction` on every row (axis = 1) with a tqdm progress bar and store the
## returned list of link dicts per tweet.
## NOTE(review): d_tweet_connection is a filtered selection of d_tweets; unless that selection
## was copied, this assignment may trigger pandas' SettingWithCopyWarning.
d_tweet_connection.loc[:, "interaction_list"] = d_tweet_connection.progress_apply(tweet_interaction, axis = 1)

In [None]:
## Flatten the per-tweet link lists into one list of link dicts.
## chain.from_iterable iterates the column directly, so no intermediate list copy is built and
## nothing is unpacked into a huge argument list.
id_tweet_interaction = list(chain.from_iterable(d_tweet_connection.interaction_list))

In [None]:
## Write the flattened link list to disk as JSON (the tuples inside each link become JSON arrays).
with open("../data/supports/id_tweet_connection.json", mode='w') as json_file:
    json.dump(id_tweet_interaction, json_file)