## Parsing Tweet Data into One Line Table

Analyzing tweet connections from tweets, retweets, and replies.
This notebook is intended to support social network analysis and to show how many users interact with each other.

In [1]:
import os
import re
import glob
import json
import pickle
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Cap DataFrame previews at 10 rows so large tables do not flood the output.
pd.set_option("display.max_rows", 10)

In [14]:
# glob returns matches in a filesystem-dependent order; sort them so the row
# order of the parsed table is the same on every machine and every re-run.
tweet_file_paths = sorted(glob.glob("../data/7K_INDO/*.json"))

In [15]:
# Count how many JSON files the glob matched.
len(tweet_file_paths)

7003

In [16]:
def read_json(path):
    """Load the JSON document stored at ``path`` and return the decoded object.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        content = json.load(handle)
    return content

In [17]:
def _dig(container, *keys):
    """Follow ``keys`` through nested dicts.

    Returns ``(True, value)`` when every key along the path exists, otherwise
    ``(False, None)``. Only a missing key or a non-dict intermediate value
    counts as "absent"; nothing else is silenced.
    """
    node = container
    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return False, None
        node = node[key]
    return True, node


def parsing_tweet(tweet):
    """Flatten one tweet dict into a single-level record.

    Parameters
    ----------
    tweet : dict
        Tweet object. It must contain ``user.screen_name``, ``id_str``,
        ``full_text``, ``entities.hashtags``, ``entities.user_mentions``,
        ``created_at``, ``in_reply_to_status_id_str`` and ``is_quote_status``.

    Returns
    -------
    dict
        Always holds ``screen_name``, ``id_tweet``, ``full_text``,
        ``hashtags``, ``user_mentions`` and ``created_at``.
        The three ``in_reply_to_*`` keys are added when
        ``in_reply_to_status_id_str`` is truthy.
        ``quote_is_quote_status`` is added when ``is_quote_status`` is truthy,
        together with whichever ``quote_*`` fields can be found under
        ``quoted_status``.

    Raises
    ------
    KeyError
        If one of the required top-level fields listed above is missing.
    """
    parsed = {
        "screen_name": tweet["user"]["screen_name"],
        "id_tweet": tweet["id_str"],
        "full_text": tweet["full_text"],
        "hashtags": tweet["entities"]["hashtags"],
        "user_mentions": tweet["entities"]["user_mentions"],
        "created_at": tweet["created_at"],
    }

    if tweet["in_reply_to_status_id_str"]:
        for reply_key in ("in_reply_to_status_id_str",
                          "in_reply_to_user_id_str",
                          "in_reply_to_screen_name"):
            parsed[reply_key] = tweet[reply_key]

    if tweet["is_quote_status"]:
        parsed["quote_is_quote_status"] = tweet["is_quote_status"]

        # Output column -> key path inside tweet["quoted_status"].
        quote_fields = (
            ("quote_screen_name", ("user", "screen_name")),
            ("quote_id_tweet", ("id_str",)),
            ("quote_full_text", ("full_text",)),
            ("quote_hashtags", ("entities", "hashtags")),
            ("quote_user_mentions", ("entities", "user_mentions")),
            ("quote_created_at", ("created_at",)),
        )
        # A quote flag can be set while "quoted_status" (or some of its
        # fields) is absent; skip only those missing fields instead of
        # swallowing every exception with a bare ``except``.
        for column, key_path in quote_fields:
            found, value = _dig(tweet, "quoted_status", *key_path)
            if found:
                parsed[column] = value

    return parsed

In [18]:
# Flatten the tweets, retweets and replies of every file into one list of
# parsed records, in that order for each file.
list_of_parsed_data = []
for path in tqdm(tweet_file_paths):
    data = read_json(path)
    list_of_parsed_data += [parsing_tweet(item) for item in data["tweets"]]
    list_of_parsed_data += [parsing_tweet(item) for item in data["retweets"]]
    list_of_parsed_data += [parsing_tweet(item) for item in data["replies"]]

100%|██████████████████████████████████████████████████████████████████████████████| 7003/7003 [06:25<00:00, 18.16it/s]


## Convert parsed tweets to a DataFrame

In [19]:
# Final column order of the table: the tweet itself, then the quoted tweet,
# then the reply target.
column_ordered = [
    # tweet
    "screen_name",
    "id_tweet",
    "full_text",
    "hashtags",
    "user_mentions",
    "created_at",
    # quoted tweet
    "quote_is_quote_status",
    "quote_screen_name",
    "quote_id_tweet",
    "quote_full_text",
    "quote_hashtags",
    "quote_user_mentions",
    "quote_created_at",
    # reply target
    "in_reply_to_status_id_str",
    "in_reply_to_user_id_str",
    "in_reply_to_screen_name",
]

In [20]:
# One row per parsed record; keys a record does not have become missing values.
d_tweets = pd.DataFrame(list_of_parsed_data)

In [21]:
# Put the columns in the fixed order defined by column_ordered.
d_tweets = d_tweets.loc[:, column_ordered]

In [22]:
# (rows, columns) of the assembled table.
d_tweets.shape

(1747731, 16)

In [23]:
# Preview the first five rows.
d_tweets.head()

Unnamed: 0,screen_name,id_tweet,full_text,hashtags,user_mentions,created_at,quote_is_quote_status,quote_screen_name,quote_id_tweet,quote_full_text,quote_hashtags,quote_user_mentions,quote_created_at,in_reply_to_status_id_str,in_reply_to_user_id_str,in_reply_to_screen_name
0,007koteka,1277097851726643201,#HariJadiTwitterSaya \n28 Juni 2017 https://t....,"[{'text': 'HariJadiTwitterSaya', 'indices': [0...",[],Sun Jun 28 04:33:49 +0000 2020,,,,,,,,,,
1,007koteka,1275054629370261504,Penjilat yang Berkedok Agama\n\n#PecatTengkuzu...,"[{'text': 'PecatTengkuzulDariMUI', 'indices': ...",[],Mon Jun 22 13:14:47 +0000 2020,,,,,,,,,,
2,007koteka,1274709422296596480,600 Orang Ikut Rapid Test Polri Saat CFD di Ja...,[],[],Sun Jun 21 14:23:03 +0000 2020,,,,,,,,,,
3,007koteka,1274381838027382784,"Sebelum kemunculan HRS, tak ada catatan mengen...",[],"[{'screen_name': 'GusNadjb', 'name': 'GusNadjb...",Sat Jun 20 16:41:21 +0000 2020,,,,,,,,,,
4,007koteka,1274368391462051840,Selamat Ulang Tahun ke-59\nPresiden Republik I...,"[{'text': 'HUTJokowi', 'indices': [235, 245]},...","[{'screen_name': 'jokowi', 'name': 'Joko Widod...",Sat Jun 20 15:47:55 +0000 2020,,,,,,,,,,


Save file

In [24]:
# Persist the parsed table. Create the target folder first so that a missing
# directory does not throw away the slow parsing step with FileNotFoundError.
output_path = "../data/supports/parsed_7003.pkl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "wb") as file:
    pickle.dump(d_tweets, file)