# Clean tweets


In [1]:
import datetime
import emoji
import json_lines
import numpy as np
import os
import pandas as pd
import pickle
import re
import string
import time
from tqdm import tqdm

In [2]:
# --------------------------------
# ---- DEFINE HELPER FUNCTION ----
# --------------------------------

# 1 - TEXT PROCESSING
def clean_text(text):    
    if type(text) == str:
        allchars = [str for str in text]
        emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
        clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
        clean_text = clean_text.replace('\n\r', ' ') # remove line breaks
        clean_text = re.sub(r"\bhttps://t.co/\w+", '', clean_text) # remove urls
        clean_text = re.sub("&amp", '', clean_text) # clean punctuations
        clean_text = re.sub("’", "'", clean_text)
        clean_text = re.sub("\'", "'", clean_text)        
        clean_text = re.sub("‘", "'", clean_text)
        clean_text = re.sub("”", '"', clean_text) 
        clean_text = re.sub("“", '"', clean_text) 
        clean_text = re.sub("—", '-', clean_text)
        clean_text = re.sub("&gt", '', clean_text)        
        clean_text = re.sub('"{2,}', '"', clean_text)
        clean_text = re.sub("'{2,}", "'", clean_text)
        clean_text = re.sub(" {2,}", " ", clean_text)        
        
    if type(text) != str:        
        clean_text = None
    return clean_text

# 2 - TIME PROCESSING
# covert 'created_at' column to datetime format
def time_to_date(time_str):
    new_time_dt     = []
    new_time_string = time_str.replace('+0000 ','')  
    i_time = datetime.datetime.strptime(new_time_string, '%a %b %d %H:%M:%S %Y')
    i_time_str = datetime.datetime.strftime(i_time, '%m/%d/%Y')
    new_time_dt = datetime.datetime.strptime(i_time_str, '%m/%d/%Y')
    
    return new_time_dt

# 3 - EXTRACT ATTRIBUTES FROM JSONL
def load_jsonl(file, total_line):
    tweets = []
    with open(file, 'rb') as f:
        for tweet in tqdm(json_lines.reader(f, broken=True), total = total_line):
            reduced_tweet = {"created_at" : time_to_date(tweet["created_at"]),
                             "id_str"     : tweet["id_str"],
                             "full_text"  : clean_text(tweet["full_text"]), # clean tweets
                             "retweet_count" : tweet["retweet_count"],
                             "favorite_count": tweet["favorite_count"],
                             "lang"       : tweet["lang"],                                                          
                             "user_id_str"  : tweet["user"]["id_str"],
                             "user_name"    : tweet["user"]["screen_name"],
                             "user_location": clean_text(tweet["user"]["location"]), # clean address in user profiles
                             "user_follower": tweet["user"]["followers_count"]}
            
            if tweet["place"] is not None:
                reduced_tweet["place_type"] = tweet["place"]["place_type"]
                reduced_tweet["place_name"] = tweet["place"]["full_name"]
                reduced_tweet["place_country"] = tweet["place"]["country"]
                
            if tweet["coordinates"] is not None:
                reduced_tweet["coord_lon"]  = tweet["coordinates"]["coordinates"][0]
                reduced_tweet["coord_lat"]  = tweet["coordinates"]["coordinates"][1]
                                         
            if "retweeted_status" in tweet:           
                reduced_tweet["full_text"]            = clean_text(tweet['retweeted_status']['full_text'])        
                reduced_tweet["retweeted_created_at"] = time_to_date(tweet["retweeted_status"]["created_at"])
                reduced_tweet["retweeted_id_str"]     = tweet['retweeted_status']['id_str']
                reduced_tweet["retweeted_user_id"]    = tweet["retweeted_status"]["user"]["id_str"]
                reduced_tweet["retweeted_username"]   = tweet["retweeted_status"]["user"]["screen_name"]               
                
            if "quoted_status" in tweet:                           
                reduced_tweet["quoted_created_at"] = time_to_date(tweet["quoted_status"]["created_at"])
                reduced_tweet["quoted_id_str"]     = tweet['quoted_status']['id_str']     
                reduced_tweet["quoted_user_id"]    = tweet["quoted_status"]["user"]["id_str"]
                reduced_tweet["quoted_username"]   = tweet['quoted_status']['user']['screen_name']                
                
            if tweet["in_reply_to_status_id_str"] is not None:
                reduced_tweet["reply_status"]   = tweet["in_reply_to_status_id_str"]
                reduced_tweet["reply_user_id"]  = tweet["in_reply_to_user_id_str"]
                reduced_tweet["reply_username"] = tweet["in_reply_to_screen_name"]

            tweets.append(reduced_tweet)      
        return (tweets)

In [5]:
# ---------------------------------------------------------
# ---- EXTRACT ATTRIBUTES FROM 4 LARGE RAW JSONL FILES ----
# ---------------------------------------------------------

raw_tweet_path = "D:/DL_tweets_data/0_raw_jsonl" # replace this directory with your own
reduced_tweet_path = "D:/DL_tweets_data/1_reduced_csv" # replace this directory with your own 
 
filenum = 0    
for file_name in os.listdir(raw_tweet_path):
    file_path  = raw_tweet_path + "/" + file_name
    
    
    # calculate total lines in each jsonl file
    with open(file_path, 'rb') as f:
        total_line = sum(1 for tweet in tqdm(json_lines.reader(f, broken=True)))  
        
    # use the load_jsonl function to extract attributes  
    reduced_tweet = load_jsonl(file_path, total_line)
    reduced_tweet_df = pd.DataFrame(reduced_tweet)
        
    # save python lists as csv files
    filenum += 1
    reduced_tweet_df.to_csv(reduced_tweet_path + "/" + "reduced_tweet_%d.csv" % (filenum), index= False)
    
    del reduced_tweet, reduced_tweet_df

7098819it [10:34, 11194.60it/s]
100%|██████████████████████████████████████████████████████████████████████| 7098819/7098819 [38:41<00:00, 3057.60it/s]
6944479it [11:28, 10079.95it/s]
100%|██████████████████████████████████████████████████████████████████████| 6944479/6944479 [42:03<00:00, 2751.97it/s]
6592348it [10:04, 10908.47it/s]
100%|██████████████████████████████████████████████████████████████████████| 6592348/6592348 [35:22<00:00, 3105.99it/s]
6729398it [10:12, 10994.01it/s]
100%|██████████████████████████████████████████████████████████████████████| 6729398/6729398 [36:41<00:00, 3056.49it/s]
