<a href="https://colab.research.google.com/github/ksnugroho/feel-in/blob/main/twitter-scraper/03_tweet_processor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Tweet Processor**

In [None]:
# Mount Google Drive at /content/drive (with force_remount=True) so that the
# following cells can read and write files stored on Drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Work from the root of the mounted Drive: every relative path used below
# (twitter-scraper/..., data/...) is resolved against this directory.
%cd /content/drive/MyDrive/

In [None]:
# https://pypi.org/project/tweet-preprocessor/
!pip install -q tweet-preprocessor

In [None]:
import glob
import os
import pandas as pd
import preprocessor as p
from tqdm import tqdm
tqdm.pandas()

In [None]:
def preprocess_tweet(row):
    """Return the cleaned text of the ``tweet`` field of ``row``.

    The tweet-preprocessor options are set on every call, so the result does
    not depend on options configured elsewhere: cleaning is done with the
    MENTION, RESERVED, HASHTAG, URL, EMOJI and SMILEY options.

    Parameters
    ----------
    row
        Any object indexable by ``'tweet'`` (here: a DataFrame row passed by
        ``apply(..., axis=1)``).

    Returns
    -------
    Whatever ``p.clean`` returns for that tweet.
    """
    p.set_options(
        p.OPT.MENTION,
        p.OPT.RESERVED,
        p.OPT.HASHTAG,
        p.OPT.URL,
        p.OPT.EMOJI,
        p.OPT.SMILEY,
    )
    return p.clean(row['tweet'])

In [None]:
%%time

# Clean the per-emotion tweet files: read each merged CSV, drop duplicate and
# missing tweets, clean the text, and write the result to the clean folder.
source_folder_path = 'twitter-scraper/data-merge'
destination_folder_path = 'twitter-scraper/data-clean'

emotion_list = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

# to_csv does not create missing directories, so make sure the output folder
# exists before spending time on the cleaning itself.
os.makedirs(destination_folder_path, exist_ok=True)

for emotion in emotion_list:
    # Only the tweet text is needed; force it to str so nothing is type-inferred.
    df = pd.read_csv(f'{source_folder_path}/{emotion}.csv', usecols=['tweet'], dtype={'tweet':'str'})
    df = df.drop_duplicates()   # dropping duplicates
    df = df.dropna()            # dropping rows with NaN values
    df['tweet'] = df.progress_apply(preprocess_tweet, axis=1)   # clean tweet
    df.to_csv(f'{destination_folder_path}/{emotion}.csv')

print('Done!')

100%|██████████| 98795/98795 [00:13<00:00, 7267.50it/s]
100%|██████████| 98780/98780 [00:09<00:00, 10181.92it/s]
100%|██████████| 98839/98839 [00:08<00:00, 11072.90it/s]
100%|██████████| 98961/98961 [00:09<00:00, 10990.41it/s]
100%|██████████| 99321/99321 [00:09<00:00, 11007.88it/s]
100%|██████████| 98823/98823 [00:08<00:00, 11005.87it/s]


Done!
CPU times: user 1min 4s, sys: 825 ms, total: 1min 5s
Wall time: 1min 17s


In [None]:
%%time
# Raw dump 1000000-2020-01-01-2020-12-31: load only the tweet text (as str),
# then remove duplicate rows and rows with missing values.
df_1 = (
    pd.read_csv(
        'twitter-scraper/data-raw/1000000-2020-01-01-2020-12-31.csv',
        usecols=['tweet'],
        dtype={'tweet':'str'},
        lineterminator='\n',
    )
    .drop_duplicates()
    .dropna()
)

# Clean every remaining tweet, showing a progress bar while doing so.
df_1['tweet'] = df_1.progress_apply(preprocess_tweet, axis=1)

# Store the cleaned tweets in the data-clean folder.
df_1.to_csv('twitter-scraper/data-clean/1000000-2020-01-01-2020-12-31-clean.csv')
print('Done!')

100%|██████████| 986920/986920 [01:30<00:00, 10907.26it/s]


Done!
CPU times: user 1min 33s, sys: 1.03 s, total: 1min 34s
Wall time: 1min 37s


In [None]:
# (rows, columns) of df_1, i.e. what is left after de-duplication and NaN removal.
df_1.shape

(986920, 1)

In [None]:
%%time
# Raw dump 2500000-2019-01-01-2019-12-31: load only the tweet text (as str),
# then remove duplicate rows and rows with missing values.
df_2 = (
    pd.read_csv(
        'twitter-scraper/data-raw/2500000-2019-01-01-2019-12-31.csv',
        usecols=['tweet'],
        dtype={'tweet':'str'},
        lineterminator='\n',
    )
    .drop_duplicates()
    .dropna()
)

# Clean every remaining tweet, showing a progress bar while doing so.
df_2['tweet'] = df_2.progress_apply(preprocess_tweet, axis=1)

# Store the cleaned tweets in the data-clean folder.
# NOTE(review): the file name carries a stray '.csv' before '-clean.csv',
# unlike the 2020 file; kept as-is because existing files use this name.
df_2.to_csv('twitter-scraper/data-clean/2500000-2019-01-01-2019-12-31.csv-clean.csv')
print('Done!')

100%|██████████| 2341534/2341534 [03:37<00:00, 10765.78it/s]


Done!
CPU times: user 3min 49s, sys: 4.2 s, total: 3min 54s
Wall time: 4min 1s


In [None]:
# (rows, columns) of df_2, i.e. what is left after de-duplication and NaN removal.
df_2.shape

(2341534, 1)

In [None]:
# Find all CSV files in the data-clean folder. glob returns its matches in
# arbitrary order, so sort them to make the order of the merged rows
# reproducible from run to run.
files = sorted(glob.glob(os.path.join(destination_folder_path, '*.csv')))
files

['twitter-scraper/data-clean/anger.csv',
 'twitter-scraper/data-clean/disgust.csv',
 'twitter-scraper/data-clean/fear.csv',
 'twitter-scraper/data-clean/joy.csv',
 'twitter-scraper/data-clean/sadness.csv',
 'twitter-scraper/data-clean/surprise.csv',
 'twitter-scraper/data-clean/1000000-2020-01-01-2020-12-31-clean.csv',
 'twitter-scraper/data-clean/2500000-2019-01-01-2019-12-31.csv-clean.csv']

In [None]:
# Merge all cleaned files into a single frame. Each file is read the same way
# as in the cleaning cells: only the 'tweet' column and always as text. This
# skips the index column that to_csv wrote and keeps digit-only tweets from
# being parsed as numbers, which would let the same tweet survive
# de-duplication once as a number and once as a string.
df_final = pd.concat(
    (pd.read_csv(path, usecols=['tweet'], dtype={'tweet': 'str'}) for path in files),
    ignore_index=True,
)
df_final = df_final.drop_duplicates()   # dropping duplicates across files
df_final = df_final.dropna()            # dropping rows with NaN values
df_final.shape

(3126987, 1)

In [None]:
# Write the merged corpus. to_csv does not create missing directories, so
# create the target folder first instead of failing at the very last step.
os.makedirs('data/id-tweet-dump', exist_ok=True)
df_final.to_csv('data/id-tweet-dump/id-tweet-dump-clean.csv')