# Preprocess Data

In [136]:
import pandas as pd
import json
import functools
import seaborn as sns
import datetime
import math
import numpy as np
import preprocessor as p

In [2]:
# Load the training threads; with lines=True every line of the JSONL file
# is parsed as one JSON document and becomes one row of the frame.
data: pd.DataFrame = pd.read_json(
    "./project_data/train.data.jsonl",
    lines=True,
)

In [87]:
def fn_extractTxt(series: pd.Series):
    """Pull the ``"text"`` field out of every tweet of one thread.

    Args:
        series: The tweets of a thread. Each entry is either a tweet dict
            or a missing-value placeholder (``None`` or NaN).

    Returns:
        A new ``pd.Series`` with a default integer index and the same
        length as ``series``. Each position holds the tweet's ``"text"``
        value, or ``None`` when the entry is not a dict or has no
        ``"text"`` key.
    """
    # isinstance (rather than `is not None`) also covers NaN placeholders,
    # which would otherwise fail on `.get`.
    return pd.Series([
        tweet.get("text") if isinstance(tweet, dict) else None
        for tweet in series
    ])


def fn_sortTweetsChronological(series: pd.Series):
    """Order the tweets of one thread by their ``created_at`` timestamp.

    Args:
        series: The tweets of a thread. Each entry is either a tweet dict
            carrying a ``"created_at"`` string such as
            ``"Wed Jan 07 12:00:00 +0000 2015"``, or a missing-value
            placeholder (``None`` or NaN).

    Returns:
        A ``pd.Series`` of dtype ``object`` with the same length and the
        same index as ``series``. Tweets come first, oldest to newest
        (ties keep their input order), followed by ``None`` for every
        missing entry.

    Raises:
        ValueError: If a ``created_at`` string does not match the
            expected format.
        TypeError: If a tweet has no ``created_at`` string.
    """
    timestamp_format = '%a %b %d %H:%M:%S %z %Y'

    # Keep only real tweets, so the sort never mixes up positions between
    # a compacted list and the original series.
    tweets = [tweet for tweet in series if isinstance(tweet, dict)]
    tweets.sort(
        key=lambda tweet: datetime.datetime.strptime(
            tweet.get("created_at"), timestamp_format
        )
    )

    # Re-attach the ORIGINAL index in its original order. Returning the
    # values under reordered labels would let DataFrame.apply(axis=1)
    # realign them by label and silently undo the sort.
    padding = [None] * (len(series) - len(tweets))
    return pd.Series(tweets + padding, index=series.index, dtype=object)


def fn_concatTweet(series: pd.Series, sep: str = ''):
    """Join the texts of one thread into a single string.

    Args:
        series: Tweet texts of a thread; entries may be strings or
            missing-value placeholders (``None`` or NaN).
        sep: String inserted between consecutive texts. Defaults to the
            empty string, i.e. the texts are glued together directly.

    Returns:
        The non-empty string entries of ``series`` joined by ``sep``, in
        order. An empty string if there are none.
    """
    # NaN is a truthy float, so a plain truthiness filter would let it
    # through and make str.join raise; keep only non-empty strings.
    return sep.join(
        text for text in series if isinstance(text, str) and text
    )


In [71]:
# Peek at the first cell (row 0, column 0) of the raw frame to see what a
# single tweet record looks like.
data.iloc[0, 0]

{'contributors': None,
 'truncated': False,
 'text': 'How to respond to the murderous attack on Charlie Hebdo? Every newspaper in the free world should print this. http://t.co/sC2ot63F6j',
 'in_reply_to_status_id': None,
 'id': 552800070199148544,
 'favorite_count': 77,
 'source': '<a href="http://www.apple.com" rel="nofollow">iOS</a>',
 'retweeted': False,
 'coordinates': None,
 'entities': {'symbols': [],
  'user_mentions': [],
  'hashtags': [],
  'urls': [],
  'media': [{'expanded_url': 'http://twitter.com/Heresy_Corner/status/552800070199148544/photo/1',
    'display_url': 'pic.twitter.com/sC2ot63F6j',
    'url': 'http://t.co/sC2ot63F6j',
    'media_url_https': 'https://pbs.twimg.com/media/B6vwvCVIQAASBJx.jpg',
    'id_str': '552800070153027584',
    'sizes': {'small': {'h': 408, 'resize': 'fit', 'w': 340},
     'large': {'h': 472, 'resize': 'fit', 'w': 393},
     'medium': {'h': 472, 'resize': 'fit', 'w': 393},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'indices': 

In [134]:
# Reorder the tweets of every thread (one row per thread) chronologically.
data_sorted = data.apply(fn_sortTweetsChronological, axis=1)
# Replace NaN placeholders with None so the `is not None` checks in the
# downstream helpers treat missing cells correctly.
data_sorted = data_sorted.where(pd.notnull(data_sorted), None)
# Read each thread's id from column 0 of the RAW frame (which appears to
# hold the thread's source tweet), so the label lookup does not depend on
# how the chronological reordering arranges the columns.
sourceIds = [tweet.get("id_str") for tweet in data.iloc[:, 0]]

In [88]:
# Row by row, swap every tweet dict for just its text.
text_df = data_sorted.apply(
    fn_extractTxt,
    axis=1,
)

In [91]:
# Collapse each thread (row) into one string, then record its length in
# characters.
concatenated_threads = text_df.apply(fn_concatTweet, axis=1)
text_df_concatenate = pd.DataFrame(concatenated_threads, columns=["text"])
text_df_concatenate["length"] = text_df_concatenate["text"].str.len()

In [92]:
# Summary statistics of the thread lengths (count, mean, quartiles, max),
# used to judge how long the concatenated threads are.
text_df_concatenate.describe()

Unnamed: 0,length
count,4641.0
mean,1677.63844
std,2118.619686
min,45.0
25%,600.0
50%,1255.0
75%,1985.0
max,38058.0


In [112]:
def clean_thread(series: pd.Series):
    """Run every thread text through the tweet preprocessor ``p``.

    Args:
        series: Thread texts to clean, one string per entry.

    Returns:
        A new ``pd.Series`` (default integer index) holding the result of
        ``p.clean`` for each entry, in the same order.
    """
    # Configure which token classes get cleaned:
    # URL, EMOJI, SMILEY, NUMBER, MENTION
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER, p.OPT.MENTION)
    return pd.Series([p.clean(raw_thread) for raw_thread in series])


def truncate(series: pd.Series, max_sequence_length=1024):
    """Shorten over-long threads by keeping their beginning and their end.

    Lengths are measured in characters. A thread longer than
    ``max_sequence_length`` is replaced by its first
    ``max_sequence_length // 2`` characters followed by its last
    ``max_sequence_length - max_sequence_length // 2`` characters, so the
    result is exactly ``max_sequence_length`` characters long. Shorter
    threads are returned unchanged.

    Args:
        series: Iterable of thread strings.
        max_sequence_length: Maximum number of characters to keep per
            thread. Must be non-negative.

    Returns:
        A list with one (possibly shortened) string per input thread, in
        the same order.

    Raises:
        ValueError: If ``max_sequence_length`` is negative.
    """
    if max_sequence_length < 0:
        raise ValueError("max_sequence_length must be non-negative")

    head_len = max_sequence_length // 2
    tail_len = max_sequence_length - head_len

    truncated_thread_list = []
    for thread in series:
        if len(thread) > max_sequence_length:
            # Slice the tail from an explicit start offset: `thread[-0:]`
            # would return the whole string when tail_len is 0.
            thread = thread[:head_len] + thread[len(thread) - tail_len:]
        truncated_thread_list.append(thread)

    return truncated_thread_list

In [113]:
# Clean every concatenated thread and keep the result in a one-column frame.
cleaned_threads = clean_thread(text_df_concatenate["text"])
text_df_concatenate_cleaned = pd.DataFrame(cleaned_threads, columns=["text"])

In [114]:
# Cap the cleaned threads at the default maximum sequence length.
text_concatenate_cleaned_final = truncate(
    text_df_concatenate_cleaned["text"]
)

In [131]:
# Load the label mapping. JSON files are UTF-8, so state the encoding
# explicitly instead of relying on the platform default.
with open("./project_data/train.label.json", encoding="utf-8") as f:
    labels = json.load(f)

In [137]:
# Look up the label of every thread by its source id, preserving row order.
# A source id missing from `labels` raises KeyError.
corresponding_labels = [
    labels[source_id]
    for source_id in sourceIds
]



In [147]:
# Stack texts and labels as two rows, then transpose so that each row of
# the result is one (text, label) pair.
final_data = np.array(
    [text_concatenate_cleaned_final, corresponding_labels]
).transpose()
final_data.shape

(4641, 2)