In [20]:
import pandas as pd
ID_PATH   = "./task2/data/covid.data.txt"
DATA_PATH = "./task2/data/full_covid.json"

In [25]:
def convert_replies_id_to_sorted_text(list_of_id, data_df):
    """Convert a list of tweet ids to a list of its corresponding text in data_df
    Sort by tweets original create time"""
    if type(list_of_id) is not list:
        # if input is a single id convert type
        list_of_id = [list_of_id]
    # Select dataframe based on a list of ids
    selected_df = data_df[data_df['id'].astype(str).isin(list_of_id)]
    # Sort by their created time
    sorted_df = selected_df.sort_values(by = ["created_at"])
    # Select text fields only
    text_list = sorted_df["text"].to_list()
    return text_list


def remove_URL(original):
    """Remove url link in the text"""
    result = re.sub(r"http\S+", "", original)
    result = re.sub(r"www.\S+", "", result)
    result = re.sub(r"wasap.my+", "", result)
    return result

def join_data_id_label_v2(LABEL_PATH, ID_PATH, DATA_PATH):
    # Process Labels
    if LABEL_PATH:
        # 1: Rumour
        # 0: NonRumour
        with open(LABEL_PATH, "r") as f:
            y_label = f.read().strip().split("\n") # remove next line
        y_label = pd.DataFrame(y_label, columns = ["label"])
        y_label[y_label["label"]=="rumour"] = 1
        y_label[y_label["label"]=="nonrumour"] = 0

    ## Get Dataframe Id, with first id as source Id, and values as replies, not using dict since we have duplicated keys
    total_id_list = []
    with open(ID_PATH, "r") as f:
        for line in f:
            line = line.strip().split(',') # remove next line
            source_id = line[0]
            if len(line) > 1:
                # if we have replies id
                replies_id = line[1:]
            else:
                replies_id = []
            row = [source_id, replies_id]
            total_id_list.append(row)
    len(total_id_list)
    
    ## Create a dataframe containing a list of replies
    source_df = pd.DataFrame(total_id_list, columns = ['source_id', 'replies_id'])
    data_df = pd.read_json(DATA_PATH)
    source_df["reply_text_list"] = source_df["replies_id"].apply(convert_replies_id_to_sorted_text, data_df = data_df)
    source_df["source_text"] = source_df["source_id"].apply(convert_replies_id_to_sorted_text, data_df = data_df)

    if LABEL_PATH:
      source_df["label"] = y_label
    return source_df

def preprocess(IDS, DATA, LABELS=False):
    """Function to combine all the preprocessing steps"""
    data = join_data_id_label_v2(LABELS, IDS, DATA)
    ## 1. Only keep english tweets as most of them are in english
    # Use only text data and remove URLs
    #data["source"] = data["source_text"].apply(" ".join).apply(p.clean) 
    #data["replies"] = data["reply_text_list"].apply(" ".join).apply(p.clean)
    data["source"] = data["source_text"].apply(" ".join)
    data["replies"] = data["reply_text_list"].apply(" ".join)
    
    return data

In [26]:
## Remove Stopwords
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()

def text_processing(input_str):
    """A function takes a input text string and output a list of preprocessed words
        1. Remove URL
        2. StopWords Removal
        3. Remove Non-English Words
        4. Remove Numeric Value
        5. Lemmatize
    
    """
    list_words = []
    # Remove URL
    input_str = Remove_URL(input_str)
    for word in re.sub(r'[^\w\s]','', input_str).split():
        # Remove Numeric Value, Non-English words and Stop words
        if (word.isalpha()) and (isEnglish(word)) and (word not in stopwords.words('english')):
            # Remove Links in the text
            # Lemmatize words
            word = lemmatizer.lemmatize(word)
            # Add to List
            list_words.append(word)
    return list_words

In [27]:
data = preprocess(ID_PATH, DATA_PATH)

In [28]:
data.head() 

Unnamed: 0,source_id,replies_id,reply_text_list,source_text,source,replies
0,1272262651100434433,[],[],"[According to the New York Times, Warner Bros....","According to the New York Times, Warner Bros. ...",
1,1287153210990395392,[1287191952115605505],[@TexasTribune Guess what the cause of death i...,[Hurricane Hanna has made landfall in Texas.\n...,Hurricane Hanna has made landfall in Texas.\n\...,@TexasTribune Guess what the cause of death is...
2,1266555444283179008,[],[],[Monkeys on the loose in India with stolen cor...,Monkeys on the loose in India with stolen coro...,
3,1257715199655755779,"[1258212704961155073, 1257843417503141895, 125...",[@BelAkinyii When was the last time you washed...,[],,@BelAkinyii When was the last time you washed ...
4,1274338812173393920,"[1274369294558801921, 1274413352828186624, 127...",[@HeidiNBC These Trump fans have a right to at...,"[“If Trump felt comfortable having it here, th...","“If Trump felt comfortable having it here, the...",@HeidiNBC These Trump fans have a right to att...


In [29]:
data.to_json(path_or_buf="./task2/data/processed_covid.json")

In [30]:
pd.read_json("./task2/data/processed_covid.json")

Unnamed: 0,source_id,replies_id,reply_text_list,source_text,source,replies
0,1272262651100434432,[],[],"[According to the New York Times, Warner Bros....","According to the New York Times, Warner Bros. ...",
1,1287153210990395392,[1287191952115605505],[@TexasTribune Guess what the cause of death i...,[Hurricane Hanna has made landfall in Texas.\n...,Hurricane Hanna has made landfall in Texas.\n\...,@TexasTribune Guess what the cause of death is...
2,1266555444283179008,[],[],[Monkeys on the loose in India with stolen cor...,Monkeys on the loose in India with stolen coro...,
3,1257715199655755776,"[1258212704961155073, 1257843417503141895, 125...",[@BelAkinyii When was the last time you washed...,[],,@BelAkinyii When was the last time you washed ...
4,1274338812173393920,"[1274369294558801921, 1274413352828186624, 127...",[@HeidiNBC These Trump fans have a right to at...,"[“If Trump felt comfortable having it here, th...","“If Trump felt comfortable having it here, the...",@HeidiNBC These Trump fans have a right to att...
...,...,...,...,...,...,...
17453,1249502859185590272,"[1249578608563126272, 1249537088128573441, 124...",[@funder Wonder how many lives could have been...,[I wonder how many lives could’ve been saved i...,I wonder how many lives could’ve been saved if...,@funder Wonder how many lives could have been ...
17454,1284050414619459584,"[1284173350751928320, 1284415121192898560, 128...",[@NadineDorries @thetimes Inadequate supplies ...,[The @thetimes front page on 17th March. The f...,The @thetimes front page on 17th March. The fi...,@NadineDorries @thetimes Inadequate supplies o...
17455,1274505289614725120,[1274548079648223243],[@DNCWarRoom Fact check: Chinese is not a race...,[Trump just completed the racism trifecta in a...,Trump just completed the racism trifecta in a ...,@DNCWarRoom Fact check: Chinese is not a race....
17456,1267884642637676544,[1267892868603092994],[@Jess__Taylor__ @davidallengreen Eck! What ar...,[Here are a few of my photographs from today’s...,Here are a few of my photographs from today’s ...,@Jess__Taylor__ @davidallengreen Eck! What are...
