In [40]:
import sklearn
import pandas as pd
import re
TRAIN_DATA_PATH = "./full_data/data_storage/full_train.json"
Y_TRAIN_ID_PATH = "./id_data/train.label.txt"
X_TRAIN_ID_PATH = "./id_data/train.data.txt"   # train

DEV_DATA_PATH = "./full_data/data_storage/full_dev.json"
Y_DEV_ID_PATH = "./id_data/dev.label.txt"
X_DEV_ID_PATH = "./id_data/dev.data.txt"   # dev

TEST_DATA_PATH = "./full_data/data_storage/test_full_v2.json"
X_TEST_ID_PATH = "./id_data/test.data.txt"   # test

In [48]:
def convert_replies_id_to_sorted_text(list_of_id, data_df):
    """Convert a list of tweet ids to a list of its corresponding text in data_df
    Sort by tweets original create time"""
    if type(list_of_id) is not list:
        # if input is a single id convert type
        list_of_id = [list_of_id]
    # Select dataframe based on a list of ids
    selected_df = data_df[data_df['id'].astype(str).isin(list_of_id)]
    # Sort by their created time
    sorted_df = selected_df.sort_values(by = ["created_at"])
    # Select text fields only
    text_list = sorted_df["text"].to_list()
    return text_list

def convert_replies_id_to_metrics(list_of_id, data_df):
    """Convert a list of tweet ids to a list of its corresponding public_metrics in data_df
    Sort by tweets original create time"""
    if type(list_of_id) is not list:
        # if input is a single id convert type
        list_of_id = [list_of_id]
    # Select dataframe based on a list of ids
    selected_df = data_df[data_df['id'].astype(str).isin(list_of_id)]
    # Sort by their created time
    sorted_df = selected_df.sort_values(by = ["created_at"])
    # Select metric field only
    metric_list = sorted_df["public_metrics"].to_list()
    return metric_list

def remove_URL(original):
    """Remove url link in the text"""
    result = re.sub(r"http\S+", "", original)
    result = re.sub(r"www.\S+", "", result)
    result = re.sub(r"wasap.my+", "", result)
    return result

def join_data_id_label_v2(LABEL_PATH, ID_PATH, DATA_PATH):
    # Process Labels
    if LABEL_PATH:
        # 1: Rumour
        # 0: NonRumour
        with open(LABEL_PATH, "r") as f:
            y_label = f.read().strip().split("\n") # remove next line
        y_label = pd.DataFrame(y_label, columns = ["label"])
        y_label[y_label["label"]=="rumour"] = 1
        y_label[y_label["label"]=="nonrumour"] = 0

    ## Get Dataframe Id, with first id as source Id, and values as replies, not using dict since we have duplicated keys
    total_id_list = []
    with open(ID_PATH, "r") as f:
        for line in f:
            line = line.strip().split(',') # remove next line
            source_id = line[0]
            if len(line) > 1:
                # if we have replies id
                replies_id = line[1:]
            else:
                replies_id = []
            row = [source_id, replies_id]
            total_id_list.append(row)
    len(total_id_list)
    
    ## Create a dataframe containing a list of replies
    source_df = pd.DataFrame(total_id_list, columns = ['source_id', 'replies_id'])
    data_df = pd.read_json(DATA_PATH)
    source_df["reply_text_list"] = source_df["replies_id"].apply(convert_replies_id_to_sorted_text, data_df = data_df)
    source_df["source_text"]     = source_df["source_id"].apply(convert_replies_id_to_sorted_text, data_df = data_df)
    source_df["metrics"]         = source_df["replies_id"].apply(convert_replies_id_to_metrics, data_df = data_df)
    source_df["metrics_source"]  = source_df["source_id"].apply(convert_replies_id_to_metrics, data_df = data_df)

    if LABEL_PATH:
      source_df["label"] = y_label
    return source_df

def meta_preprocess(IDS, DATA, LABELS=False):
    """Function to combine all the preprocessing steps"""
    data = join_data_id_label_v2(LABELS, IDS, DATA)
    ## 1. Only keep english tweets as most of them are in english
    # Use only text data and remove URLs
    data["source"] = data["source_text"].apply(" ".join).apply(remove_URL) 
    data["replies"] = data["reply_text_list"].apply(" ".join).apply(remove_URL)
    
    
    if LABELS:
      data = data[['source','replies', 'metrics_source', 'metrics','label']]
    else:
      data = data[['source','replies', 'metrics_source', 'metrics']]
    
    return data

In [49]:
data_train = meta_preprocess(X_TRAIN_ID_PATH, TRAIN_DATA_PATH, LABELS=Y_TRAIN_ID_PATH)
data_train

Unnamed: 0,source,replies,metrics_source,metrics,label
0,5. Can regularly rinsing your nose with saline...,4. Can eating garlic help prevent infection wi...,"[{'retweet_count': 0, 'reply_count': 1, 'like_...","[{'retweet_count': 0, 'reply_count': 1, 'like_...",0
1,French police chief killed himself after #Char...,@Telegraph How very sad. @Telegraph @Telegraph...,"[{'retweet_count': 184, 'reply_count': 35, 'li...","[{'retweet_count': 0, 'reply_count': 0, 'like_...",1
2,Coronavirus disease (COVID-19) advice for the ...,Infection control for suspected or confirmed C...,"[{'retweet_count': 1, 'reply_count': 1, 'like_...","[{'retweet_count': 0, 'reply_count': 1, 'like_...",0
3,Ottawa police confirm that there were multiple...,@WSJ Killers go berserk when cornered. Hencef...,"[{'retweet_count': 121, 'reply_count': 5, 'lik...","[{'retweet_count': 1, 'reply_count': 1, 'like_...",0
4,if the primary focus of a government isn't to ...,,"[{'retweet_count': 1, 'reply_count': 0, 'like_...",[],0
...,...,...,...,...,...
1890,Desperate Ted Cruz Claims Planned Parenthood S...,@Bipartisanism \nDesperate! @Bipartisanism Cr...,"[{'retweet_count': 76, 'reply_count': 30, 'lik...","[{'retweet_count': 0, 'reply_count': 0, 'like_...",1
1891,"""Thoughts and prayers are not enough."" Pres. O...",.@ABC has anyone else noticed mass shootings s...,"[{'retweet_count': 108, 'reply_count': 36, 'li...","[{'retweet_count': 0, 'reply_count': 1, 'like_...",1
1892,Police have surrounded this building where the...,@NBCNews bury them in their hole @NBCNews @Wik...,"[{'retweet_count': 176, 'reply_count': 9, 'lik...","[{'retweet_count': 0, 'reply_count': 0, 'like_...",0
1893,,@Kirstenjoyweiss @MattFabrication @prestone85 ...,[],"[{'retweet_count': 0, 'reply_count': 1, 'like_...",0


In [51]:
data_train[["metrics_source", "metrics", "label"]]

Unnamed: 0,metrics_source,metrics,label
0,"[{'retweet_count': 0, 'reply_count': 1, 'like_...","[{'retweet_count': 0, 'reply_count': 1, 'like_...",0
1,"[{'retweet_count': 184, 'reply_count': 35, 'li...","[{'retweet_count': 0, 'reply_count': 0, 'like_...",1
2,"[{'retweet_count': 1, 'reply_count': 1, 'like_...","[{'retweet_count': 0, 'reply_count': 1, 'like_...",0
3,"[{'retweet_count': 121, 'reply_count': 5, 'lik...","[{'retweet_count': 1, 'reply_count': 1, 'like_...",0
4,"[{'retweet_count': 1, 'reply_count': 0, 'like_...",[],0
...,...,...,...
1890,"[{'retweet_count': 76, 'reply_count': 30, 'lik...","[{'retweet_count': 0, 'reply_count': 0, 'like_...",1
1891,"[{'retweet_count': 108, 'reply_count': 36, 'li...","[{'retweet_count': 0, 'reply_count': 1, 'like_...",1
1892,"[{'retweet_count': 176, 'reply_count': 9, 'lik...","[{'retweet_count': 0, 'reply_count': 0, 'like_...",0
1893,[],"[{'retweet_count': 0, 'reply_count': 1, 'like_...",0


In [56]:
def get_metrics(metric_dicts):
    metrics = []
    for metric_dict in metric_dicts:
        metrics += metric_dict.values()

data_train["metrics_source"].apply(get_metrics)

0       None
1       None
2       None
3       None
4       None
        ... 
1890    None
1891    None
1892    None
1893    None
1894    None
Name: metrics_source, Length: 1895, dtype: object