In [40]:
import pandas as pd
import re

In [84]:
def convert_replies_id_to_sorted_text(list_of_id, data_df):
    """Look up the text of the given tweet ids in ``data_df``.

    Args:
        list_of_id: A list of tweet ids, or a single id (any non-list value
            is wrapped into a one-element list). Ids are compared against
            the string form of ``data_df['id']``.
        data_df: DataFrame providing the ``id``, ``created_at`` and ``text``
            columns.

    Returns:
        The ``text`` values of the matching rows as a list, ordered by
        ascending ``created_at``. Ids with no matching row contribute nothing.
    """
    # Normalise a bare id into a one-element list so ``isin`` can be used.
    wanted_ids = list_of_id if type(list_of_id) is list else [list_of_id]
    # The id column is cast to str before matching.
    id_matches = data_df['id'].astype(str).isin(wanted_ids)
    chronological = data_df[id_matches].sort_values(by=["created_at"])
    return chronological["text"].to_list()

def convert_replies_id_to_metrics(list_of_id, data_df):
    """Look up the ``public_metrics`` entries of the given tweet ids.

    Args:
        list_of_id: A list of tweet ids, or a single id (any non-list value
            is wrapped into a one-element list). Ids are compared against
            the string form of ``data_df['id']``.
        data_df: DataFrame providing the ``id``, ``created_at`` and
            ``public_metrics`` columns.

    Returns:
        The ``public_metrics`` values of the matching rows as a list, ordered
        by ascending ``created_at``. Ids with no matching row contribute
        nothing.
    """
    if type(list_of_id) is list:
        id_pool = list_of_id
    else:
        # A single id was passed; treat it as a one-element list.
        id_pool = [list_of_id]
    # Keep only the rows whose stringified id is one of the requested ids.
    matching_rows = data_df.loc[data_df['id'].astype(str).isin(id_pool)]
    # Oldest tweet first, then keep just the metrics column.
    return matching_rows.sort_values(by=["created_at"])["public_metrics"].to_list()

def remove_URL(original):
    """Strip URL-like tokens from ``original`` and return the cleaned text.

    Three kinds of tokens are removed, each up to the next whitespace
    character: ``http``/``https`` links, ``www.`` links and ``wasap.my``
    short links. Surrounding whitespace is left untouched, so removing a
    token in the middle of a sentence leaves two adjacent spaces.

    Args:
        original: The text to clean.

    Returns:
        ``original`` with every matching token replaced by an empty string.
    """
    result = re.sub(r"http\S+", "", original)
    # The dot is escaped so that it only matches a literal "."; unescaped it
    # matches any character and deletes ordinary words such as "awwwesome".
    result = re.sub(r"www\.\S+", "", result)
    # ``\S*`` removes the rest of the link (e.g. the path after the host)
    # instead of leaving it behind in the text.
    result = re.sub(r"wasap\.my\S*", "", result)
    return result

def join_data_id_label_v2(LABEL_PATH, ID_PATH, DATA_PATH):
    """Build one row per line of the id file with its texts, metrics and label.

    Args:
        LABEL_PATH: Path to a text file with one label per line, or a falsy
            value to skip labels. "rumour" is encoded as 1 and "nonrumour"
            as 0; any other value is kept unchanged.
        ID_PATH: Path to a text file where each line is a comma-separated
            list of tweet ids. The first id is the source tweet, the
            remaining ids (if any) are its replies.
        DATA_PATH: Path to a JSON file readable by ``pd.read_json``. The
            resulting frame is handed to ``convert_replies_id_to_sorted_text``
            and ``convert_replies_id_to_metrics``, which read its ``id``,
            ``created_at``, ``text`` and ``public_metrics`` columns.

    Returns:
        DataFrame with the columns ``source_id``, ``replies_id``,
        ``reply_text_list``, ``source_text``, ``metrics_replies``,
        ``metrics_source`` and, when ``LABEL_PATH`` is given, ``label``.
        Labels are attached by row position, so the label file is expected
        to have one line per line of the id file.
    """
    import warnings

    # Process labels
    y_label = None
    if LABEL_PATH:
        # 1: Rumour
        # 0: NonRumour
        label_codes = {"rumour": 1, "nonrumour": 0}
        with open(LABEL_PATH, "r") as f:
            raw_labels = f.read().strip().split("\n")
        # Encode on the plain Python list; unknown labels pass through as-is.
        # dtype=object keeps the integer codes as Python ints.
        y_label = pd.Series(
            [label_codes.get(raw, raw) for raw in raw_labels],
            dtype=object,
            name="label",
        )

    # One [source_id, replies_id] pair per line. A list is used instead of a
    # dict because the same source id can appear on several lines.
    total_id_list = []
    with open(ID_PATH, "r") as f:
        for line in f:
            # First id is the source tweet; the (possibly empty) rest are replies.
            source_id, *replies_id = line.strip().split(',')
            total_id_list.append([source_id, replies_id])

    # Resolve every id list to its texts and metrics, oldest tweet first.
    source_df = pd.DataFrame(total_id_list, columns=['source_id', 'replies_id'])
    data_df = pd.read_json(DATA_PATH)
    source_df["reply_text_list"] = source_df["replies_id"].apply(convert_replies_id_to_sorted_text, data_df=data_df)
    source_df["source_text"] = source_df["source_id"].apply(convert_replies_id_to_sorted_text, data_df=data_df)
    source_df["metrics_replies"] = source_df["replies_id"].apply(convert_replies_id_to_metrics, data_df=data_df)
    source_df["metrics_source"] = source_df["source_id"].apply(convert_replies_id_to_metrics, data_df=data_df)

    if y_label is not None:
        if len(y_label) != len(source_df):
            # Index alignment would silently produce NaN or dropped labels;
            # make the mismatch visible (e.g. a label file from another split).
            warnings.warn(
                f"Label file has {len(y_label)} rows but id file has "
                f"{len(source_df)} rows; labels are aligned by position."
            )
        source_df["label"] = y_label
    return source_df

def meta_preprocess(IDS, DATA, LABELS=False):
    """Run all preprocessing steps and return the resulting DataFrame.

    The frame is produced by ``join_data_id_label_v2(LABELS, IDS, DATA)`` and
    then extended with two plain-text columns:

    * ``source``  - the entries of ``source_text`` joined with a space,
      with URLs removed by ``remove_URL``.
    * ``replies`` - the entries of ``reply_text_list`` joined with a space,
      with URLs removed by ``remove_URL``.

    No language filtering is performed here.
    """
    def _flatten_and_clean(texts):
        # Collapse a list of tweet texts into one URL-free string.
        return remove_URL(" ".join(texts))

    data = join_data_id_label_v2(LABELS, IDS, DATA)
    column_pairs = (("source", "source_text"), ("replies", "reply_text_list"))
    for target_col, list_col in column_pairs:
        data[target_col] = data[list_col].apply(_flatten_and_clean)

    return data

In [123]:
# Input files for the training split.
TRAIN_DATA_PATH = "./full_data/data_storage/full_train.json"
Y_TRAIN_ID_PATH = "./id_data/train.label.txt"
X_TRAIN_ID_PATH = "./id_data/train.data.txt"   # train
# The training ids must be paired with the training labels. Passing the dev
# label path here would misalign the labels and fail on a fresh kernel,
# because that name is only defined in a later cell.
data_train = meta_preprocess(X_TRAIN_ID_PATH, TRAIN_DATA_PATH, LABELS=Y_TRAIN_ID_PATH)

In [122]:
# Input files for the dev split: id lists, labels and the tweet JSON.
X_DEV_ID_PATH = "./id_data/dev.data.txt"   # dev
Y_DEV_ID_PATH = "./id_data/dev.label.txt"
DEV_DATA_PATH = "./full_data/data_storage/full_dev.json"
# Same preprocessing pipeline as the other splits, with labels attached.
data_dev = meta_preprocess(IDS=X_DEV_ID_PATH, DATA=DEV_DATA_PATH, LABELS=Y_DEV_ID_PATH)

In [136]:
# Input files for the test split (no label path is defined in this cell).
X_TEST_ID_PATH = "./id_data/test.data.txt"   # test
TEST_DATA_PATH = "./full_data/data_storage/test_data.json"
# Load the raw test tweets unprocessed for inspection; X_TEST_ID_PATH is
# only declared here and not used by this cell.
data_test = pd.read_json(path_or_buf=TEST_DATA_PATH)
data_test

Unnamed: 0,created_at,id,id_str,text,truncated,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,...,retweeted,possibly_sensitive,lang,extended_entities,quoted_status_id,quoted_status_id_str,quoted_status,possibly_sensitive_appealable,withheld_in_countries,author_id
0,2020-01-17 17:53:32+00:00,1218229883484155913,1218229883484155904,Q: What is a #coronavirus?\n\nA: Coronaviruses...,True,"{'hashtags': [{'text': 'coronavirus', 'indices...",Twitter Web App,,,,...,False,0.0,en,,,,,,,14499829
1,2020-01-17 18:10:42+00:00,1218234203361480704,1218234203361480704,Q: What is a novel #coronavirus?\n\nA: A novel...,True,"{'hashtags': [{'text': 'coronavirus', 'indices...",Twitter Web App,1.218230e+18,1.218230e+18,14499829.0,...,False,0.0,en,,,,,,,14499829
2,2020-01-17 19:14:05+00:00,1218250153901076482,1218250153901076480,@WHOThailand @WHOKobe @WHOSEARO @WHOWPRO Q: Wh...,True,"{'hashtags': [{'text': 'coronavirus', 'indices...",Twitter Web App,1.218234e+18,1.218234e+18,14499829.0,...,False,0.0,en,,,,,,,14499829
3,2020-01-17 19:36:05+00:00,1218255692831903744,1218255692831903744,Q: Is there a treatment for a novel #coronavir...,True,"{'hashtags': [{'text': 'coronavirus', 'indices...",Twitter for iPhone,1.218250e+18,1.218250e+18,14499829.0,...,False,0.0,en,,,,,,,14499829
4,2020-01-17 20:30:40+00:00,1218269428166602753,1218269428166602752,Q: What can I do to protect myself from #coron...,False,"{'hashtags': [{'text': 'coronavirus', 'indices...",Twitter for iPhone,1.218256e+18,1.218256e+18,14499829.0,...,False,0.0,en,"{'media': [{'id': 1218269422605012994, 'id_str...",,,,,,14499829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8154,2015-12-27 17:24:29+00:00,681163757855338496,681163757855338496,@TMZ @Chillinlaidback @katyperry you're the be...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...",Twitter for Android,6.811478e+17,6.811478e+17,16331010.0,...,False,,en,,,,,,,8729442
8155,2015-12-27 17:52:09+00:00,681170718843899904,681170718843899904,@TMZ This story makes my heart smile. Great jo...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...",Twitter Web Client,6.811478e+17,6.811478e+17,16331010.0,...,False,,en,,,,,,,4218264718
8156,2015-12-27 22:42:47+00:00,681243858156556289,681243858156556288,@TMZ Katy s great !!!! Taylor swift would of m...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...",Twitter for iPad,6.811478e+17,6.811478e+17,16331010.0,...,False,,en,,,,,,,141024650
8157,2016-01-08 23:55:40+00:00,685610853979164672,685610853979164672,@EndlessPain89 @ComplexMag is that so?,False,"{'hashtags': [], 'symbols': [], 'user_mentions...",Twitter for iPhone,6.519601e+17,6.519601e+17,395645110.0,...,False,,en,,,,,,,1538152554


In [57]:
def get_metrics(metric_dicts):
    """Flatten the values of several metric dicts into one list.

    Args:
        metric_dicts: Iterable of mappings; each one must provide
            ``.values()``.

    Returns:
        A single list holding every mapping's values in iteration order,
        or an empty list when ``metric_dicts`` is empty.
    """
    return [value for metric_dict in metric_dicts for value in metric_dict.values()]

0            [0, 1, 0, 0]
1        [184, 35, 44, 0]
2            [1, 1, 4, 1]
3         [121, 5, 23, 0]
4            [1, 0, 6, 0]
              ...        
1890      [76, 30, 56, 0]
1891    [108, 36, 148, 0]
1892      [176, 9, 63, 0]
1893                   []
1894         [0, 3, 0, 0]
Name: metrics_source, Length: 1895, dtype: object

In [106]:
# Flatten each row's list of source-tweet metric dicts into a flat list.
data_train["metrics"] = data_train["metrics_source"].apply(get_metrics)
# Get only the instances that have a metrics_source that's nonempty.
# The mask must be built before it is used, otherwise the cell raises a
# NameError on a fresh kernel.
cond = data_train["metrics"].apply(lambda l: len(l) > 0)
# Boolean indexing drops the rows with empty metrics; dropna() additionally
# removes rows that contain a missing value in any column.
data_train = data_train[cond].dropna()
data_train_meta = data_train[["metrics", "label"]]

In [115]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Features are the flattened metric lists, the target is the label column.
X = data_train_meta["metrics"]
y = data_train_meta["label"]
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# Show the training labels as a plain Python list.
y_train.values.tolist()

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,


In [116]:
# The estimator is fed plain Python lists: one list of metric values per
# row and one label per row.
train_features = X_train.values.tolist()
train_labels = y_train.values.tolist()
clf = LogisticRegression(random_state=0).fit(train_features, train_labels)

In [118]:
# Score the fitted classifier on the held-out split.
test_features = X_test.values.tolist()
test_labels = y_test.values.tolist()
clf.score(test_features, test_labels)

0.8083067092651757