In [1]:
import pandas as pd

In [2]:
SOURCE_TWITTER_PATH = "./full_data/data_storage/full_train_source_only.json"
FULL_TWITTER_PATH = "./full_data/data_storage/full_train.json"
Y_TRAIN_PATH = "./id_data/train.label.txt"
X_TRAIN_PATH = "./id_data/train.data.txt"   # train
SOURCE_STORY_ONLY = True

In [3]:
def join_data_id_label( LABEL_PATH, ID_PATH, DATA_PATH, dropna_on_column="text", SOURCE_STORY_ONLY=True):
    """A function that joins data and id and their labels, return a dataframe that trims off twitter we don't have data"""
    # Process Labels
    # 1: Rumour
    # 0: NonRumour
    with open(LABEL_PATH, "r") as f:
        y_label = f.read().strip().split("\n") # remove next line
    y_label = pd.DataFrame(y_label, columns = ["label"])
    y_label[y_label["label"]=="rumour"] = 1
    y_label[y_label["label"]=="nonrumour"] = 0
    
    ## Get Dataframe Id
    total_id_list = []
    with open(ID_PATH, "r") as f:
        for line in f:
            line = line.strip() # remove next line
            if SOURCE_STORY_ONLY:
                line = line.split(',')[0] # split into list\
            else:
                line = line.split(',')
            total_id_list.append(line)
    if not SOURCE_STORY_ONLY:
        total_id_list = [item for sublist in total_id_list for item in sublist] # Flat into a single list
    total_id_list = total_id_list
    id_df = pd.DataFrame(total_id_list, columns=["id"])
    id_df["id"] = id_df["id"].astype(str)
    # join their labels
    id_df["label"] = y_label["label"]
    
    ## Get Twitter Data
    # Read Source Data
    data_df = pd.read_json(DATA_PATH)
    #source_df = pd.concat([source_df, y_train], axis=1)
    data_df["id"] = data_df["id"].astype(str)
    
    ## Join the data file and IDs we got 
    ## Drop rows we don't have data on
    combined_df = id_df.join(data_df.set_index('id'), on='id').dropna(subset=['text', 'author_id']).reset_index(drop=True)
    
    return combined_df

In [4]:
def Remove_URL(original):
    """Remove url link in the text"""
    result = re.sub(r"http\S+", "", original)
    result = re.sub(r"www.\S+", "", result)
    result = re.sub(r"wasap.my+", "", result)
    return result

In [5]:
# Reference: https://stackoverflow.com/questions/27084617/detect-strings-with-non-english-characters-in-python
def isEnglish(s):
    """Check if a string is english"""
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [6]:
## Remove Stopwords
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()

def text_processing(input_str):
    """A function takes a input text string and output a list of preprocessed words
        1. Remove URL
        2. StopWords Removal
        3. Remove Non-English Words
        4. Remove Numeric Value
        5. Lemmatize
    
    """
    list_words = []
    # Remove URL
    input_str = Remove_URL(input_str)
    for word in re.sub(r'[^\w\s]','', input_str).split():
        # Remove Numeric Value, Non-English words and Stop words
        if (word.isalpha()) and (isEnglish(word)) and (word not in stopwords.words('english')):
            # Remove Links in the text
            # Lemmatize words
            word = lemmatizer.lemmatize(word)
            # Add to List
            list_words.append(word)
    return list_words

#filtered_words = [word for word in word_list if word not in stopwords.words('english')]

In [7]:
## Expand list of words into bag-of-words array
def get_bag_of_words(df_column):
    """Expand list of words into bag-of-words array, return the dataframe."""
    return pd.get_dummies(df_column.apply(pd.Series).stack()).groupby(level=0).sum()


In [8]:
# Load Data
X_train = join_data_id_label(Y_TRAIN_PATH, X_TRAIN_PATH, SOURCE_TWITTER_PATH)


In [9]:
print(X_train[["lang"]].value_counts())
print("============")
print(X_train[["withheld"]].value_counts())
print("============")
print(X_train[["reply_settings"]].value_counts())
print("============")
print(X_train[["source"]].value_counts())

## Only keep English! 

lang
en      1547
und        4
es         2
in         2
ro         2
hu         1
it         1
ja         1
ru         1
zh         1
dtype: int64
withheld
False       1551
True          11
dtype: int64
reply_settings
everyone          1562
dtype: int64
source                 
Twitter Web App            406
Twitter for Android        291
Twitter for iPhone         286
TweetDeck                  165
Twitter Web Client         110
                          ... 
The New York Times           1
TheLatestIs                  1
TweetCaster for Android      1
Sendible                     1
AOL Blogsmith                1
Length: 61, dtype: int64


###  Standard Scaler to do PCA to do dimension reduction

In [10]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [11]:
## Standardize, why? it needs to do to use PCA, also, it needs to have variance of 1
scaler = StandardScaler()

pca = PCA(n_components=100, random_state=1)

# Only Use Text Data
other fields including author information, entities, public matrix, and source not used!

### Process Training Data use above functions

In [12]:
## How to normalize json field in the data? 

## NOT YET USED!
#pd.json_normalize(X_train['public_metrics']).head(3)

In [13]:
# Load Data
X_train = join_data_id_label(Y_TRAIN_PATH, X_TRAIN_PATH, SOURCE_TWITTER_PATH)
## 1. Only keep english tweets as most of them are in english
## 2. Based on the values count, drop irrelevant features "reply_settings" too
X_train = X_train[X_train.lang=="en"].reset_index(drop=True)
X_train = X_train[['id','lang', 'text','label']]
# Use only text data and tokenize
X_train["clean_text"] = X_train["text"].apply(text_processing)
X_train.head(5)

Unnamed: 0,id,lang,text,label,clean_text
0,1250219300389974016,en,5. Can regularly rinsing your nose with saline...,0,"[Can, regularly, rinsing, nose, saline, help, ..."
1,554886875303780352,en,French police chief killed himself after #Char...,1,"[French, police, chief, killed, CharlieHebdo, ..."
2,1237901309011021825,en,Coronavirus disease (COVID-19) advice for the ...,0,"[Coronavirus, disease, advice, public, Should,..."
3,524958128392376320,en,Ottawa police confirm that there were multiple...,0,"[Ottawa, police, confirm, multiple, suspect, s..."
4,1239295488677085185,en,if the primary focus of a government isn't to ...,0,"[primary, focus, government, isnt, alleviate, ..."


In [14]:
# Load Data
X_train = join_data_id_label(Y_TRAIN_PATH, X_TRAIN_PATH, SOURCE_TWITTER_PATH)


## 1. Only keep english tweets as most of them are in english
## 2. Based on the values count, drop irrelevant features "reply_settings" too
X_train = X_train[X_train.lang=="en"].reset_index(drop=True)
X_train = X_train[['id','lang', 'text','label']]
# Use only text data and tokenize
X_train["clean_text"] = X_train["text"].apply(text_processing)
# Get BOF dataframe
X_train_BOW = get_bag_of_words(X_train["clean_text"])
X_train_BOW.head(5)

Unnamed: 0,A,AB,ABC,ABCNews,ABOUT,AC,ACCORDING,ACCURATE,ACTED,AFFECT,...,youd,young,younger,yourbabazg,youre,youve,zaelefty,zero,zoom,zwinst
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Get train test data
y_train = X_train["label"]
X_train = X_train_BOW

assert(len(X_train) == len(y_train))

In [16]:
## Transform using Train Data

train_columns = X_train.columns

# StandardScaler
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=train_columns)

# PCA
X_train_scaled_reduced = pd.DataFrame(pca.fit_transform(X_train_scaled))

### Dev Data Load

In [17]:
def keep_training_columns_only(test_BOW_df, train_column, default_value=0):
    """Take a BOW dataframe and keep only the words from training dataframe, return a dataframe"""
    return test_BOW_df.reindex(columns = train_columns, fill_value=0)

In [18]:
DEV_SOURCE_TWITTER_PATH = "./full_data/data_storage/full_dev_source_only.json"
#DEV_FULL_TWITTER_PATH = "./full_data/data_storage/full_dev.json"
Y_DEV_PATH = "./id_data/dev.label.txt"
X_DEV_PATH = "./id_data/dev.data.txt"

In [22]:
# Load Data
X_dev = join_data_id_label(Y_DEV_PATH, X_DEV_PATH, DEV_SOURCE_TWITTER_PATH)


## 1. Only keep english tweets as most of them are in english
## 2. Based on the values count, drop irrelevant features "reply_settings" too
X_dev = X_dev[X_dev.lang=="en"].reset_index(drop=True)
X_dev = X_dev[['id','lang', 'text','label']]
# Use only text data and tokenize
X_dev["clean_text"] = X_dev["text"].apply(text_processing)
# Get BOF dataframe
X_dev_BOW = get_bag_of_words(X_dev["clean_text"])

## Keep training columns only
X_dev_BOW = keep_training_columns_only(X_dev_BOW, train_columns, default_value=0)

X_dev_BOW.head(5)

Unnamed: 0,A,AB,ABC,ABCNews,ABOUT,AC,ACCORDING,ACCURATE,ACTED,AFFECT,...,youd,young,younger,yourbabazg,youre,youve,zaelefty,zero,zoom,zwinst
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Get train test data
y_dev = X_dev["label"]
X_dev = X_dev_BOW

assert(len(X_dev) == len(y_dev))

In [24]:
## Transform using Trained Transformer

# StandardScaler
X_dev_scaled = pd.DataFrame(scaler.transform(X_dev), columns=train_columns)
# PCA
X_dev_scaled_reduced = pd.DataFrame(pca.transform(X_dev_scaled))