In [1]:
import re
import string
import contractions
from dataset import Dataset
from collections import defaultdict
from dataset_constants import *
import pickle

In [162]:
# Run when dataset_constants has been updated

# import importlib
# import dataset_constants
# importlib.reload(dataset_constants) # reload to update changes to the file
# from dataset_constants import *

In [156]:
# Run when dataset.py has been updated

# import importlib
# import dataset
# importlib.reload(dataset) # reload to update changes to the file
# from dataset import Dataset

## Load 2022 Dataset

In [5]:
# Install the Hugging Face `datasets` library with pip, run through
# sys.executable (the interpreter of the running kernel) so the package lands
# in the kernel's own environment.
import sys
!{sys.executable} -m pip install -q datasets

In [6]:
from datasets import load_dataset

# Load the 2016/2022 Filipino hate-speech dataset by its repository id.
# NOTE(review): the name `dataset` is rebound to a project `Dataset` object in
# the "Load Dataset" section further down, so cells that use `dataset` depend
# on execution order.
dataset = load_dataset("mapsoriano/2016_2022_hate_speech_filipino")

Downloading readme:   0%|          | 0.00/2.63k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.44M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/315k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/306k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/21773 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2800 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2810 [00:00<?, ? examples/s]

In [13]:
# Pull the three predefined splits out of the loaded dataset
train_dataset, validation_dataset, test_dataset = (
    dataset['train'],
    dataset['validation'],
    dataset['test'],
)

# Report how many examples each split holds as a sanity check
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 21773
Validation dataset size: 2800
Test dataset size: 2810


In [17]:
# Separate each split into its text column (features) and label column
X_train, Y_train = train_dataset['text'], train_dataset['label']
X_val, Y_val = validation_dataset['text'], validation_dataset['label']
X_test, Y_test = test_dataset['text'], test_dataset['label']

In [25]:
# Concatenate the splits in train, validation, test order so that the
# original split boundaries can later be recovered from the split sizes.
X = X_train + X_val + X_test
Y = Y_train + Y_val + Y_test

In [27]:
# Persist the concatenated (features, labels) pair so it can be reloaded later
file_path = 'uncleaned_2022dataset.pkl'
with open(file_path, "wb") as pickle_file:
    pickle.dump((X, Y), pickle_file)
print(f"Data saved to {file_path}")

Data saved to uncleaned_2022dataset.pkl


In [38]:
# Reload the pickled data through the project's Dataset wrapper; the split
# sizes are the train/validation/test sizes printed earlier in this section.
uncleaned_2022dataset = Dataset(
    full_data_path=file_path,
    from_scratch=False,
    split_sizes=[21773, 2800, 2810],
)
uncleaned_2022dataset.build()

# Full data first, then each split
X, Y = (
    uncleaned_2022dataset.get_features(),
    uncleaned_2022dataset.get_labels(),
)
X_train, Y_train = (
    uncleaned_2022dataset.get_features(split_type="train"),
    uncleaned_2022dataset.get_labels(split_type="train"),
)
X_val, Y_val = (
    uncleaned_2022dataset.get_features(split_type="val"),
    uncleaned_2022dataset.get_labels(split_type="val"),
)
X_test, Y_test = (
    uncleaned_2022dataset.get_features(split_type="test"),
    uncleaned_2022dataset.get_labels(split_type="test"),
)

Data loaded from uncleaned_2022dataset.pkl


# Load Dataset

In [144]:
# Build the dataset from the three split files named in dataset_constants
dataset = Dataset(
    train_path=TRAIN_DATASET_PATH,
    val_path=VALIDATION_DATASET_PATH,
    test_path=TEST_DATASET_PATH,
)
dataset.build()

In [30]:
X_train, Y_train = (
    dataset.get_features(split_type="train"),
    dataset.get_labels(split_type="train"),
)

# Show the first five samples and labels, then the size of each list
for preview in (X_train[:5], Y_train[:5], len(X_train), len(Y_train)):
    print(preview)

['GASTOS NI VP BINAY SA POLITICAL ADS HALOS P7-M NA\r\rInaasahan na ni Vice President Jejomar Binay na may mga taong... https://t.co/SDytgbWiLh', 'Mar Roxas TANG INA TUWID NA DAAN DAW .. EH SYA NGA DI STRAIGHT', 'Salamat sa walang sawang suporta ng mga taga makati! Ang Pagbabalik Binay In Makati #OnlyBinayInMakatiSanKaPa https://t.co/iwAOdtZPRE', '@rapplerdotcom putangina mo binay TAKBO PA', 'Binay with selective amnesia, forgetting about the past six years he spent preparing to be president.  #PiliPinasDebates2016']
[0, 1, 0, 1, 0]
10000
10000


In [31]:
X_val, Y_val = (
    dataset.get_features(split_type="val"),
    dataset.get_labels(split_type="val"),
)

# Show the first five samples and labels, then the size of each list
for preview in (X_val[:5], Y_val[:5], len(X_val), len(Y_val)):
    print(preview)

['Escudero denies betraying Poe after meeting with Binay |https://t.co/sKlXTIhHJa - Kare-kare at sinampalukan ang topic. Walang balimbing?', 'Hndi ko makita yung sa one more chance saka kay binay sa fb. Haist.', "Mar Roxas is now addressing the crowd gathered at Pasay City's Ulat sa Barangay 2016  https://t.co/VruZyJ2e2H", '@ImYourBaeMax perfect! Para makaharap ni Duterte ang mga Binay at makatikim ng mura #^%* i#*', '#OnlyBinayPriority4Ps Wag nating hayaan na maloko tayo ng mga pulitikong yan. Kay Binay na tayo']
[0, 1, 0, 0, 0]
4232
4232


In [32]:
X_test, Y_test = (
    dataset.get_features(split_type="test"),
    dataset.get_labels(split_type="test"),
)

# Show the first five samples and labels, then the size of each list
for preview in (X_test[:5], Y_test[:5], len(X_test), len(Y_test)):
    print(preview)

['Unshaded votes and votes for Mayor Duterte goes to Mar Roxas according to some reports of ballot tests.  #AyawSaDILAW', 'Na-Binay ??????\r#NoMoreChance https://t.co/msaaUGv0bS', "@itsmanj well there's other good choices like Duterte or Poe. But both of them are still undecided, I think? :( :(", 'Nognog. Pandak. Laki sa hirap. Pero corrupt. Yan si Binay!!!', 'Ex-Binay aide turns tables on Mercado | https://t.co/nyySAo54rL']
[1, 1, 0, 1, 0]
4232
4232


In [145]:
# Features and labels across all splits
X, Y = dataset.get_features(), dataset.get_labels()

# Sizes first, then the first five samples and labels
for preview in (len(X), len(Y), X[:5], Y[:5]):
    print(preview)

18464
18464
['GASTOS NI VP BINAY SA POLITICAL ADS HALOS P7-M NA\r\rInaasahan na ni Vice President Jejomar Binay na may mga taong... https://t.co/SDytgbWiLh', 'Mar Roxas TANG INA TUWID NA DAAN DAW .. EH SYA NGA DI STRAIGHT', 'Salamat sa walang sawang suporta ng mga taga makati! Ang Pagbabalik Binay In Makati #OnlyBinayInMakatiSanKaPa https://t.co/iwAOdtZPRE', '@rapplerdotcom putangina mo binay TAKBO PA', 'Binay with selective amnesia, forgetting about the past six years he spent preparing to be president.  #PiliPinasDebates2016']
[0, 1, 0, 1, 0]


# Helper Functions

In [46]:
def count_texts(texts, pattern_type):
    """Count how many texts contain at least one match of a named pattern.

    Args:
        texts: Iterable of strings to scan.
        pattern_type: Name of the pattern to look for. One of 'url',
            'username', 'numeric', 'html_tags', 'newlines', 'punctuation',
            'rt', 'possessive', 'haha', 'hashtag' or 'emoji'.

    Returns:
        The number of texts that contain the pattern at least once. This is a
        count of texts, not of individual matches.

    Raises:
        ValueError: If ``pattern_type`` is not one of the supported names.
    """
    patterns = {
        'url': r'https?://\S+|www\.\S+',
        # Either a literal @handle or the [USERNAME] placeholder
        'username': r'(@\S+|\[USERNAME\])',
        # Tokens made up of digits only
        'numeric': r'\b\d+\b',
        'html_tags': r'<.*?>+',
        'newlines': r'[\r\n]',
        # ASCII punctuation plus curly quotes, em dash and backtick
        'punctuation': r'[’—‘`%s]' % re.escape(string.punctuation),
        'rt': r'\b(rt|RT)\b',
        'possessive': r"('|’)s\b",
        'haha': r'haha',
        'hashtag': r'#\w+',
        'emoji': re.compile("["
                            "\U0001F600-\U0001F64F"  # emoticons
                            "\U0001F300-\U0001F5FF"  # symbols & pictographs
                            "\U0001F680-\U0001F6FF"  # transport & map symbols
                            "\U0001F700-\U0001F77F"  # alchemical symbols
                            "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                            "\U0001FA00-\U0001FAFF"  # Chess Symbols, Symbols and Pictographs Extended-A
                            "\U00002702-\U000027B0"  # Dingbats
                            # NOTE(review): this last range is very wide and also
                            # covers many non-emoji code points (e.g. CJK text).
                            "\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE),
    }

    if pattern_type not in patterns:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError; fail early with a clear message instead.
        raise ValueError(
            f"Unknown pattern_type {pattern_type!r}; expected one of {sorted(patterns)}"
        )

    # Compile once rather than once per text (a compiled pattern passes through unchanged)
    regex = re.compile(patterns[pattern_type])
    return sum(1 for text in texts if regex.search(text))

In [47]:
def clean_texts(texts, pattern_type):
    """Return a copy of ``texts`` with every match of a named pattern replaced.

    Args:
        texts: Iterable of strings to clean.
        pattern_type: Name of the cleaning rule. One of 'url', 'username',
            'numeric', 'html_tags', 'newlines', 'punctuation', 'rt',
            'possessive', 'hashtag' or 'haha'.

    Returns:
        A new list of strings. Usernames, newlines and punctuation are
        replaced by a single space (so neighbouring words do not merge),
        laughter tokens are collapsed to 'haha', and every other pattern is
        removed outright.

    Raises:
        ValueError: If ``pattern_type`` is not one of the supported names.
    """
    # pattern_type -> (regex, replacement)
    rules = {
        'url': (r'https?://\S+|www\.\S+', ''),
        # Either a literal @handle or the [USERNAME] placeholder
        'username': (r'(@\S+|\[USERNAME\])', ' '),
        'numeric': (r'\b\d+\b', ''),
        'html_tags': (r'<.*?>+', ''),
        'newlines': (r'[\r\n]', ' '),
        'punctuation': (r'[’—‘`%s]' % re.escape(string.punctuation), ' '),
        'rt': (r'\b(rt|RT)\b', ''),
        'possessive': (r"('|’)s\b", ''),
        'hashtag': (r'#\w+', ''),
        # A standalone token made only of 'h'/'a' that contains "haha"
        # ("hahaha", "ahahah", "hahahaha", ...) collapses to "haha".
        # The previous rule replaced 'haha' with 'haha' and changed nothing.
        # Requiring a whole token keeps ordinary words such as "nakahahawa".
        'haha': (r'\b[ha]*haha[ha]*\b', 'haha'),
    }

    if pattern_type not in rules:
        # Previously an unknown name failed later with an UnboundLocalError
        raise ValueError(
            f"Unknown pattern_type {pattern_type!r}; expected one of {sorted(rules)}"
        )

    pattern, replacement = rules[pattern_type]
    regex = re.compile(pattern)
    return [regex.sub(replacement, text) for text in texts]

In [48]:
# Extra contractions (mostly Tagalog, plus a few English ones) mapped to their
# expanded forms. Keys are lowercase and use the straight apostrophe.
custom_contractions = {
    "gov't": "government",
    "s'ya": "siya",
    "sya": "siya",
    "sa'yo": "sa iyo",
    "ika'y": "ikaw ay",
    "everybody's": "everybody is",
    "mo'ko": "mo ako",
    "ba't": "bakit",
    "sila'y": "sila ay",
    "aba'y": "aba ay",
    "ito'y": "ito ay",
    "mgm't": "management",
    "shut'up": "shut up",
    "you're": "you are",
    "umano'y": "umano ay",
    "kaya't": "kaya at",
    "n'ya": "niya",
    "le'me": "let me",
    # "c'mon" is short for "come on" (it was previously mapped to "common")
    "c'mon": "come on",
    "isa't": "isa at",
    "ako'y": "ako ay",
    "toyo't": "toyo at",
    "na'to": "na ito",
    "n'yo": "niyo"
}

def expand_all_contractions(text, custom_dict):
    """Expand contractions using the `contractions` package, then a custom map.

    Args:
        text: The string to expand.
        custom_dict: Mapping of contraction -> expansion, applied in
            iteration order after the package's own expansion.

    Returns:
        The expanded string.

    Custom entries are matched as whole words only. Plain substring
    replacement rewrote unrelated words (e.g. "sya" inside "kasya", or "ba't"
    inside "aba't"). A straight apostrophe in a key also matches the curly
    apostrophe, so "ba’t" is expanded the same way as "ba't".
    """
    # First, expand using the default package
    expanded_text = contractions.fix(text)

    # Now apply custom contractions, one key at a time
    for key, value in custom_dict.items():
        # Escape each piece of the key and let either apostrophe style join them
        key_regex = r"\b" + "['’]".join(re.escape(part) for part in key.split("'")) + r"\b"
        # A function replacement keeps `value` literal (no backslash/group expansion)
        expanded_text = re.sub(key_regex, lambda _match, value=value: value, expanded_text)
    return expanded_text

def count_contractions(texts):
    """Return the total number of contraction-like tokens across all texts.

    A token counts when it has word characters on both sides of a straight or
    curly apostrophe (e.g. "don't", "ba’t"). Every occurrence is counted, so a
    single text can contribute more than one.
    """
    contraction_regex = re.compile(r"\b\w+['’]\w+\b")
    return sum(len(contraction_regex.findall(text)) for text in texts)

In [49]:
# Regex pattern -> canonical replacement, used to normalise spelling variants
# and obfuscated forms of common profanities.
#
# The obfuscated forms end in '@', which is not a word character, so a trailing
# `\b` could only match when a letter or digit followed immediately (never at
# the end of the text or before a space). `(?!\w)` expresses the intended
# "end of token" check.
#
# NOTE(review): cleaning_pipeline applies these after usernames (`@\S+`) and
# punctuation have been stripped, so the patterns containing '@', '!' or '*'
# cannot match at that point. Apply them earlier if they are meant to fire.
replacements = {
    r'\btang\s+ina\b': 'tangina',
    r'\bwtf\b': 'what the fuck',
    r'\bt@ng@(?!\w)': 'tanga',
    r'\bt@ng!n@(?!\w)': 'tangina',
    r'p\*\*\*\*\* i\*\*': 'tangina',
    r'\bputangina\b': 'tangina',
    r'\bpota\b': 'puta'
}

def replace_custom_words(text, replacements):
    """Apply every regex -> replacement pair to ``text``, ignoring case.

    Pairs are applied one after another in the mapping's iteration order, so
    later patterns see the output of earlier ones.
    """
    result = text
    for regex, substitute in replacements.items():
        result = re.sub(regex, substitute, result, flags=re.IGNORECASE)
    return result

# Function to count occurrences of custom words
def contains_any_pattern(text, patterns):
    """Return True if any key of ``patterns`` matches ``text`` (case-insensitive).

    ``patterns`` is a mapping whose keys are regex strings; the values are ignored.
    """
    return any(re.search(regex, text, re.IGNORECASE) for regex in patterns.keys())

def count_tweets_with_patterns(tweets, patterns):
    """Count the tweets for which contains_any_pattern(tweet, patterns) is true."""
    return sum(1 for tweet in tweets if contains_any_pattern(tweet, patterns))

In [50]:
# Function to read stopwords from file and save into a list
def read_stopwords(file_path):
    """Read a UTF-8 text file and return its lines as a list.

    Each line has its surrounding whitespace stripped; one entry is returned
    per line of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]

# Load the English and Tagalog stopword lists from the configured files
english_stopwords = read_stopwords(STOPWORDS_ENGLISH_PATH)
tagalog_stopwords = read_stopwords(STOPWORDS_TAGALOG_PATH)

# Custom additions, determined from the most common words (and contractions of existing stopwords)
custom_stopwords = [
    'si', 'kay', 'lang', 'yung', 'wag', 'ba', 'yan', 'iyan', 'kayo',
    'pag', 'naman', 'mo', 'niyo', 'nung', 'kang', 'tong', 'nalang',
]

# Words that must never be treated as stopwords
exclude_words = {'not', 'di', 'hindi', 'wala'}

# Union of all three lists, minus the excluded words
combined_stopwords = set().union(
    english_stopwords, tagalog_stopwords, custom_stopwords
).difference(exclude_words)


In [51]:
def remove_stopwords(text, stopwords):
    """Drop every whitespace-separated token of ``text`` found in ``stopwords``.

    The remaining tokens are joined with single spaces. Matching is exact and
    case-sensitive.
    """
    return ' '.join(token for token in text.split() if token not in stopwords)

In [52]:
def remove_duplicates(X, y, split_sizes=None):
    """Drop repeated tweets, keeping only the last occurrence of each.

    The last occurrence is kept (rather than the first) so that as little as
    possible is removed from the validation and test sets, which come after
    the train set in ``X``.

    Args:
        X: List of tweets, ordered train, then validation, then test.
        y: Labels aligned with ``X``.
        split_sizes: Optional ``[train, validation, test]`` sizes of the
            original splits, used only to report how many removals fall in
            each split. When omitted, a module-level ``split_sizes`` variable
            is used if one exists; otherwise only the total is reported.

    Returns:
        A tuple ``(X_unique, y_unique, non_unique_tweets)`` where
        ``non_unique_tweets`` lists each tweet that occurred more than once.

    Note:
        Labels of the dropped occurrences are discarded without checking that
        they agree with the label of the kept occurrence.
    """
    if split_sizes is None:
        # Backward compatibility: this function used to read a notebook-level
        # `split_sizes` variable, which raised NameError when it was not defined.
        split_sizes = globals().get('split_sizes')

    # Map each distinct tweet to every index at which it occurs
    positions_by_tweet = defaultdict(list)
    for idx, tweet in enumerate(X):
        positions_by_tweet[tweet].append(idx)

    repeated = [positions for positions in positions_by_tweet.values() if len(positions) > 1]

    # One representative (the first occurrence) per repeated tweet
    non_unique_tweets = [X[positions[0]] for positions in repeated]

    # Everything except the LAST occurrence is removed. A set gives constant-time
    # membership tests; a list here made the filtering below quadratic.
    duplicate_indices = {idx for positions in repeated for idx in positions[:-1]}

    X_unique = [tweet for idx, tweet in enumerate(X) if idx not in duplicate_indices]
    y_unique = [label for idx, label in enumerate(y) if idx not in duplicate_indices]

    print(f"Removed {len(duplicate_indices)} non-unique tweets.")

    if split_sizes is not None:
        # Attribute each removed index to the split it came from
        train_end = split_sizes[0]
        validation_end = split_sizes[0] + split_sizes[1]
        removed_train_count = sum(1 for idx in duplicate_indices if idx < train_end)
        removed_validation_count = sum(1 for idx in duplicate_indices if train_end <= idx < validation_end)
        removed_test_count = sum(1 for idx in duplicate_indices if validation_end <= idx)

        print("Removed from train set:", removed_train_count)
        print("Removed from validation set:", removed_validation_count)
        print("Removed from test set:", removed_test_count)

    return X_unique, y_unique, non_unique_tweets


In [127]:
def remove_and_count_candidate_names(tweets):
    """Strip candidate names from every tweet and count how many were removed.

    Names are matched as whole words, ignoring case. After removal, runs of
    whitespace are collapsed to one space and the tweet is trimmed.

    Returns:
        A tuple ``(cleaned_tweets, count)`` where ``count`` is the total number
        of name occurrences removed across all tweets.
    """
    candidate_names = (
        "jejomar", "binay", "mar", "roxas", "rodrigo",
        "duterte", "grace", "poe", "miriam", "defensor", "santiago",
    )
    name_regex = re.compile(r'\b(' + '|'.join(candidate_names) + r')\b', re.IGNORECASE)

    cleaned_tweets = []
    total_removed = 0
    for tweet in tweets:
        # subn performs the removal and reports how many matches it replaced
        stripped, removed = name_regex.subn('', tweet)
        total_removed += removed
        cleaned_tweets.append(re.sub(r'\s+', ' ', stripped).strip())

    return cleaned_tweets, total_removed

In [62]:
def cleaning_pipeline(X, Y):
    """Run every text-cleaning step on X in order, then drop duplicate tweets.

    Steps, in order: newlines, URLs, usernames, purely numeric words and HTML
    tags are cleaned via clean_texts; texts are lowercased; 'rt', 'haha'
    variations and hashtags are cleaned via clean_texts; contractions are
    expanded (expand_all_contractions with custom_contractions); possessives
    and punctuation are cleaned via clean_texts; the custom `replacements` are
    applied; stopwords in `combined_stopwords` are removed; finally duplicates
    are removed with remove_duplicates.

    Before/after counts are printed for most steps as a progress report.

    Args:
        X: List of raw text strings.
        Y: Labels aligned with X. Only used by the duplicate-removal step,
           which drops the labels of the removed texts.

    Returns:
        A tuple (X_unique, Y_unique) of cleaned texts and their labels.
    """
    # Remove newlines
    print(f"Text with newlines: {count_texts(X, 'newlines')}")
    X_newlines_removed = clean_texts(X, 'newlines')
    print(f"Removed newlines. New count of text with newlines: {count_texts(X_newlines_removed, 'newlines')}\n")

    # Remove URLs
    print(f"Text with URLs: {count_texts(X_newlines_removed, 'url')}")
    X_url_removed = clean_texts(X_newlines_removed, 'url')
    print(f"Removed URLs. New count of text with URLs: {count_texts(X_url_removed, 'url')}\n")

    # Remove usernames
    print(f"Text with usernames: {count_texts(X_url_removed, 'username')}")
    X_username_removed = clean_texts(X_url_removed, 'username')
    print(f"Removed usernames. New count of text with usernames: {count_texts(X_username_removed, 'username')}\n")

    # Remove words that are completely numbers
    print(f"Text with numeric words: {count_texts(X_username_removed, 'numeric')}")
    X_numeric_removed = clean_texts(X_username_removed, 'numeric')
    print(f"Removed numeric words. New count of text with numeric words: {count_texts(X_numeric_removed, 'numeric')}\n")

    # Remove HTML tags
    print(f"Text with HTML tags: {count_texts(X_numeric_removed, 'html_tags')}")
    X_html_removed = clean_texts(X_numeric_removed, 'html_tags')
    print(f"Removed HTML tags. New count of text with HTML tags: {count_texts(X_html_removed, 'html_tags')}\n")

    # Lowercase all texts (every later step therefore only sees lowercase input)
    X_lowercased = [text.lower() for text in X_html_removed]
    print(f"Converted all texts to lowercase.\n")

    # Remove "rt" from tweets
    print(f"Text with 'RT': {count_texts(X_lowercased, 'rt')}")
    X_rt_removed = clean_texts(X_lowercased, 'rt')
    print(f"Removed 'RT'. New count of text with 'RT': {count_texts(X_rt_removed, 'rt')}\n")

    # Shorten all variations of "haha"
    # (the count printed afterwards is of texts still containing 'haha', so it is not expected to reach 0)
    print(f"Text with variations of 'haha': {count_texts(X_rt_removed, 'haha')}")
    X_haha_removed = clean_texts(X_rt_removed, 'haha')
    print(f"Shortened all variations of 'haha'. New count of text with 'haha': {count_texts(X_haha_removed, 'haha')}\n")

    # Remove hashtags
    print(f"Text with hashtags: {count_texts(X_haha_removed, 'hashtag')}")
    X_hashtags_removed = clean_texts(X_haha_removed, 'hashtag')
    print(f"Removed hashtags. New count of text with hashtags: {count_texts(X_hashtags_removed, 'hashtag')}\n")

    # Expand contractions (must run before punctuation removal, which strips the apostrophes)
    print(f"Identified {count_contractions(X_hashtags_removed)} contractions. Attempting to expand some of them.")
    X_expanded = [expand_all_contractions(text, custom_contractions) for text in X_hashtags_removed]
    print(f"Expanded contractions. Remaining contractions: {count_contractions(X_expanded)} (Some are possessive.)\n")

    # Remove possessives
    print(f"Text with possessives: {count_texts(X_expanded, 'possessive')}")
    X_no_possessives = clean_texts(X_expanded, 'possessive')
    print(f"Removed possessives. New count of text with possessives: {count_texts(X_no_possessives, 'possessive')}\n")

    # Remove punctuations - CAN CONSIDER NOT REMOVING
    print(f"Text with punctuations: {count_texts(X_no_possessives, 'punctuation')}")
    X_punctuation_removed = clean_texts(X_no_possessives, 'punctuation')
    print(f"Removed punctuations. New count of text with punctuations: {count_texts(X_punctuation_removed, 'punctuation')}\n")

    # Replace custom words and phrases
    # NOTE(review): this runs after username and punctuation removal, so the
    # `replacements` patterns that contain '@', '!' or '*' cannot match here.
    print(f"Count of tweets containing words to be replaced: {count_tweets_with_patterns(X_punctuation_removed, replacements)}")
    X_custom_replaced = [replace_custom_words(text, replacements) for text in X_punctuation_removed]
    print(f"Count after custom replacements: {count_tweets_with_patterns(X_custom_replaced, replacements)}\n")

    # Remove stop words - both English and Filipino
    X_no_stopwords = [remove_stopwords(text, combined_stopwords) for text in X_custom_replaced]
    print("Removed stopwords from all texts.\n")
    # return X_no_stopwords, Y

    # Remove non-unique tweets (done last because cleaning can make previously distinct tweets identical)
    X_unique, Y_unique, non_unique_tweets = remove_duplicates(X_no_stopwords, Y)
    return X_unique, Y_unique

    # Optional final step, currently disabled (unreachable after the return above):
    # # Remove candidate names
    # cleaned_tweets, count = remove_and_count_candidate_names(X_unique)
    # print(f"\nRemoved candidate names. Count of removed words: {count}")
    # return cleaned_tweets, Y_unique

Other cleaning steps that can be explored:
- Preserving certain punctuation marks?
- Spelling correction
- Handling slang and abbreviations
- Stemming (reducing words to their root form)?
- Handling emojis and special characters (no emojis were found in the data)

# Data Cleaning

In [63]:
# Clean the full dataset (all splits concatenated) and drop duplicate tweets.
# NOTE(review): X and Y are rebound by several earlier cells (the 2022 data in
# "Load 2022 Dataset", the file-based dataset in "Load Dataset"), so which
# dataset gets cleaned here depends on the order the cells were run. Confirm.
cleaned_X, cleaned_Y = cleaning_pipeline(X,Y)

Text with newlines: 11
Removed newlines. New count of text with newlines: 0

Text with URLs: 32
Removed URLs. New count of text with URLs: 0

Text with usernames: 8661
Removed usernames. New count of text with usernames: 0

Text with numeric words: 1216
Removed numeric words. New count of text with numeric words: 0

Text with HTML tags: 0
Removed HTML tags. New count of text with HTML tags: 0

Converted all texts to lowercase.

Text with 'RT': 696
Removed 'RT'. New count of text with 'RT': 0

Text with variations of 'haha': 1949
Shortened all variations of 'haha'. New count of text with 'haha': 1949

Text with hashtags: 7
Removed hashtags. New count of text with hashtags: 0

Identified 2803 contractions. Attempting to expand some of them.
Expanded contractions. Remaining contractions: 999 (Some are possessive.)

Text with possessives: 761
Removed possessives. New count of text with possessives: 0

Text with punctuations: 18774
Removed punctuations. New count of text with punctuations: 

In [64]:
# Both lengths must match: one label per cleaned text
print(len(cleaned_X), len(cleaned_Y), sep="\n")

26418
26418


In [65]:
# Persist the cleaned (features, labels) pair
file_path = 'cleaned_2022dataset_v1.pkl'
with open(file_path, "wb") as pickle_file:
    pickle.dump((cleaned_X, cleaned_Y), pickle_file)
print(f"Data saved to {file_path}")

Data saved to cleaned_2022dataset_v1.pkl


In [None]:
# Save the cleaned data through the Dataset helper.
# NOTE(review): this cell has no execution count, and it writes
# 'cleaned_dataset_v2.pkl' while the loading cell below reads
# 'cleaned_dataset_v1.pkl'. Confirm which file is intended.
dataset.save_to_file(cleaned_X, cleaned_Y, 'cleaned_dataset_v2.pkl')

# Load Cleaned Dataset from Path

New train-val-test split after data cleaning: 
- Train: 10000-1007 = 8993
- Validation: 4232-275 = 3957
- Test: 4232-133 = 4099

In [166]:
# Load the previously cleaned data; the split sizes are the post-cleaning
# train/validation/test sizes listed in the notes above (they sum to 17049).
cleaned_dataset = Dataset(
    full_data_path='cleaned_dataset_v1.pkl',
    from_scratch=False,
    split_sizes=[8993, 3957, 4099],
)
cleaned_dataset.build()

Data loaded from cleaned_dataset_v1.pkl


In [167]:
# Cleaned features and labels across all splits
X, Y = cleaned_dataset.get_features(), cleaned_dataset.get_labels()

# First five samples and labels, then the size of each list
for preview in (X[:5], Y[:5], len(X), len(Y)):
    print(preview)

['gastos vp binay political ads halos p7 inaasahan vice president jejomar binay taong', 'mar roxas tangina tuwid daan daw eh nga di straight', 'salamat sawang suporta taga makati pagbabalik binay makati', 'tangina binay takbo', 'binay selective amnesia forgetting past six years spent preparing president']
[0, 1, 0, 1, 0]
17049
17049


In [168]:
X_train, Y_train = (
    cleaned_dataset.get_features(split_type="train"),
    cleaned_dataset.get_labels(split_type="train"),
)

# First five samples and labels, then the size of each list
for preview in (X_train[:5], Y_train[:5], len(X_train), len(Y_train)):
    print(preview)

['gastos vp binay political ads halos p7 inaasahan vice president jejomar binay taong', 'mar roxas tangina tuwid daan daw eh nga di straight', 'salamat sawang suporta taga makati pagbabalik binay makati', 'tangina binay takbo', 'binay selective amnesia forgetting past six years spent preparing president']
[0, 1, 0, 1, 0]
8993
8993


In [169]:
X_val, Y_val = (
    cleaned_dataset.get_features(split_type="val"),
    cleaned_dataset.get_labels(split_type="val"),
)

# First five samples and labels, then the size of each list
for preview in (X_val[:5], Y_val[:5], len(X_val), len(Y_val)):
    print(preview)

['hndi one chance saka binay fb haist', 'mar roxas addressing crowd gathered pasay city ulat barangay', 'perfect makaharap duterte binay makatikim mura', 'nating hayaan maloko pulitikong binay', 'regards advertistment binay haha']
[1, 0, 0, 0, 0]
3957
3957


In [170]:
X_test, Y_test = (
    cleaned_dataset.get_features(split_type="test"),
    cleaned_dataset.get_labels(split_type="test"),
)

# First five samples and labels, then the size of each list
for preview in (X_test[:5], Y_test[:5], len(X_test), len(Y_test)):
    print(preview)

['unshaded votes votes mayor duterte goes mar roxas according reports ballot tests', 'well good choices like duterte poe still undecided think', 'nognog pandak laki hirap corrupt binay', 'ex binay aide turns tables mercado', 'bayan muna everydayiloveyou blogcon tomiho momentwithaimi andiloveyouso kenzo abby binay pht']
[1, 0, 1, 0, 0]
4099
4099
