# Testing with Jupyter Notebook

just use this notebook to test out models, we'll compile it all together once everything works

In [None]:
# import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from typing import List

nltk.download('stopwords')
nltk.download('punkt_tab')

In [3]:
# hyper-parameters


In [12]:
# get all the data necessary and split into input and expected output
train = pd.read_csv('./datasets/train.csv')
valid = pd.read_csv('./datasets/valid.csv')
test = pd.read_csv('./datasets/test.csv')

train_X = train.pop('text').values.tolist()
train_y = train['label'].values.tolist()
valid_X = valid.pop('text').values.tolist()
valid_y = valid['label'].values.tolist()
test_X = test.pop('text').values.tolist()
test_y = test['label'].values.tolist()

In [5]:
# looking at the data
print("Total Amount of Examples: ")
print(f" - Training Data: {np.shape(train)[0]}")
print(f" - Validation Data: {np.shape(valid)[0]}")
print(f" - Testing Data: {np.shape(test)[0]}")

print("---")

print("Labels for each Dataset")

sarcasm_df = pd.DataFrame({
    'Training': pd.Series(train_y['label']).value_counts().sort_index(),
    'Validation': pd.Series(valid_y['label']).value_counts().sort_index(),
    'Testing': pd.Series(test_y['label']).value_counts().sort_index()
})

print(sarcasm_df)

Total Amount of Examples: 
 - Training Data: 21464
 - Validation Data: 716
 - Testing Data: 966
---
Labels for each Dataset
       Training  Validation  Testing
label                               
0         11248         360      526
1         10216         356      440


## probable models so we can get good points for the competition
- SVMs
- RNNs
- ensemble all of this together (voting classifier)

In [None]:
# preprocessing (get rid of punctuation and stop words and split all phrases into a list of words)
# QUICK NOTE: NLTP REMOVES 'NOT' 'NOR' AND ALL OTHER "NEGATIVE" WORDS
def preprocess_words(phrases: List[str]) -> List[str]:
    new_phrases = []
    stop_words = set(stopwords.words('english'))
    punctuation = ["'", ",", ]
    
    for phrase in phrases:
        phrase = re.sub(r'[^\w\s\!\?\"\'\.\,]', '', phrase)
        word_tokens = word_tokenize(phrase.lower())
        
        filtered_tokens = [word for word in word_tokens if word not in stop_words]
                
        new_phrases.append(filtered_tokens)
    
    return new_phrases

In [None]:
phrases = preprocess_words(valid_X)
print(phrases)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fieryhacker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/fieryhacker/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


[['prejudice', 'discriminate'], ['entire', 'house', 'implicated', 'phish', 'poster'], ['lustful', 'man', 'sensually', 'uses', 'one', 'hand', 'unhook', 'clasp', 'takeout', 'box'], ['area', 'man', 'gets', 'terrible', 'creative', 'juices', 'flowing'], ['college', 'graduate', 'first', 'person', 'family', 'waste', '160,000'], ['woman', 'gets', 'several', 'job', 'offers', 'handing', 'resumes', 'side', 'road'], ['japan', 'calls', "'world", 'without', 'nuclear', 'weapons', "'", 'hiroshima', 'bombing', 'anniversary'], ['green', 'party', 'official', 'caught', 'embezzling', 'campaign', 'funds', 'dime', 'bag'], ['obama', 'hosts', 'annual', 'ramadan', 'iftar', 'dinner', 'white', 'house'], ['beat', 'winter', 'blues', ',', 'according', 'top', 'experts'], ['denny', "'s", 'introduces', "'just", 'humongous', 'bucket', 'eggs', 'meat', "'"], ['inside', 'stars', 'wearing', 'terrible', 'movie', "'s", 'gala', 'premiere'], ['trump', 'claims', 'substantial', 'portions', 'u.s.mexico', 'laser', 'forcefield', 'al