In [16]:
# Data Manipulation
import pandas as pd
import numpy as np

# Checkpoints
import pickle
from tqdm.notebook import trange, tqdm

# Preprocessing
import re    # RegEx for removing non-letter characters
import nltk  #natural language processing
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import sklearn.preprocessing as pr


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliorenteria/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# LOADING DATASET

In [2]:
# Read the tweet dataset from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='df_1245.csv')

In [3]:
# Preview the first five rows.
# NOTE(review): the 'Unnamed: 0' column is presumably a row index that was
# written out with the CSV -- confirm before relying on it.
data.head(n=5)

Unnamed: 0,text,sentiment
0,TRENDING: New Yorkers encounter empty supermar...,-1
1,When I couldn't find hand sanitizer at Fred Me...,1
2,Find out how you can protect yourself and love...,1
3,#Panic buying hits #NewYork City as anxious sh...,-1
4,#toiletpaper #dunnypaper #coronavirus #coronav...,0


# TOKENIZING

Separate each tweet into a list of words (or components) that carry meaning.

Removing/converting the following elements:
- Uppercase
- Urls
- Symbols
- Numbers
- English Stopwords
- Stemming

In [4]:
# Function to format, tokenize and remove stopwords from tweets.

# Patterns are compiled once here instead of on every call.
# A link is either something that starts with an explicit scheme / "www.",
# or a bare domain immediately followed by a path (e.g. "bit.ly/abc").
# Requiring one of those markers keeps ordinary text such as "paper.next"
# (a missing space after a full stop) from being deleted as if it were a URL.
_URL_RE = re.compile(
    r"(?:https?://|www\.)\S+"
    r"|\b[a-z0-9-]+(?:\.[a-z0-9-]+)*\.[a-z]{2,6}/\S*"
)
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9]")
_DIGITS_RE = re.compile(r"[0-9]+")

# Built lazily on the first call and then reused for every tweet.
_STOP_WORDS = None
_STEMMER = None


def _tweet_resources():
    """Return the shared (stop-word set, stemmer) pair, building it on first use."""
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        # A set gives constant-time membership tests; the original re-read the
        # stop-word list and scanned it linearly for every single token.
        _STOP_WORDS = frozenset(stopwords.words("english"))
        _STEMMER = PorterStemmer()
    return _STOP_WORDS, _STEMMER


def tweet_to_words(tweet):
    ''' Convert tweet text into a sequence of stemmed words.

    Steps, in order: lowercase, strip links, replace every non-alphanumeric
    character with a space, delete digits (so "covid19" becomes "covid"),
    split on whitespace, drop English stop words, apply Porter stemming.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell) is treated as an empty tweet.

    Returns
    -------
    list of str
        The stemmed tokens in their original order; empty if nothing is left.
    '''
    # Missing text would otherwise crash on .lower().
    if not isinstance(tweet, str):
        return []

    stop_words, stemmer = _tweet_resources()

    # convert to lowercase
    text = tweet.lower()
    # remove links
    text = _URL_RE.sub(" ", text)
    # remove non letters (digits are kept for one more step on purpose)
    text = _NON_ALNUM_RE.sub(" ", text)
    # remove numbers without inserting a space, so "9news" -> "news"
    text = _DIGITS_RE.sub("", text)
    # tokenize, remove stopwords and apply stemming
    return [stemmer.stem(word) for word in text.split() if word not in stop_words]


In [19]:
# Sanity check: show one tweet before and after tweet_to_words.
sel = 4
sample_tweet = data['text'][sel]
print("\nORIGINAL TWEET ->\n", sample_tweet)
print("\nPROCESSED TWEET ->\n", tweet_to_words(sample_tweet))



ORIGINAL TWEET ->
 #toiletpaper #dunnypaper #coronavirus #coronavirusaustralia #CoronaVirusUpdate #Covid_19 #9News  #Corvid19 #7NewsMelb #dunnypapergate #Costco    One week everyone buying baby milk powder the next everyone buying up toilet paper. https://t.co/ScZryVvsIh

PROCESSED TWEET ->
 ['toiletpap', 'dunnypap', 'coronaviru', 'coronavirusaustralia', 'coronavirusupd', 'covid', 'news', 'corvid', 'newsmelb', 'dunnypaperg', 'costco', 'one', 'week', 'everyon', 'buy', 'babi', 'milk', 'powder', 'next', 'everyon', 'buy', 'toilet', 'paper']


In [None]:
# Tokenize every tweet in the 'text' column, showing a progress bar while it runs.
X = [tweet_to_words(tweet) for tweet in tqdm(data['text'])]

HBox(children=(FloatProgress(value=0.0, max=239252.0), HTML(value='')))

In [None]:
# Apply tokenization to each tweet and store as X.
# This rebuilds the same list as the progress-bar loop in the previous cell
# (same function, same column), just without the progress bar.
X = [tweet_to_words(text) for text in data['text']]


## 🚩 CHECKPOINT 

Saving the **Tokenized List** as a Pickle file so it can be retrieved later, saving memory and time.

In [None]:
# Saving X and Y to PICKLE files to retrieve them back later:

# Y was dumped below without being defined in any earlier cell, so a fresh
# kernel hit a NameError here. Build it from the same DataFrame X came from so
# labels stay aligned row-for-row with the tokenized tweets.
# NOTE(review): assumes the labels are the raw values of the 'sentiment'
# column (-1 / 0 / 1 in the preview) -- confirm no extra encoding was intended.
Y = data['sentiment'].to_numpy()

# Code to Save PICKLE
with open('tokenized_tweets.pkl', 'wb') as f:
    pickle.dump(X, f)
with open('sentiments.pkl', 'wb') as f:
    pickle.dump(Y, f)


In [None]:
# Restore the checkpoint: tokenized tweets into X, their labels into Y.
# pickle.load executes whatever the file contains, so only load files that
# were produced by this notebook.
with open('tokenized_tweets.pkl', 'rb') as tokens_file:
    X = pickle.load(tokens_file)

with open('sentiments.pkl', 'rb') as labels_file:
    Y = pickle.load(labels_file)


# TRAIN TEST SPLIT

In [None]:
# Hold out 30% of the tweets for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=13,
)

# Report how many tweets ended up in each set.
print(f'Number of tweets in the total set :    {len(X)}')
print(f'Number of tweets in the training set : {len(X_train)}')
print(f'Number of tweets in the testing set :  {len(X_test)}')


# BAG OF WORDS


In [None]:
# Convert a collection of text documents to a matrix of token counts

# Cap on the vocabulary: max_features keeps only the most frequent terms.
# NOTE(review): the earlier note said "Rounded up from 17422"; 17422 is
# presumably the size of the full vocabulary, so 5000 is a truncation, not a
# rounding -- confirm.
vocabulary_size = 5000


def _identity(tokens):
    """Return the already-tokenized tweet unchanged."""
    return tokens


# Generating Bag of Words
# Tweets have already been preprocessed, so a pass-through function is used for
# both the preprocessor and the tokenizer step. A named function is used rather
# than a lambda because lambdas cannot be pickled, which would stop the fitted
# vectorizer from being checkpointed like the other artifacts in this notebook.
# token_pattern=None states that the default regex is unused and avoids the
# warning scikit-learn emits when a custom tokenizer is supplied alongside it.
count_vector = CountVectorizer(
    max_features=vocabulary_size,
    preprocessor=_identity,
    tokenizer=_identity,
    token_pattern=None,
)
# Learn the vocabulary from the training split only, so nothing leaks from test.
count_vector.fit(X_train)


In [None]:
# CountVectorizer creates a vocabulary (term -> column index). Checking Vocabulary
vocabulary = count_vector.vocabulary_
print('VOCABULARY SIZE: ', len(vocabulary))

# Printing the whole mapping floods the notebook with thousands of entries,
# so only a short preview (lowest column indices first) is shown.
preview = sorted(vocabulary.items(), key=lambda item: item[1])[:20]
print('VOCABULARY CONTENT (first 20 terms by column index): ',
      {term: int(index) for term, index in preview})


In [None]:
# Turn the training and testing tweets into dense bag-of-words count matrices,
# using the vocabulary learned by count_vector.
# NOTE(review): .toarray() materialises every zero. With the full 239,252-tweet
# set and a 5000-term vocabulary this is a very large allocation -- consider
# keeping the sparse result of transform() if downstream code accepts it.
X_train_bow, X_test_bow = (
    count_vector.transform(split).toarray() for split in (X_train, X_test)
)


## Experiments from here on out
