# Initial Setup

In [1]:
import numpy as np
import pandas as pd
import seaborn
import matplotlib.pyplot as plt
!pip install wordcloud
from wordcloud import WordCloud
import nltk
import nltk.corpus
from nltk.tokenize import (word_tokenize,
                           sent_tokenize,
                           TreebankWordTokenizer,
                           wordpunct_tokenize,
                           TweetTokenizer,
                           SpaceTokenizer,
                           MWETokenizer)
nltk.download('punkt_tab')
from google.colab import drive
drive.mount('/content/drive/')
import re # Regular Expression
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import gensim.downloader as gensim_api



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Mounted at /content/drive/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Load the twitter set and test if it works

In [2]:
# To load the example data set, not that you might need to change the file path to where you save the tweeter_training.csv. The data set is also avaiable on canvas for download
tw_df = pd.read_csv('/content/drive/MyDrive/2024 Spring/Text Mining/TurnInFiles/Project1/tweeter_training.csv', encoding='ISO-8859-1', header=None)
# Add column names
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
tw_df.columns = column_names
tw_df = shuffle(tw_df, random_state=24).reset_index(drop=True)
tw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   ids     1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [3]:
tw_df.head(25)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1991032647,Mon Jun 01 06:16:02 PDT 2009,NO_QUERY,LeilaLooo,"just saw Angels and Demons last night, quality..."
1,0,1976257495,Sat May 30 16:56:01 PDT 2009,NO_QUERY,ddwalker,aaaaaaaaaaaaaand she's gone.
2,0,2228371488,Thu Jun 18 14:21:27 PDT 2009,NO_QUERY,PandaChanda,Getting ready to go to my high school's gradua...
3,4,2176174571,Mon Jun 15 03:07:39 PDT 2009,NO_QUERY,ElizabethChingo,Bed time. He has a name. Ryan
4,0,1559252321,Sun Apr 19 10:39:39 PDT 2009,NO_QUERY,drummer_dan,Senses that all is over for the quintuple chan...
5,0,1978951926,Sun May 31 01:03:16 PDT 2009,NO_QUERY,janine_j9,@TimothyH2O theyre hellllllllllllla good! yes ...
6,4,2191045455,Tue Jun 16 04:23:22 PDT 2009,NO_QUERY,jennyrevelle,@AdamRPhoto ...which was tiring but awesome! H...
7,0,2321691453,Wed Jun 24 21:25:28 PDT 2009,NO_QUERY,SallytheShizzle,@BrittGoosie yeah i wouldn't advice you to get...
8,4,2054438913,Sat Jun 06 07:21:13 PDT 2009,NO_QUERY,AlbertoConde777,Christ alone can bring lasting peace - peace w...
9,0,2189574450,Tue Jun 16 00:21:38 PDT 2009,NO_QUERY,ilove_joe_jonas,@jonasbrothers http://twitpic.com/7gowf - LOL ...


# Make Preprocessor

In [4]:
def my_preprocessor(text):
  """
  Parameters:
    text: (str)

  Changes:
    Converts text to lowercase
    Removed users (@_____)
    Removed links (http_____, https______)
    Removed numbers
    Removed stop words (english and spanish)
    Removed punctuation
    Lematizes and # Corrects spelling errors

  """

  lemmatizer = WordNetLemmatizer()

  # Makes text lowercase
  text = text.lower()

  # Remove users
  text = re.sub(r'@\w+', '', text)

  # Remove links
  text = re.sub(r'http\S+|https\S+', '', text)

  # Remove numbers
  text = re.sub(r'\d+', '', text)

  # Fix spelling errors
  # text = str(TextBlob(text).correct())

  # Split text into words
  tokens = word_tokenize(text)

  # Define stop words
  stop_words = set(stopwords.words('english')) | set(stopwords.words('spanish'))

  # Removes stopwords, punctuation, contractions, and dots
  stopunct_tokens = []
  for token in tokens:
    if token not in stop_words and token not in string.punctuation and token not in ["..", "...", "....", "....."]:
            if not re.match(r"'\w+", token) and not re.match(r"n't", token):
                stopunct_tokens.append(token)

  # Apply lemmatization
  lemma_tokens = []
  for token in stopunct_tokens:
    lemma_tokens.append(lemmatizer.lemmatize(token))

  # Make one string again
  text_processed = ' '.join(lemma_tokens)

  return text_processed

In [5]:
# Test sample text on my_preprocessor
sample_tw = tw_df.sample(10).text.values
for tweet in sample_tw:
    print(my_preprocessor(tweet))

time difference mean missed happy birthday anyway
use opera
creeping bed miss poppy pjs
running suck chat
awe man tan
ewwww perforated eardrummm dr appt later ughhh
mariner game awesome bad lost
nine arm suck god mean
miss year meet soon thought day milkado like pocky
honey bee house cross-pollinating


In [6]:
# Run tweets on my_preprocessor (MAY TAKE UP TO A MINUTE)
tweets = tw_df["text"][:50000]
labels = tw_df["target"][:50000]
processed_tweets = [my_preprocessor(tweet) for tweet in tweets]


# TF-IDF (with Naive Bayes and Random Forest)

In [7]:
proc_tfidf = processed_tweets[:50000]
labels_tfidf = labels[:50000]

# Split the data
tweets_train, tweets_test, y_train, y_test = train_test_split(proc_tfidf, labels_tfidf, test_size=0.2, random_state=10)

# Apply TF-IDF
vect_tfidf = TfidfVectorizer(use_idf=True)
matr_tfidf_train = vect_tfidf.fit_transform(tweets_train)
matr_tfidf_test  = vect_tfidf.transform(tweets_test)

print(matr_tfidf_train.todense())
print('\n')
print(matr_tfidf_test.todense())



[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Naive Bayes

In [8]:
model = MultinomialNB()
model.fit(matr_tfidf_train, y_train)
model_test = model.predict(matr_tfidf_test)
print(f'TF-IDF on Naive Bayes gives {accuracy_score(y_test, model_test)*100:.2f}%')

TF-IDF on Naive Bayes gives 73.40%


## Random Forest

In [9]:
model = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs = -1)
model.fit(matr_tfidf_train, y_train)
model_test = model.predict(matr_tfidf_test)
print(f'TF-IDF on Random Forest gives {accuracy_score(y_test, model_test)*100:.2f}%')

TF-IDF on Random Forest gives 73.24%


# Word Embedding (with Naive Bayes and Random Forest)

In [10]:
proc_we = processed_tweets[:50000]
labels_we = labels[:50000]

# Download pre-trained model
pretained_word_embedding_models = list(gensim_api.info()['models'].keys())
word_embedding_model = gensim_api.load('glove-wiki-gigaword-300')



In [11]:
def embed_tweet(tweet):
  words = tweet.split() # Already tokenized via my_preprocessor
  word_vectors = []

  for word in words:
    if word in word_embedding_model:
      word_vectors.append(word_embedding_model[word])

  if len(word_vectors) == 0:
    return(np.zeros(300))

  return np.mean(word_vectors, axis = 0)

embedded_tweets = []
for tweet in proc_we:
  embedded_tweets.append(embed_tweet(tweet))

embedded_tweets = np.array(embedded_tweets)

In [12]:
tweets_train, tweets_test, y_train, y_test = train_test_split(embedded_tweets, labels_we, test_size=0.2, random_state=10)

## Naive Bayes

In [13]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

# model = MultinomialNB()
model.fit(tweets_train, y_train)
model_test = model.predict(tweets_test)
print(f'Word Embedding on Naive Bayes gives {accuracy_score(y_test, model_test)*100:.2f}%')

Word Embedding on Naive Bayes gives 60.25%


## Random Forest

In [14]:
model = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs = -1)
model.fit(tweets_train, y_train)
model_test = model.predict(tweets_test)
print(f'Word Embedding on Random Forest gives {accuracy_score(y_test, model_test)*100:.2f}%')

Word Embedding on Random Forest gives 69.39%
