In [1]:
import pandas as pd
import numpy as np
import string
import re

# NLTK Imports
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
# from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from sklearn.base import BaseEstimator, TransformerMixin 

In [2]:
# Loading in dataset
df = pd.read_csv('data/sentiment_tweets3.csv')

In [3]:
# Previewing dataset
df

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
...,...,...,...
10309,802309,No Depression by G Herbo is my mood from now o...,1
10310,802310,What do you do when depression succumbs the br...,1
10311,802311,Ketamine Nasal Spray Shows Promise Against Dep...,1
10312,802312,dont mistake a bad day with depression! everyo...,1


In [4]:
# Dropping last row because the entry is empty
df = df.drop(10313)

In [5]:
# Dropping index column
df = df.drop(columns = ['Index'], axis = 1)

In [6]:
# Renaming columns
df = df.rename(columns = {'message to examine':'tweet', 'label (depression result)': 'depression'})

In [7]:
# Looking at missing values
df.isna().sum()

tweet         0
depression    0
dtype: int64

In [8]:
# Looking at distribution of target
df['depression'].value_counts(normalize = True)

0    0.77572
1    0.22428
Name: depression, dtype: float64

In [9]:
# Train test split
X = df['tweet']
y = df['depression']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [10]:
X_train

7152    the of end the new JONAS was amazing... aw :'(...
6849    is on a secret mission......  sshh  don't tell...
4047             @jdjacinto smoking lang. relaks, dearie 
6025    Was just woken up by home bible sellers. I don...
2913    Ashton Kutchner (or however it's spelt) 'tweet...
                              ...                        
5734                  Off work! going to watch twilight! 
5191                     asta a fost un examen pe cinste 
5390                        @frozenblueeyes my pleasure! 
860     Shopping for brunch in the pouring rain. I lov...
7270      Bout to go to my psychology exam! Wish me luck 
Name: tweet, Length: 7734, dtype: object

### Creating a Stemming and Tokenizing Function

In [11]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language="english")
stopwords_list = stopwords.words('english')


from nltk.tokenize import RegexpTokenizer
basic_token_pattern = r"(?u)\b\w\w+\b"
tokenizer = RegexpTokenizer(basic_token_pattern)

In [12]:
def stem_and_tokenize(document):
    tokens = tokenizer.tokenize(document)
    return [stemmer.stem(token) for token in tokens]

In [13]:
stemmed_stopwords = [stemmer.stem(word) for word in stopwords_list]

### Count Vectorizer and Random Forest

In [14]:
countvectorizer = CountVectorizer(
                                  stop_words=stemmed_stopwords, 
                                  tokenizer=stem_and_tokenize)

In [15]:
X_train_count_vec = countvectorizer.fit_transform(X_train)

In [16]:
# Visually inspect the vectorized train data
# pd.DataFrame.sparse.from_spmatrix(X_train_count_vec, columns=countvectorizer.get_feature_names())

In [17]:
X_test_count_vec = countvectorizer.transform(X_test)

In [18]:
# Visually inspect the vectorized test data
# pd.DataFrame.sparse.from_spmatrix(X_test_count_vec, columns=countvectorizer.get_feature_names())

In [19]:
# Instantiate and fit a Random Forest model
rfc = RandomForestClassifier(random_state = 42)
rfc.fit(X_train_count_vec, y_train)

RandomForestClassifier(random_state=42)

In [20]:
y_preds_rfc = rfc.predict(X_test_count_vec)

In [21]:
recall_score(y_test, y_preds_rfc)

0.9882154882154882

In [22]:
# rfc.score(X_test_count_vec, y_test)

### TF-IDF Vectorizer and Random Forest

In [23]:
# Instantiate the vectorizer
tfidf = TfidfVectorizer(
    stop_words=stemmed_stopwords,
    tokenizer=stem_and_tokenize
)

In [24]:
# Fit the vectorizer on X_train and transform it
X_train_tfidf_vec = tfidf.fit_transform(X_train)

In [25]:
# Visually inspect the vectorized train data
# pd.DataFrame.sparse.from_spmatrix(X_train_tfidf_vec, columns=tfidf.get_feature_names())

In [26]:
# Transform the test data
X_test_tfidf_vec = tfidf.transform(X_test)

In [27]:
# Visually inspect the vectorized test data
# pd.DataFrame.sparse.from_spmatrix(X_test_tfidf_vec, columns=tfidf.get_feature_names())

In [28]:
# from sklearn.model_selection import cross_val_score
# from sklearn.naive_bayes import MultinomialNB


# # Instantiate a MultinomialNB classifier
# baseline_model = MultinomialNB()
# stemmed_cv = cross_val_score(baseline_model, X_train_vectorized, y_train)
# stemmed_cv

In [29]:
# Code here to instantiate and fit a Random Forest model
rfc_2 = RandomForestClassifier(random_state = 42)
rfc_2.fit(X_train_tfidf_vec, y_train)

RandomForestClassifier(random_state=42)

In [30]:
y_preds_rfc2 = rfc_2.predict(X_test_tfidf_vec)

In [31]:
recall_score(y_test, y_preds_rfc2)

0.98989898989899

In [32]:
# rfc_2.score(X_test_tfidf_vec, y_test)