In [344]:
import pandas as pd
import numpy as np
import string
import re

# NLTK Imports
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import recall_score


In [345]:
# Loading in dataset: read the tweets CSV (path is relative to the working
# directory the notebook is run from) into a DataFrame
df = pd.read_csv('data/sentiment_tweets3.csv')

In [346]:
# Previewing dataset (a bare `df` renders a truncated view: head and tail rows)
df

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
...,...,...,...
10309,802309,No Depression by G Herbo is my mood from now o...,1
10310,802310,What do you do when depression succumbs the br...,1
10311,802311,Ketamine Nasal Spray Shows Promise Against Dep...,1
10312,802312,dont mistake a bad day with depression! everyo...,1


In [347]:
# Dropping the last row (index label 10313) because the entry is empty.
# errors='ignore' keeps the cell re-runnable: `df` is reassigned from itself, so
# a second execution would otherwise raise KeyError on the already-removed label.
df = df.drop(index=10313, errors='ignore')

In [348]:
# Dropping the 'Index' column. `columns=` already selects the axis, so the
# redundant `axis=1` is removed; errors='ignore' keeps the cell re-runnable
# once the column is gone.
df = df.drop(columns=['Index'], errors='ignore')

In [349]:
# Renaming the verbose source headers to short column names
df = df.rename(
    columns={
        'message to examine': 'tweet',
        'label (depression result)': 'depression',
    }
)

In [350]:
# Looking at missing values: count of NaN entries per column
df.isna().sum()

tweet         0
depression    0
dtype: int64

In [351]:
# Looking at distribution of target: normalize=True reports class proportions
# rather than raw counts
df['depression'].value_counts(normalize = True)

0    0.77572
1    0.22428
Name: depression, dtype: float64

In [352]:
# Train test split: the tweet text is the feature, the depression label the target
X, y = df['tweet'], df['depression']

# Fixed random_state so the same partition is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [353]:
# stopwords_list = stopwords.words('english')

In [354]:
# stopwords_list += list(string.punctuation)

In [355]:
# stopwords_list += ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [356]:
# class TextPreprocessor(BaseEstimator, TransformerMixin):
    
#     def __init__(self):
        
#         pass
    
#     def fit(self, data, y = 0):
        
#         return self
    
#     def transform(self, data, y = 0):
#         fully_normalized_corpus = data.apply(self.process_doc)
        
#         return fully_normalized_corpus
        
    
#     def process_doc(self, doc):

#         #initialize lemmatizer
#         wnl = WordNetLemmatizer()
#         stop_words = stopwords.words('english')
        
#         # helper function to change nltk's part of speech tagging to a wordnet format.
#         def pos_tagger(nltk_tag):
#             if nltk_tag.startswith('J'):
#                 return wordnet.ADJ
#             elif nltk_tag.startswith('V'):
#                 return wordnet.VERB
#             elif nltk_tag.startswith('N'):
#                 return wordnet.NOUN
#             elif nltk_tag.startswith('R'):
#                 return wordnet.ADV
#             else:         
#                 return None


#         # remove stop words and punctuations, then lower case
#         doc_norm = [tok.lower() for tok in word_tokenize(doc) if ((tok.isalpha()) & (tok not in stop_words)) ]

#         # creates list of tuples with tokens and POS tags in wordnet format
#         wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(doc_norm))) 
#         doc_norm = [wnl.lemmatize(token, pos) for token, pos in wordnet_tagged if pos is not None]

#         return " ".join(doc_norm)

In [357]:
# proc = TextPreprocessor()

In [358]:
# transformed_train = proc.fit_transform(X_train) 

In [359]:
# prc_steps = [('countvec', CountVectorizer())]
# preprocess_pipeline = Pipeline(prc_steps)

In [360]:
# X_tr_proc = preprocess_pipeline.fit_transform(transformed_train)

In [361]:
# X_tr_proc

In [362]:
# feat_names = preprocess_pipeline[
#     'countvec'].get_feature_names()


In [363]:
# df_pre = pd.DataFrame(X_tr_proc.toarray(), columns = feat_names)
# X_train = pd.DataFrame(X_train)
# X_test = pd.DataFrame(X_test)

In [364]:
# X_train['tweet'] = X_train['tweet'].str.lower()

In [365]:
# basic_token_pattern = r"(?u)\b\w\w+\b"

# tokenizer = RegexpTokenizer(basic_token_pattern)

# X_train['tweet'] = X_train['tweet'].apply(tokenizer.tokenize)

In [366]:
# stopwords_list = stopwords.words('english')

In [367]:
# stopwords_list

In [368]:
# def remove_stopwords(token_list):
    
#     stopwords_removed = [token for token in token_list if token not in stopwords_list]
#     return stopwords_removed


In [369]:
# X_train['tokenized_without_stopwords'] = X_train['tweet_tokenized'].apply(remove_stopwords)
# X_train['tweet'] = X_train['tweet'].apply(remove_stopwords)

In [370]:
# Inspecting the training tweets produced by the split
X_train

7152    the of end the new JONAS was amazing... aw :'(...
6849    is on a secret mission......  sshh  don't tell...
4047             @jdjacinto smoking lang. relaks, dearie 
6025    Was just woken up by home bible sellers. I don...
2913    Ashton Kutchner (or however it's spelt) 'tweet...
                              ...                        
5734                  Off work! going to watch twilight! 
5191                     asta a fost un examen pe cinste 
5390                        @frozenblueeyes my pleasure! 
860     Shopping for brunch in the pouring rain. I lov...
7270      Bout to go to my psychology exam! Wish me luck 
Name: tweet, Length: 7734, dtype: object

In [371]:
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer shared by the tokenizer helper and the stop-word list
stemmer = SnowballStemmer("english")

In [372]:
# Word tokenizer used by stem_and_tokenize: keeps runs of two or more word
# characters. Defined here because its only other assignment in the notebook is
# commented out, which left `tokenizer` undefined on a fresh kernel.
tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")


def stem_and_tokenize(document):
    """Split a document into tokens and stem each one.

    Parameters
    ----------
    document : str
        Text of one document.

    Returns
    -------
    list of str
        Stemmed tokens, in the order they appear in the document.
    """
    return [stemmer.stem(token) for token in tokenizer.tokenize(document)]

In [373]:
# English stop words from NLTK. Defined here because every other assignment of
# `stopwords_list` in the notebook is commented out, which left the name
# undefined on a fresh kernel.
stopwords_list = stopwords.words('english')

# Stem the stop words with the same stemmer applied to the tokens so they can
# match the stemmed vocabulary
stemmed_stopwords = [stemmer.stem(word) for word in stopwords_list]

In [381]:
# Inspecting the stemmed stop words
stemmed_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'our',
 'ourselv',
 'you',
 "you'r",
 "you'v",
 "you'll",
 "you'd",
 'your',
 'your',
 'yourself',
 'yourselv',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'she',
 'her',
 'her',
 'herself',
 'it',
 'it',
 'it',
 'itself',
 'they',
 'them',
 'their',
 'their',
 'themselv',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'be',
 'have',
 'has',
 'had',
 'have',
 'do',
 'doe',
 'did',
 'do',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'becaus',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'dure',
 'befor',
 'after',
 'abov',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'onc',
 'here',
 'there',
 'when',
 'where',
 'whi',
 'how',
 'all',
 'ani',
 'both',
 'each',
 'few',
 'more',
 'most',
 'o

In [375]:
# Instantiate the vectorizer: cap the vocabulary at 50 terms, tokenize and stem
# with stem_and_tokenize, and drop the stemmed stop words
tfidf = TfidfVectorizer(
    max_features=50,
    stop_words=stemmed_stopwords,
    tokenizer=stem_and_tokenize
)

# Fit the vectorizer on X_train (a Series of tweets) and transform it
X_train_vectorized = tfidf.fit_transform(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back otherwise
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Visually inspect the vectorized data
pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=feature_names)

Unnamed: 0,amp,anxieti,back,com,come,day,depress,emoji,face,feel,...,thing,think,time,today,twitter,want,watch,well,work,www
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.625069,0.0,0.597482,0.0
7730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0
7731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0
7732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.598932,0.0,0.0,0.000000,0.0,0.000000,0.0


In [376]:
# Transform the test tweets with the vectorizer already fitted on the training
# set (transform only, no refitting)
X_test_vectorized = tfidf.transform(X_test)

In [377]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back otherwise
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Visually inspect the vectorized test data
pd.DataFrame.sparse.from_spmatrix(X_test_vectorized, columns=feature_names)

Unnamed: 0,amp,anxieti,back,com,come,day,depress,emoji,face,feel,...,thing,think,time,today,twitter,want,watch,well,work,www
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.519358,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.069913,0.684985,0.717831,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2574,0.0,0.0,0.0,0.0,0.0,0.000000,0.296339,0.000000,0.608533,0.541496,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2575,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2576,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2577,0.0,0.0,0.0,0.0,0.0,0.579577,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [378]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Baseline: cross-validated scores for a default Multinomial Naive Bayes
# classifier on the TF-IDF training matrix
baseline_model = MultinomialNB()
stemmed_cv = cross_val_score(
    estimator=baseline_model,
    X=X_train_vectorized,
    y=y_train,
)
stemmed_cv

array([0.98060763, 0.98125404, 0.97220427, 0.96832579, 0.97606727])

In [379]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate and fit a Random Forest model on the TF-IDF training matrix.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest and its score are reproducible across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_vectorized, y_train)

RandomForestClassifier()

In [380]:
# Scoring the forest on the held-out test set (a classifier's `score` reports
# mean accuracy).
# NOTE(review): the target is imbalanced (~78% / 22%), so accuracy alone may look
# optimistic — consider also reporting recall or a classification_report.
rfc.score(X_test_vectorized, y_test)

0.9957347809228383

In [None]:
# # Import the relevant vectorizer class
# from sklearn.feature_extraction.text import TfidfVectorizer

# # Instantiate a vectorizer with max_features=10
# # (we are using the default token pattern)
# tfidf = TfidfVectorizer(max_features=50)

# # Fit the vectorizer on X_train["text"] and transform it
# X_train_tfidf = tfidf.fit_transform(X_train['tweet'])

# # Visually inspect the vectorized data
# pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())
# Variant of the vectorizer with the vocabulary capped at 10 terms.
# NOTE(review): this reassigns `tfidf` and `X_train_vectorized`, replacing the
# 50-feature versions fitted earlier; re-run the test transform before reusing
# X_test_vectorized with a model trained on this matrix.
tfidf = TfidfVectorizer(
    max_features=10,
    stop_words=stemmed_stopwords,
    tokenizer=stem_and_tokenize
)

# Fit the vectorizer on X_train and transform it. X_train is a Series of tweet
# strings, so it is passed directly; indexing it with "text" raised KeyError.
X_train_vectorized = tfidf.fit_transform(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back otherwise
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Visually inspect the vectorized data
pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=feature_names)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer fitted on the training tweets. X_train is a Series of
# tweet strings, so it is passed directly; indexing it with 'tweet' raised
# KeyError.
countvectorizer = CountVectorizer()
countvectorizer.fit(X_train)

In [None]:
# Encode both splits with the vocabulary learned from the training tweets
X_train_vec, X_test_vec = (
    countvectorizer.transform(split) for split in (X_train, X_test)
)

In [None]:
# Inspecting the count-vectorized training data
X_train_vec

In [None]:
# Importing the classifier...
from sklearn.ensemble import RandomForestClassifier

# Instantiate and fit a Random Forest model on the count-vectorized tweets.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest is reproducible across runs.
# NOTE(review): this rebinds `rfc`, replacing the forest trained on the TF-IDF
# features earlier in the notebook.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_vec, y_train)

In [None]:
# tfidf = TfidfVectorizer(
#     max_features=10,
#     stop_words=stopwords_list
# )

# # Fit the vectorizer on X_train (a Series of tweets) and transform it
# X_train_vectorized = tfidf.fit_transform(X_train)

# # Visually inspect the vectorized data
# pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())