In [159]:
import pandas as pd
import numpy as np
import string
import re

# NLTK Imports
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin


In [160]:
# Loading in dataset
df = pd.read_csv('data/sentiment_tweets3.csv')

In [161]:
# Previewing dataset
df

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
...,...,...,...
10309,802309,No Depression by G Herbo is my mood from now o...,1
10310,802310,What do you do when depression succumbs the br...,1
10311,802311,Ketamine Nasal Spray Shows Promise Against Dep...,1
10312,802312,dont mistake a bad day with depression! everyo...,1


In [162]:
# Dropping last row because the entry is empty
df = df.drop(10313)

In [163]:
# Dropping index column
df = df.drop(columns = ['Index'], axis = 1)

In [164]:
# Renaming columns
df = df.rename(columns = {'message to examine':'tweet', 'label (depression result)': 'depression'})

In [165]:
# Looking at missing values
df.isna().sum()

tweet         0
depression    0
dtype: int64

In [166]:
# Looking at distribution of target
df['depression'].value_counts(normalize = True)

0    0.77572
1    0.22428
Name: depression, dtype: float64

In [167]:
# Train test split
X = df['tweet']
y = df['depression']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [168]:
# stopwords_list = stopwords.words('english')

In [169]:
# stopwords_list += list(string.punctuation)

In [170]:
# stopwords_list += ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [171]:
# class TextPreprocessor(BaseEstimator, TransformerMixin):
    
#     def __init__(self):
        
#         pass
    
#     def fit(self, data, y = 0):
        
#         return self
    
#     def transform(self, data, y = 0):
#         fully_normalized_corpus = data.apply(self.process_doc)
        
#         return fully_normalized_corpus
        
    
#     def process_doc(self, doc):

#         #initialize lemmatizer
#         wnl = WordNetLemmatizer()
#         stop_words = stopwords.words('english')
        
#         # helper function to change nltk's part of speech tagging to a wordnet format.
#         def pos_tagger(nltk_tag):
#             if nltk_tag.startswith('J'):
#                 return wordnet.ADJ
#             elif nltk_tag.startswith('V'):
#                 return wordnet.VERB
#             elif nltk_tag.startswith('N'):
#                 return wordnet.NOUN
#             elif nltk_tag.startswith('R'):
#                 return wordnet.ADV
#             else:         
#                 return None


#         # remove stop words and punctuations, then lower case
#         doc_norm = [tok.lower() for tok in word_tokenize(doc) if ((tok.isalpha()) & (tok not in stop_words)) ]

#         # creates list of tuples with tokens and POS tags in wordnet format
#         wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(doc_norm))) 
#         doc_norm = [wnl.lemmatize(token, pos) for token, pos in wordnet_tagged if pos is not None]

#         return " ".join(doc_norm)

In [172]:
# proc = TextPreprocessor()

In [173]:
# transformed_train = proc.fit_transform(X_train) 

In [174]:
# prc_steps = [('countvec', CountVectorizer())]
# preprocess_pipeline = Pipeline(prc_steps)

In [175]:
# X_tr_proc = preprocess_pipeline.fit_transform(transformed_train)

In [176]:
# X_tr_proc

In [177]:
# feat_names = preprocess_pipeline[
#     'countvec'].get_feature_names()


In [178]:
# df_pre = pd.DataFrame(X_tr_proc.toarray(), columns = feat_names)


In [179]:
X_train = X_train.str.lower()

In [180]:
X_train

7152    the of end the new jonas was amazing... aw :'(...
6849    is on a secret mission......  sshh  don't tell...
4047             @jdjacinto smoking lang. relaks, dearie 
6025    was just woken up by home bible sellers. i don...
2913    ashton kutchner (or however it's spelt) 'tweet...
                              ...                        
5734                  off work! going to watch twilight! 
5191                     asta a fost un examen pe cinste 
5390                        @frozenblueeyes my pleasure! 
860     shopping for brunch in the pouring rain. i lov...
7270      bout to go to my psychology exam! wish me luck 
Name: tweet, Length: 7734, dtype: object

In [181]:
basic_token_pattern = r"(?u)\b\w\w+\b"

tokenizer = RegexpTokenizer(basic_token_pattern)

X_train = X_train.apply(tokenizer.tokenize)

In [182]:
X_train

7152    [the, of, end, the, new, jonas, was, amazing, ...
6849    [is, on, secret, mission, sshh, don, tell, the...
4047           [jdjacinto, smoking, lang, relaks, dearie]
6025    [was, just, woken, up, by, home, bible, seller...
2913    [ashton, kutchner, or, however, it, spelt, twe...
                              ...                        
5734              [off, work, going, to, watch, twilight]
5191                 [asta, fost, un, examen, pe, cinste]
5390                       [frozenblueeyes, my, pleasure]
860     [shopping, for, brunch, in, the, pouring, rain...
7270    [bout, to, go, to, my, psychology, exam, wish,...
Name: tweet, Length: 7734, dtype: object

In [183]:
stopwords_list = stopwords.words('english')

In [184]:
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [185]:
def remove_stopwords(token_list):
    
    stopwords_removed = [token for token in token_list if token not in stopwords_list]
    return stopwords_removed


In [186]:
X_train = X_train.apply(remove_stopwords)

In [187]:
X_train = pd.DataFrame(X_train)

In [188]:
X_train

Unnamed: 0,tweet
7152,"[end, new, jonas, amazing, aw, yep, dats, tear..."
6849,"[secret, mission, sshh, tell, kids]"
4047,"[jdjacinto, smoking, lang, relaks, dearie]"
6025,"[woken, home, bible, sellers, need, bible, 10,..."
2913,"[ashton, kutchner, however, spelt, tweets, muc..."
...,...
5734,"[work, going, watch, twilight]"
5191,"[asta, fost, un, examen, pe, cinste]"
5390,"[frozenblueeyes, pleasure]"
860,"[shopping, brunch, pouring, rain, love, gf, wa..."


In [189]:
# Import the relevant vectorizer class
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate a vectorizer with max_features=10
# (we are using the default token pattern)
tfidf = TfidfVectorizer(max_features=10)

# Fit the vectorizer on X_train["text"] and transform it
X_train_vectorized = tfidf.fit_transform(X_train["tweet"])

# Visually inspect the vectorized data
pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())

AttributeError: 'list' object has no attribute 'lower'

In [190]:
tfidf = TfidfVectorizer(
    max_features=10,
    stop_words=stopwords_list
)

# Fit the vectorizer on X_train["text"] and transform it
X_train_vectorized = tfidf.fit_transform(X_train)

# Visually inspect the vectorized data
pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())

Unnamed: 0,tweet
0,1.0
