In [1]:
import requests
import pandas as pd
import random
import nltk
import regex as re
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.naive_bayes import MultinomialNB

In [2]:
# read each source file and tag every row with the name of the file it came from
diet = pd.read_csv('diet.csv').assign(label='diet')
love = pd.read_csv('love.csv').assign(label='love')

# stack both sources, keep only the text and its label,
# and discard rows that have no text at all
data = (
    pd.concat([diet, love], sort=False)
    .loc[:, ['selftext', 'label']]
    .dropna(subset=['selftext'])
)

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1818 entries, 0 to 961
Data columns (total 2 columns):
selftext    1818 non-null object
label       1818 non-null object
dtypes: object(2)
memory usage: 42.6+ KB


In [3]:
# encode the text labels as integers: diet -> 0, love -> 1
label_codes = {'diet': 0, 'love': 1}
data['label'] = data['label'].map(label_codes)
data.head()

Unnamed: 0,selftext,label
0,Hi I’m a female (22 year old) i wouldn’t call ...,0
1,"Hi,\n\nI'm an 18-year-old female. When I was 1...",0
2,I know a lot of losing weight is what you eat ...,0
3,Here some backstory. I’m 17 and about 100 poun...,0
5,I just can never get satisfied until i feel full,0


In [4]:
# Clean the raw text in one pass and assign the result back to the column.
# Calling replace(..., inplace=True) on data['selftext'] is an in-place call on a
# chained selection: pandas 2.x emits a FutureWarning for it, and with Copy-on-Write
# enabled it no longer updates `data` at all. Explicit assignment works everywhere.
data['selftext'] = (
    data['selftext']
    # remove line splitters and slashes (each becomes a single space)
    .replace(to_replace=r'[\n/]', value=' ', regex=True)
    # change to lower case
    .str.lower()
    # remove punctuation (anything that is neither a word character nor whitespace)
    .replace(to_replace=r'[^\w\s]', value='', regex=True)
)

In [5]:
# split every text into a list of word tokens with NLTK's word_tokenize
tokenized_text = data['selftext'].apply(word_tokenize)
data['selftext'] = tokenized_text

In [6]:
#remove stop words
# `add` holds the custom words dropped in addition to NLTK's English stop words.
add = ['im','hi','ive','hey','wouldnt','love','diet']
to_remove = (stopwords.words("english"))+add

# Build the lookup once as a set: membership tests are then constant-time instead
# of scanning the whole list for every token of every text. `to_remove` itself
# stays a list so anything that displays it keeps its original ordering.
stop_lookup = set(to_remove)

# NOTE(review): punctuation was stripped from the texts before this step, so stop
# words that contain an apostrophe (e.g. "don't") cannot match tokens such as
# "dont" - confirm whether those spellings should be filtered as well.
data['selftext'] = data['selftext'].apply(
    lambda x: [item for item in x if item not in stop_lookup]
)

In [25]:
# show the combined stop-word list (NLTK English stop words plus the custom additions)
print(to_remove)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
# collapse each token list back into a single space-separated string
joined_text = data['selftext'].apply(lambda tokens: ' '.join(tokens))
data['selftext'] = joined_text
data['selftext'].head()

0    female 22 year old call overweight 54 kg heigh...
1    18yearold female 15 half started counting calo...
2    know lot losing weight eat much exactly sure e...
3    backstory 17 100 pounds overweight currently g...
5                        never get satisfied feel full
Name: selftext, dtype: object

In [8]:
# lemmatize words
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize ``text`` and return its tokens lemmatized.

    The string is split with ``word_tokenize`` and every token is passed
    through the module-level ``lemmatizer``. The lemmas are returned as a
    list in the same order as the tokens.
    """
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

lemmatized_text = data['selftext'].apply(lemmatize_text)
data['selftext'] = lemmatized_text
data

Unnamed: 0,selftext,label
0,"[female, 22, year, old, call, overweight, 54, ...",0
1,"[18yearold, female, 15, half, started, countin...",0
2,"[know, lot, losing, weight, eat, much, exactly...",0
3,"[backstory, 17, 100, pound, overweight, curren...",0
5,"[never, get, satisfied, feel, full]",0
6,"[barely, underweight, looking, good, help, pac...",0
7,"[small, change, made, really, easy, stick, rea...",0
8,"[eating, healthy, 6, month, gained, around, 45...",0
11,"[17f, wanting, change, there, point, eating, c...",0
12,"[beginning, involves, counting, calorie, using...",0


In [9]:
# baseline accuracy: the share of each class; the larger share is the score
# obtained by always predicting the majority label
class_shares = data['label'].value_counts(normalize=True)
class_shares

0    0.514851
1    0.485149
Name: label, dtype: float64

In [10]:
# turn the lemma lists back into space-separated strings
rejoined_text = data['selftext'].apply(lambda lemmas: ' '.join(lemmas))
data['selftext'] = rejoined_text

In [11]:
# the 20 most frequent words among the 'diet' texts (label 0)
diet_text = data.loc[data['label'] == 0, 'selftext']
diet_words = diet_text.str.split(expand=True).stack()
diet_words.value_counts().head(20)

eat        727
weight     692
day        687
food       474
like       461
eating     459
calorie    412
meal       363
get        363
time       341
fat        332
dont       327
week       322
would      293
help       281
know       278
lose       263
also       260
want       260
much       260
dtype: int64

In [12]:
# the 20 most frequent words among the 'love' texts (label 1)
love_text = data.loc[data['label'] == 1, 'selftext']
love_words = love_text.str.split(expand=True).stack()
love_words.value_counts().head(20)

like       1043
know        903
time        802
feel        746
dont        731
want        724
one         617
never       604
thing       523
life        512
day         501
would       485
make        473
much        470
even        457
get         455
friend      453
feeling     452
really      449
someone     426
dtype: int64

In [13]:
# features are the cleaned texts, the target is the 0/1 label
X, y = data['selftext'], data['label']

# shuffled, label-stratified split with a fixed seed so it is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [14]:
# Tfidf features fed into a Multinomial Naive Bayes classifier
tfidf_nb_steps = [('tvec', TfidfVectorizer()), ('cls', MultinomialNB())]
pipe = Pipeline(steps=tfidf_nb_steps)

In [15]:
# search for the best Tfidf vectorizer settings with 3-fold cross-validation
pipe_params = dict(
    tvec__max_features=[2500, 3000, 3500],
    tvec__min_df=[0.02, 0.04, 0.06],
    tvec__max_df=[.9, .95],
    tvec__ngram_range=[(1, 1), (1, 2), (1, 3), (1, 4)],
)
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)

# best mean cross-validated score, followed by the settings that achieved it
print(gs.best_score_)
gs.best_params_

0.9757887013939839


{'tvec__max_df': 0.9,
 'tvec__max_features': 2500,
 'tvec__min_df': 0.02,
 'tvec__ngram_range': (1, 1)}

In [16]:
# score of the best estimator found by the grid search, on the training split
train_score = gs.score(X_train, y_train)
train_score

0.97505502567865

In [17]:
# score of the best estimator found by the grid search, on the held-out test split
test_score = gs.score(X_test, y_test)
test_score

0.967032967032967

In [29]:
# raw term counts fed into a logistic regression (liblinear solver)
cvec_lr_steps = [
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(solver='liblinear')),
]
pipe = Pipeline(steps=cvec_lr_steps)

In [30]:
# search for the best count vectorizer settings with 3-fold cross-validation
pipe_params = dict(
    cvec__max_features=[2500, 3000, 3500],
    cvec__min_df=[0.02, 0.04, 0.06],
    cvec__max_df=[.9, .95],
    cvec__ngram_range=[(1, 1), (1, 2), (1, 3), (1, 4)],
)
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)

# best mean cross-validated score, followed by the settings that achieved it
print(gs.best_score_)
gs.best_params_

0.9581804842259721


{'cvec__max_df': 0.9,
 'cvec__max_features': 2500,
 'cvec__min_df': 0.02,
 'cvec__ngram_range': (1, 1)}

In [31]:
# score of the best estimator found by the grid search, on the training split
train_score = gs.score(X_train, y_train)
train_score

0.9904622157006603

In [32]:
# score of the best estimator found by the grid search, on the held-out test split
test_score = gs.score(X_test, y_test)
test_score

0.9538461538461539