# Krystian Gronek & Katarzyna Piotrowska
# Text Mining and Social Media Mining, final project - Analyzing men and women comments using NLP methods

# Loading packages and data

In [1]:
%matplotlib inline 

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
#from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

men = pd.read_csv('data/final_askmen.csv', sep = ';')
women = pd.read_csv('data/final_askwomen.csv', sep = ';')

In [2]:
# add categorical variable that distincs weather a observation comes from /r/AskMen subreddit or /r/AskWomen subreddit 
men['subreddit'] = np.repeat("askmen", len(men))
women['subreddit'] = np.repeat("askwomen", len(women))

# merge two datasets into one
df = pd.concat([men, women], axis = 0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30809 entries, 0 to 14637
Data columns (total 23 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   username                          30809 non-null  object 
 1   com_original                      30809 non-null  object 
 2   cleaned                           30809 non-null  object 
 3   cleaned_wo_sw                     30809 non-null  object 
 4   tokenized                         30809 non-null  object 
 5   stemmed                           30809 non-null  object 
 6   tokenized_wo_sw                   30809 non-null  object 
 7   submission_title                  30809 non-null  object 
 8   submission_title_cleaned          30809 non-null  object 
 9   submission_title_cleaned_wo_sw    30809 non-null  object 
 10  submission_title_tokenized        30809 non-null  object 
 11  submission_title_stemmed          30809 non-null  object 
 12  subm

# Categorization of comments according to subreddit 

In [3]:
comments_cleaned = df['cleaned']
subreddits = df['subreddit']

# Vectorization
# count how many times does a word occur in each message, term frequency
# weigh the counts, so that frequent tokens get lower weight, inverse document frequency
# normalize the vectors to unit length, to abstract from the original text length, L2 norm 
cv = CountVectorizer().fit(comments_cleaned);
X = cv.transform(comments_cleaned);

# the bag-of-words counts for the entire SMS corpus
print('Shape of Sparse Matrix: ',X.shape)
print('Amount of non-zero occurences:',X.nnz)

# Sparsity
sparsity =(100.0 * X.nnz/(X.shape[0]*X.shape[1]))
print('Sparsity: {}'.format(round(sparsity)),"%")
# THE SPARSITY IS equal to 0 % which is good.

# Term weighting and normalization with TF-IDF
tfidf_transformer=TfidfTransformer().fit(X)
X_tfidf = tfidf_transformer.transform(X)
print(tfidf_transformer)
print(X_tfidf.shape)

# split the data into train and test parts
comment_train, comment_test, subreddit_train, subreddit_test = train_test_split(X_tfidf, subreddits, test_size=0.2, random_state = 9);

# Naive Bayes Classifier
model = MultinomialNB();
model.fit(comment_train,subreddit_train);
model.score(comment_test,subreddit_test);

# what is the quality of our model?
all_predictions = model.predict(X_tfidf)
print(all_predictions)

# Accuracy of our Model - train data
print("Accuracy of Model - train data", model.score(comment_train,subreddit_train)*100,"%")

# Accuracy of our Model - test data
print("Accuracy of Model - test data", model.score(comment_test,subreddit_test)*100,"%")

Shape of Sparse Matrix:  (30809, 30197)
Amount of non-zero occurences: 781735
Sparsity: 0 %
TfidfTransformer()
(30809, 30197)


MultinomialNB()

0.776209023044466

['askmen' 'askmen' 'askmen' ... 'askwomen' 'askwomen' 'askwomen']
Accuracy of Model - train data 86.61906114334403 %
Accuracy of Model - test data 77.6209023044466 %


In [4]:
df

Unnamed: 0,username,com_original,cleaned,cleaned_wo_sw,tokenized,stemmed,tokenized_wo_sw,submission_title,submission_title_cleaned,submission_title_cleaned_wo_sw,...,comment_score,submission_ups,minmax,minmax_grouped,sentiment,comments_polarity,comments_predicted_sentiment,overestimated,underestimated,subreddit
0,8483,Thank fuck... So many great posts buried under...,thank fuck so many great posts buried under id...,thank fuck many great posts buried idiotic kar...,"['thank', 'fuck', 'so', 'many', 'great', 'post...",thank fuck mani great post buri idiot karma wh...,"['thank', 'fuck', 'many', 'great', 'posts', 'b...",BONK! Overly sexual questions are no longer al...,bonk overly sexual questions are no longer all...,bonk overly sexual questions longer allowed,...,11,13949,0.535714,0.235294,positive,-0.0571,negative,0,1,askmen
1,Zeezprahh,"Well fuck me and suck me sideways, it's a deal!",well fuck me and suck me sideways its a deal,well fuck suck sideways deal,"['well', 'fuck', 'me', 'and', 'suck', 'me', 's...",well fuck suck sideway deal,"['well', 'fuck', 'suck', 'sideways', 'deal']",BONK! Overly sexual questions are no longer al...,bonk overly sexual questions are no longer all...,bonk overly sexual questions longer allowed,...,8,13949,0.428571,0.058824,positive,-0.6486,negative,0,1,askmen
2,skinny_gator,I'm dying over here lmao\n\nThis is amazing. A...,im dying over here lmao this is amazing and ye...,im dying lmao amazing yes ask men straight bec...,"['im', 'dying', 'over', 'here', 'lmao', 'this'...",im die lmao amaz ye ask men straight becom nsf...,"['im', 'dying', 'lmao', 'amazing', 'yes', 'ask...",BONK! Overly sexual questions are no longer al...,bonk overly sexual questions are no longer all...,bonk overly sexual questions longer allowed,...,18,13949,0.785714,0.647059,positive,0.9062,positive,0,0,askmen
3,dyslexicbunny,Oh good. I got tired of reading all that crap....,oh good i got tired of reading all that crap l...,oh good got tired reading crap lets talk comfy...,"['oh', 'good', 'i', 'got', 'tired', 'of', 'rea...",oh good got tire read crap let talk comfi hell...,"['oh', 'good', 'got', 'tired', 'reading', 'cra...",BONK! Overly sexual questions are no longer al...,bonk overly sexual questions are no longer all...,bonk overly sexual questions longer allowed,...,19,13949,0.821429,0.705882,positive,-0.8020,negative,0,1,askmen
4,BantyRed,I thought it was just me. I joined right befor...,i thought it was just me i joined right before...,thought joined right horny came figured horny ...,"['i', 'thought', 'it', 'was', 'just', 'me', 'i...",thought join right horni came figur horni peopl,"['thought', 'joined', 'right', 'horny', 'came'...",BONK! Overly sexual questions are no longer al...,bonk overly sexual questions are no longer all...,bonk overly sexual questions longer allowed,...,6,13949,0.357143,-0.058824,positive,0.0000,negative,0,1,askmen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14633,nocturnal_confidant,\nI understand the question. But I am going tr...,i understand the question but i am going try t...,understand question going try longer compare r...,"['i', 'understand', 'the', 'question', 'but', ...",understand question go tri longer compar right...,"['understand', 'question', 'going', 'try', 'lo...",What age do you consider to be “in your prime”?,what age do you consider to be in your prime,age consider prime,...,1,373,-0.076923,-0.785714,negative,-0.3940,negative,0,0,askwomen
14634,D-Spornak,I'm hoping it's coming up soon.,im hoping its coming up soon,im hoping coming soon,"['im', 'hoping', 'its', 'coming', 'up', 'soon']",im hope come soon,"['im', 'hoping', 'coming', 'soon']",What age do you consider to be “in your prime”?,what age do you consider to be in your prime,age consider prime,...,1,373,-0.076923,-0.785714,negative,0.4215,positive,1,0,askwomen
14635,Non-Priority-98,Hmmmm I would think that it is in stable times...,hmmmm i would think that it is in stable times...,hmmmm would think stable times short substanti...,"['hmmmm', 'i', 'would', 'think', 'that', 'it',...",hmmmm would think stabl time short substanti e...,"['hmmmm', 'would', 'think', 'stable', 'times',...",What age do you consider to be “in your prime”?,what age do you consider to be in your prime,age consider prime,...,1,373,-0.076923,-0.785714,negative,0.9136,positive,1,0,askwomen
14636,Irinakusx,20 it is advisable to take care of your health...,it is advisable to take care of your health at...,advisable take care health time always active ...,"['it', 'is', 'advisable', 'to', 'take', 'care'...",advis take care health time alway activ desir,"['advisable', 'take', 'care', 'health', 'time'...",What age do you consider to be “in your prime”?,what age do you consider to be in your prime,age consider prime,...,1,373,-0.076923,-0.785714,negative,0.8020,positive,1,0,askwomen


In [5]:
print(classification_report(df['subreddit'],all_predictions))
print(confusion_matrix(df['subreddit'],all_predictions))

              precision    recall  f1-score   support

      askmen       0.85      0.86      0.86     16171
    askwomen       0.85      0.83      0.84     14638

    accuracy                           0.85     30809
   macro avg       0.85      0.85      0.85     30809
weighted avg       0.85      0.85      0.85     30809

[[13942  2229]
 [ 2448 12190]]


# Predicting from which subreddit does a post come from

In [6]:
sample_verse2 = ["Fear thou not; for I am with thee: be not dismayed; for I am thy God: I will strengthen thee; yea, I will help thee; yea, I will uphold thee with the right hand of my righteousness."]
vect2 = cv.transform(sample_verse2).toarray()
model.predict(vect2)


array(['askmen'], dtype='<U8')

In [7]:
#from sklearn.externals import joblib
#biblepredictionNV_model = open("biblepredictionNV_model.pkl","wb")
#joblib.dump(clf,biblepredictionNV_model)
#biblepredictionNV_model.close()
