# Krystian Gronek & Katarzyna Piotrowska
# Text Mining and Social Media Mining, final project - Analyzing men and women comments using NLP methods

# Loading packages and data

In [66]:
%matplotlib inline 

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
#from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

men = pd.read_csv('data/final_askmen.csv', sep = ';')
women = pd.read_csv('data/final_askwomen.csv', sep = ';')

In [67]:
# add categorical variable that distincs weather a observation comes from /r/AskMen subreddit or /r/AskWomen subreddit 
men['subreddit'] = np.repeat("askmen", len(men))
women['subreddit'] = np.repeat("askwomen", len(women))

# merge two datasets into one
df = pd.concat([men, women], axis = 0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30364 entries, 0 to 14362
Data columns (total 24 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   username                          30364 non-null  object 
 1   com_original                      30364 non-null  object 
 2   cleaned                           30364 non-null  object 
 3   cleaned_wo_sw                     30364 non-null  object 
 4   tokenized                         30364 non-null  object 
 5   stemmed                           30364 non-null  object 
 6   tokenized_wo_sw                   30364 non-null  object 
 7   submission_title                  30364 non-null  object 
 8   submission_title_cleaned          30364 non-null  object 
 9   submission_title_cleaned_wo_sw    30364 non-null  object 
 10  submission_title_tokenized        30364 non-null  object 
 11  submission_title_stemmed          30364 non-null  object 
 12  subm

# Z GITHUBA (usunac to jak juz wybiore)

# Categorization of comments according to subreddit 

In [85]:
comments_cleaned = df['cleaned']
subreddits = df['subreddit']

# Vectorization
# count how many times does a word occur in each message, term frequency
# weigh the counts, so that frequent tokens get lower weight, inverse document frequency
# normalize the vectors to unit length, to abstract from the original text length, L2 norm 
cv = CountVectorizer().fit(comments_cleaned);
X = cv.transform(comments_cleaned);

# the bag-of-words counts for the entire SMS corpus
print('Shape of Sparse Matrix: ',X.shape)
print('Amount of non-zero occurences:',X.nnz)

# Sparsity
sparsity =(100.0 * X.nnz/(X.shape[0]*X.shape[1]))
print('Sparsity: {}'.format(round(sparsity)),"%")
# THE SPARSITY IS equal to 0 % which is good.

# Term weighting and normalization with TF-IDF
tfidf_transformer=TfidfTransformer().fit(X)
X_tfidf = tfidf_transformer.transform(X)
print(tfidf_transformer)
print(X_tfidf.shape)

# split the data into train and test parts
comment_train, comment_test, subreddit_train, subreddit_test = train_test_split(X, subreddits, test_size=0.2, random_state = 9);

# Naive Bayes Classifier
model = MultinomialNB();
model.fit(comment_train,subreddit_train);
model.score(comment_test,subreddit_test);

# what is the quality of our model?
all_predictions = model.predict(X_tfidf)
print(all_predictions)

# Accuracy of our Model - train data
print("Accuracy of Model - train data", model.score(comment_train,subreddit_train)*100,"%")

# Accuracy of our Model - test data
print("Accuracy of Model - test data", model.score(comment_test,subreddit_test)*100,"%")

Shape of Sparse Matrix:  (30364, 29942)
Amount of non-zero occurences: 767382
Sparsity: 0 %
TfidfTransformer()
(30364, 29942)


MultinomialNB()

0.7711180635600198

['askmen' 'askmen' 'askmen' ... 'askwomen' 'askwomen' 'askwomen']
Accuracy of Model - train data 84.41809723766004 %
Accuracy of Model - test data 77.11180635600198 %


In [81]:
print(classification_report(df['subreddit'],all_predictions))
print(confusion_matrix(df['subreddit'],all_predictions))

              precision    recall  f1-score   support

      askmen       0.83      0.87      0.85     16001
    askwomen       0.85      0.81      0.83     14363

    accuracy                           0.84     30364
   macro avg       0.84      0.84      0.84     30364
weighted avg       0.84      0.84      0.84     30364

[[13997  2004]
 [ 2782 11581]]


# Predicting from which subreddit does a comment come from

In [22]:
sample_verse2 = ["Fear thou not; for I am with thee: be not dismayed; for I am thy God: I will strengthen thee; yea, I will help thee; yea, I will uphold thee with the right hand of my righteousness."]
vect2 = cv.transform(sample_verse2).toarray()
clf.predict(vect2)


array(['askmen'], dtype='<U8')

# Saving the Model

In [14]:
#from sklearn.externals import joblib
#biblepredictionNV_model = open("biblepredictionNV_model.pkl","wb")
#joblib.dump(clf,biblepredictionNV_model)
#biblepredictionNV_model.close()


# Z TWD (usunac to jak juz wybiore)