In [44]:
import numpy as np
import pandas as pd
import scipy.sparse

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [45]:
# The CSV appears to have been saved together with its row index, which otherwise
# comes back as a spurious "Unnamed: 0" column; read that first column as the index.
data_frame = pd.read_csv("QuoraWithTopic.csv", encoding="utf-8", index_col=0)
data_frame.head()

Unnamed: 0,question,Topic number,Topic desc
0,What is the step by step guide to invest in sh...,4,Finance
1,Why am I mentally very lonely? How can I solve...,6,horoscopes
2,Astrology: I am a Capricorn Sun Cap moon and c...,3,Sport
3,How do I read and find my YouTube comments?,0,Education
4,What Game of Thrones villain would be the most...,7,Environment


In [46]:
# Bag-of-words features for the questions.
#   max_df=0.90 -> ignore terms that occur in more than 90% of the questions
#                  (a float is a proportion of documents, an int an absolute count)
#   min_df=4    -> ignore terms that occur in fewer than 4 questions
count_vectorizer = CountVectorizer(max_df=0.90, min_df=4, stop_words="english")
doc_term_matrix = count_vectorizer.fit_transform(data_frame["question"])

# Vocabulary size. `vocabulary_` is used instead of `get_feature_names()`, which was
# deprecated in scikit-learn 1.0 and removed in 1.2; it also avoids materialising
# the full list of feature names just to count them.
len(count_vectorizer.vocabulary_)


17135

In [47]:
# `doc_term_matrix` was already built when the vectorizer was fitted in the previous
# cell; refitting on the same column would only repeat that work, so only the
# label column is extracted here.
target_topic = data_frame["Topic number"]

### Training Data
The training data consists of 70% of the questions (roughly **140,000**), each labelled with one of eight topic numbers (0–7).
### Test Data
The test data consists of the remaining 30% of the questions (**60,000**), held out for evaluating the classifier.

**NOTE** : The execution may be slow because of the huge dataset for training the text classifier


In [48]:
# Hold out 30% of the questions for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    doc_term_matrix,
    target_topic,
    test_size=0.3,
    random_state=1,
)

### Classifier 
A `MultinomialNB` classifier is created and fitted on the training data.

In [49]:
# Fit a multinomial Naive Bayes model on the training split. `fit` returns the
# estimator itself, so the chained call binds the same fitted object.
mnc_classifier = MultinomialNB().fit(X_train, y_train)
mnc_classifier

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Prediction
The model is now ready to predict the topic of an unseen question. The held-out test questions are fed into the classifier to obtain a predicted topic number for each.

In [50]:
# Predicted topic number for every question in the held-out split.
mnc_model_predictions = mnc_classifier.predict(X=X_test)

## Confusion Matrix
To evaluate the question-topic classifier we use a confusion matrix: rows are the true topic numbers and columns are the predicted ones.

In [51]:
# Rows are the true topic numbers, columns the predicted ones.
topic_confusion = metrics.confusion_matrix(
    y_true=y_test,
    y_pred=mnc_model_predictions,
)
print(topic_confusion)

[[5387  241  175  411  227  135  197  157]
 [ 227 5551  182  368  133  230  205  186]
 [ 181  128 6540  333  155  155  176  178]
 [  92  108  146 8550  140  144  253  122]
 [ 156   71   98  208 6341  148  180  133]
 [ 152  152  160  331  166 5984  202  159]
 [ 184  175  170  530  231  262 5590  205]
 [ 123  209  226  250  176  223  177 5215]]


In [52]:
# Per-topic precision, recall and F1 on the held-out split.
evaluation_report = metrics.classification_report(
    y_true=y_test,
    y_pred=mnc_model_predictions,
)
print(evaluation_report)

              precision    recall  f1-score   support

           0       0.83      0.78      0.80      6930
           1       0.84      0.78      0.81      7082
           2       0.85      0.83      0.84      7846
           3       0.78      0.89      0.83      9555
           4       0.84      0.86      0.85      7335
           5       0.82      0.82      0.82      7306
           6       0.80      0.76      0.78      7347
           7       0.82      0.79      0.81      6599

    accuracy                           0.82     60000
   macro avg       0.82      0.82      0.82     60000
weighted avg       0.82      0.82      0.82     60000

