### Importing libraries

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.externals import joblib

In [190]:
data = pd.read_csv("Data/reddit-top-flairs-cleaned.csv")

In [191]:
data.head()

Unnamed: 0,id,url,title,body,flair,dirty_title,dirty_body,num_words_title,num_words_body,num_unique_words_title,num_unique_words_body,num_chars_title,num_chars_body
0,g2ct57,https://www.reddit.com/r/india/comments/g2ct57...,polite request indians,nt know situation countries india seen lot org...,Politics,A polite request to all Indians here,I don't know if it is the same situation in ot...,3,109,3,83,22,757
1,futac9,https://www.reddit.com/r/india/comments/futac9...,pitting community political party fucking stupid,first let start saying stupid whatever muslims...,Politics,Pitting a community against a political party ...,First of all let me start by saying it was stu...,6,157,6,119,49,1067
2,ff8sth,https://i.redd.it/yjo9wpy38el41.jpg,new political party gave full front page ad po...,nil,Politics,A new political party gave a full front page a...,NIL,12,1,12,1,73,3
3,fpaj1w,https://theprint.in/india/hit-by-backlash-over...,hit backlash posts lack medical gear doctors g...,nil,Politics,Hit by backlash over posts on lack of medical ...,NIL,26,1,23,1,191,3
4,fxs1vy,https://www.timesnownews.com/india/article/pol...,politics time corona wb cm questions centre s ...,nil,Politics,Politics in the time of corona: WB CM question...,NIL,15,1,15,1,84,3


### Dropping unwanted columns and shuffling the data.

In [192]:
data.drop(['dirty_title','dirty_body','num_words_body','num_unique_words_title'
           ,'num_unique_words_body','num_chars_title','num_words_title','num_chars_body','url','id'], axis=1)

data.sample(frac=1)

Unnamed: 0,id,url,title,body,flair,dirty_title,dirty_body,num_words_title,num_words_body,num_unique_words_title,num_unique_words_body,num_chars_title,num_chars_body
366,fk25rj,https://www.reddit.com/r/india/comments/fk25rj...,askindia government put emi payment hold apri...,advent corona virus india money flow severely ...,AskIndia,[AskIndia] Why has the government not put all ...,"With the advent of Corona virus in India, mone...",8,39,8,38,52,257
670,cjnx1s,https://www.reddit.com/r/india/comments/cjnx1s...,query data science courses institutions detail...,hello folks friend query regarding data scienc...,Business/Finance,"A query about Data Science: Courses, Instituti...","Hello folks, a friend had a query regarding Da...",8,101,8,86,66,728
563,aoef6r,https://i.redd.it/p491fl35yaf21.jpg,shimla last night s snowfall mobile mi a2 phot...,nil,Photography,Shimla after last night's snowfall. Mobile - M...,NIL,9,1,9,1,54,3
30,g0y7zg,https://www.reddit.com/r/india/comments/g0y7zg...,random daily discussion thread april 14 2020 0...,beep boop bot i am a bot if any problem askawa...,Politics,"Random Daily Discussion Thread - April 14, 202...","^Beep ^Boop ^Bot, ^I ^am ^a ^bot! ^if ^any ^pr...",8,13,8,12,51,58
49,g3jfil,https://in.news.yahoo.com/indian-tricolour-pro...,indian tricolour projected onto switzerland s ...,nil,Politics,Indian Tricolour Projected onto Switzerland's ...,NIL,11,1,11,1,81,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,d76uvt,https://www.reddit.com/r/india/comments/d76uvt...,economic shit storm incoming brace impact fail...,getting bombarded let say scenario rather pess...,Policy/Economy,Economic Shit Storm Incoming brace for impact ...,"Before getting bombarded, let me say that it i...",12,246,12,182,79,1758
32,f1a6p2,https://theprint.in/opinion/pov/talking-to-cab...,talking cab drivers politics oldest cliché ube...,nil,Politics,Talking to cab drivers about politics is the o...,NIL,9,1,9,1,58,3
40,g1nd9a,https://theprint.in/india/governance/ias-edge-...,ias edge proves winner chief secretaries overs...,nil,Politics,IAS edge proves a winner as chief secretaries ...,NIL,11,1,11,1,81,3
833,7yz12m,https://www.reddit.com/r/india/comments/7yz12m...,need help depression anger,recently got heart broken realised unrequited ...,Science/Technology,Need help with depression and anger.,"Recently got my heart broken, realised it was ...",4,335,4,200,27,2129


In [131]:
top_flairs = ["Politics", "Non-Political", "Coronavirus", "AskIndia", "Policy/Economy", "[R]eddiquette", 
              "Photography", "Business/Finance", "Sports", "Science/Technology"]

### The columns of "title" and "body" are combined as one feature as "input_features" and "cat" stores the output label. Train test split is also done.

In [132]:
input_features = data["title"] + " "+ data["body"]
data = data.assign(input_features = input_features)

y = data.flair

x_train, x_test, y_train, y_test = train_test_split(input_features,y, test_size=0.3)

print("x_train dim:",x_train.shape, "\ty_train dim:", y_train.shape)
print("x_test dim:",x_test.shape, "\ty_test dim:", y_test.shape)

x_train dim: (692,) 	y_train dim: (692,)
x_test dim: (297,) 	y_test dim: (297,)


## Logistic Regression

It is a typical classifier and most pervasively used one too. It has great interpretability properties.
### Parameters:
1. C = inverse regularization parameter; C = 1/λ;  It’s a penalty term meant regulate against overfitting.
2. max_iter = maximum iterations done.

In [138]:
# Logistic Regression
logistic = Pipeline([('cv', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', LogisticRegression(C=1000, max_iter=1000))])
logistic.fit(x_train, y_train)

y_pred = logistic.predict(x_test)

accuracy = (accuracy_score(y_pred, y_test)*100)
print("Accuracy: %.2f" % accuracy +'%')
print(classification_report(y_test, y_pred,target_names=top_flairs))

Accuracy: 77.10%
                    precision    recall  f1-score   support

          Politics       0.79      0.82      0.81        33
     Non-Political       0.55      0.55      0.55        29
       Coronavirus       0.93      0.97      0.95        29
          AskIndia       0.85      0.82      0.84        34
    Policy/Economy       0.82      0.97      0.89        29
     [R]eddiquette       0.65      0.78      0.71        36
       Photography       0.73      0.73      0.73        30
  Business/Finance       0.96      0.79      0.87        29
            Sports       0.74      0.81      0.77        21
Science/Technology       0.71      0.44      0.55        27

          accuracy                           0.77       297
         macro avg       0.77      0.77      0.77       297
      weighted avg       0.77      0.77      0.77       297



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Inference from Logistic Regression:
1. The model has a total accuracy of **77%**, which is good.
2. The most performing flair category was **"Coronavirus"**.
3. The least performing flair categories were **"Non-Political"** and **"Sciene/Technology"**.

## NAIVE BAYES CLASSIFIER
Using Naive Bayes text classifiers might be a really good idea initially, especially if there’s not much training data available and computational resources are scarce. One of the most suitable variants for text is the multinomial variant.

In [133]:
# Naive Bayes
naive = Pipeline([('cv', CountVectorizer()),('tfidf', TfidfTransformer()),('nb', MultinomialNB())])
naive.fit(x_train, y_train)
y_pred = naive.predict(x_test)

accuracy = (accuracy_score(y_pred, y_test)*100)
print("Accuracy: %.2f" % accuracy +'%')
print(classification_report(y_test, y_pred,target_names=top_flairs))

Accuracy: 67.68%
                    precision    recall  f1-score   support

          Politics       0.59      0.73      0.65        33
     Non-Political       0.37      0.69      0.48        29
       Coronavirus       0.88      0.97      0.92        29
          AskIndia       0.89      0.74      0.81        34
    Policy/Economy       0.82      0.93      0.87        29
     [R]eddiquette       0.59      0.61      0.60        36
       Photography       0.74      0.67      0.70        30
  Business/Finance       0.89      0.59      0.71        29
            Sports       0.67      0.76      0.71        21
Science/Technology       1.00      0.07      0.14        27

          accuracy                           0.68       297
         macro avg       0.74      0.67      0.66       297
      weighted avg       0.74      0.68      0.66       297



### Inference from Naive Bayes Classifier:
1. The model has a total accuracy of **68%**, which is decent.
2. The most performing flair category was **"Coronavirus"**.
3. The least performing flair category was **"Sciene/Technology"**.

## Random Forest Classifier

Random forest classifier suits most multi-class classification problem, also they have good interpretability and work faster.
### Parameters:
1. n_estimators = the number of trees inthe forest.

In [144]:
# Random Forest
random = Pipeline([('cv', CountVectorizer()),('tfidf', TfidfTransformer()),('rf', RandomForestClassifier(n_estimators = 1000))])
random.fit(x_train, y_train)

y_pred = random.predict(x_test)

accuracy = (accuracy_score(y_pred, y_test)*100)
print("Accuracy: %.2f" % accuracy +'%')
print(classification_report(y_test, y_pred,target_names=top_flairs))

Accuracy: 82.15%
                    precision    recall  f1-score   support

          Politics       0.69      0.82      0.75        33
     Non-Political       0.73      0.55      0.63        29
       Coronavirus       0.94      1.00      0.97        29
          AskIndia       0.94      0.91      0.93        34
    Policy/Economy       0.93      0.97      0.95        29
     [R]eddiquette       0.83      0.83      0.83        36
       Photography       0.76      0.83      0.79        30
  Business/Finance       0.96      0.90      0.93        29
            Sports       0.71      0.81      0.76        21
Science/Technology       0.68      0.56      0.61        27

          accuracy                           0.82       297
         macro avg       0.82      0.82      0.81       297
      weighted avg       0.82      0.82      0.82       297



In [28]:
joblib.dump(random, 'random-forest.pkl')

['random-forest.pkl']

### Inference from Random Forest Classifier:
1. The model has a total accuracy of **82%**, which is goood.
2. The most performing flair category was **"Coronavirus"**.
3. The least performing flair category was **"Sciene/Technology"**.

## k- Nearest Neighbours Classifier
### Parameters:
1. n_neighbours = number of neighbours to take into consideration.

In [154]:
# k-Nearest Neighbours
neighbours = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', KNeighborsClassifier(n_neighbors=5))])
neighbours.fit(x_train, y_train)
y_pred = neighbours.predict(x_test)

accuracy = (accuracy_score(y_pred, y_test)*100)
print("Accuracy: %.2f" % accuracy +'%')
print(classification_report(y_test, y_pred,target_names=top_flairs))

Accuracy: 61.62%
                    precision    recall  f1-score   support

          Politics       0.43      0.45      0.44        33
     Non-Political       0.36      0.59      0.45        29
       Coronavirus       0.68      0.93      0.78        29
          AskIndia       0.73      0.79      0.76        34
    Policy/Economy       0.84      0.90      0.87        29
     [R]eddiquette       0.56      0.64      0.60        36
       Photography       0.68      0.43      0.53        30
  Business/Finance       0.76      0.55      0.64        29
            Sports       0.79      0.52      0.63        21
Science/Technology       0.67      0.30      0.41        27

          accuracy                           0.62       297
         macro avg       0.65      0.61      0.61       297
      weighted avg       0.64      0.62      0.61       297



### Inference from k- Nearest Neighbours Classifier:
1. The model has a total accuracy of **62%**, which is goood.
2. The most performing flair category was **"Policy/Economy"**.
3. The least performing flair category was **"Sciene/Technology"**.

# OUT OF THE 4 MODELS USED, RANDOM FOREST PERFORMED THE BEST WITH A TOTAL ACCURACY OF 82%