In [84]:
# Standard library
import json
import pickle
import re
import warnings

# Third-party
import gensim
import joblib
import numpy as np
import pandas as pd

# scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

In [85]:
# Read the pre-processed dataset (CSV in the notebook's working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='./processedIndia.csv')

In [86]:
# Every flair we report on, in the row order used by the classification reports
flairs = [
    'Business/Finance',
    'Food',
    'Sports',
    'Politics',
    'Science/Technology',
    'Policy/Economy',
    'Scheduled',
    'Non-Political',
    'AskIndia',
    'Coronavirus',
    'CAA-NRC-NPR',
]

In [87]:
# Split the data into train and test sets and evaluate every classifier on
# that single split, so their metrics are directly comparable.
# Defaults reproduce the 90:10 split with a fixed seed of 42.

def train_test(X, y, test_size=0.10, random_state=42):
    """Split (X, y) once and print the metrics of each classifier on that split.

    Parameters
    ----------
    X : array-like of str
        Raw text documents; each classifier vectorises them itself with
        ``CountVectorizer``.
    y : array-like
        Target label for each document.
    test_size : float, default 0.10
        Fraction of the samples held out for testing, forwarded to
        ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    None
        Results are printed by the individual classifier functions.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # (heading printed before the run, evaluation function) in execution order.
    evaluations = (
        ("Naive Bayes Classifier ", naivebayes_classifier),
        ("Linear Support Vector Machine ", lsvm),
        ("Logistic Regression ", logistic_regression),
        ("Random Forest ", random_forest),
        ("MLP Classifier ", mlp_classifier),
    )
    for heading, evaluate in evaluations:
        print(heading)
        evaluate(X_train, X_test, y_train, y_test)

In [88]:
# Let's create a function for each classifier

# Naive Bayes
def naivebayes_classifier(X_train, X_test, y_train, y_test):
    """Fit a count -> TF-IDF -> multinomial Naive Bayes pipeline and print its test metrics.

    Trains on (X_train, y_train), predicts X_test, then prints the accuracy
    followed by a classification report with one row per entry of `flairs`.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: " + str(accuracy))

    # `flairs` serves as both the label set and the row names of the report.
    report = classification_report(y_test, predictions, labels=flairs, target_names=flairs)
    print(report)

# Linear Support Vector Machine
def lsvm(X_train, X_test, y_train, y_test):
    """Fit a count -> TF-IDF -> hinge-loss SGD pipeline and print its test metrics.

    Trains on (X_train, y_train), predicts X_test, then prints the accuracy
    followed by a classification report with one row per entry of `flairs`.
    """
    # Hinge loss with an L2 penalty; training runs for exactly 5 epochs
    # because tol=None disables the early-stopping tolerance check.
    classifier = SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: " + str(accuracy))

    # `flairs` serves as both the label set and the row names of the report.
    report = classification_report(y_test, predictions, labels=flairs, target_names=flairs)
    print(report)

# Logistic Regression
def logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a count -> TF-IDF -> logistic regression pipeline and print its test metrics.

    Trains on (X_train, y_train), predicts X_test, then prints the accuracy
    followed by a classification report with one row per entry of `flairs`.
    """
    # C is the inverse regularisation strength, so C=1e30 makes the penalty negligible.
    classifier = LogisticRegression(n_jobs=1, C=1e30)
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: " + str(accuracy))

    # `flairs` serves as both the label set and the row names of the report.
    report = classification_report(y_test, predictions, labels=flairs, target_names=flairs)
    print(report)
    
# Random Forest
def random_forest(X_train, X_test, y_train, y_test):
    """Fit a count -> TF-IDF -> random forest pipeline and print its test metrics.

    Trains on (X_train, y_train), predicts X_test, then prints the accuracy
    followed by a classification report with one row per entry of `flairs`.
    """
    # 1000 trees, seeded so repeated runs build the same forest.
    classifier = RandomForestClassifier(n_estimators=1000, random_state=42)
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: " + str(accuracy))

    # `flairs` serves as both the label set and the row names of the report.
    report = classification_report(y_test, predictions, labels=flairs, target_names=flairs)
    print(report)

# MLP Classifier
def mlp_classifier(X_train, X_test, y_train, y_test):
    """Fit a count -> TF-IDF -> multi-layer perceptron pipeline and print its test metrics.

    Trains on (X_train, y_train), predicts X_test, then prints the accuracy
    followed by a classification report with one row per entry of `flairs`.

    Requires ``MLPClassifier`` from ``sklearn.neural_network`` to be imported
    at module level.
    """
    mlp = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # Three hidden layers of 30 units each. random_state pins the weight
        # initialisation and batch sampling so reruns print the same numbers,
        # in line with the seeded SGD and random-forest models.
        ('clf', MLPClassifier(hidden_layer_sizes=(30, 30, 30), random_state=42)),
    ])
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_test)

    print("Accuracy: " + str(accuracy_score(y_test, y_pred)))

    # `flairs` serves as both the label set and the row names of the report.
    print(classification_report(y_test, y_pred, labels=flairs, target_names=flairs))


In [60]:
# Silence every warning from here on, then evaluate all classifiers
# using only the `title` column as the input text.
warnings.filterwarnings(action='ignore')
print(
    "Flair Detection using Title as a Feature:",
    "---------------------------------------",
    sep="\n",
)
train_test(data["title"], data["flair"])

Flair Detection using Title as a Feature:
---------------------------------------
Naive Bayes Classifier 
Accuracy: 0.9888475836431226
                    precision    recall  f1-score   support

  Business/Finance       1.00      1.00      1.00        11
              Food       1.00      1.00      1.00         2
            Sports       0.00      0.00      0.00         0
          Politics       1.00      1.00      1.00        14
Science/Technology       0.00      0.00      0.00         1
    Policy/Economy       1.00      1.00      1.00        16
         Scheduled       1.00      1.00      1.00        17
     Non-Political       1.00      1.00      1.00        39
          AskIndia       0.98      1.00      0.99       126
       Coronavirus       1.00      0.95      0.97        41
       CAA-NRC-NPR       1.00      1.00      1.00         2

         micro avg       0.99      0.99      0.99       269
         macro avg       0.82      0.81      0.81       269
      weighted avg     

In [61]:
# Evaluate all classifiers using only the `body` column as the input text.
print(
    "Flair Detection using Body as a Feature:",
    "---------------------------------------",
    sep="\n",
)
train_test(data["body"], data["flair"])

Flair Detection using Body as a Feature:
---------------------------------------
Naive Bayes Classifier 
Accuracy: 0.9776951672862454
                    precision    recall  f1-score   support

  Business/Finance       1.00      0.73      0.84        11
              Food       0.00      0.00      0.00         2
            Sports       0.00      0.00      0.00         0
          Politics       1.00      1.00      1.00        14
Science/Technology       0.00      0.00      0.00         1
    Policy/Economy       1.00      1.00      1.00        16
         Scheduled       1.00      1.00      1.00        17
     Non-Political       1.00      1.00      1.00        39
          AskIndia       0.95      1.00      0.98       126
       Coronavirus       1.00      1.00      1.00        41
       CAA-NRC-NPR       1.00      1.00      1.00         2

         micro avg       0.98      0.98      0.98       269
         macro avg       0.72      0.70      0.71       269
      weighted avg      

In [55]:
# Evaluate all classifiers on the `feature_tbc` column (title + body + comments).
print(
    "Flair Detection using Title+Body+Comments as a Feature:",
    "---------------------------------------",
    sep="\n",
)
train_test(data["feature_tbc"], data["flair"])

Flair Detection using Title+Body+Comments as a Feature:
---------------------------------------
Naive Bayes Classifier 
Accuracy: 0.9090909090909091
                    precision    recall  f1-score   support

  Business/Finance       1.00      0.42      0.59        24
              Food       0.00      0.00      0.00         4
            Sports       0.00      0.00      0.00         0
          Politics       1.00      0.97      0.99        37
Science/Technology       0.00      0.00      0.00         1
    Policy/Economy       1.00      0.52      0.68        33
         Scheduled       1.00      0.92      0.96        38
     Non-Political       1.00      0.90      0.95       107
          AskIndia       0.85      1.00      0.92       336
       Coronavirus       1.00      0.91      0.95        88
       CAA-NRC-NPR       0.00      0.00      0.00         3

         micro avg       0.91      0.91      0.91       671
         macro avg       0.62      0.51      0.55       671
      wei

In [62]:
# Evaluate all classifiers on the `feature_tbu` column (title + body + URL).
print(
    "Flair Detection using Title+Body+URL as a Feature:",
    "---------------------------------------",
    sep="\n",
)
train_test(data["feature_tbu"], data["flair"])

Flair Detection using Title+Body+URL as a Feature:
---------------------------------------
Naive Bayes Classifier 
Accuracy: 0.9888475836431226
                    precision    recall  f1-score   support

  Business/Finance       1.00      1.00      1.00        11
              Food       0.00      0.00      0.00         2
            Sports       0.00      0.00      0.00         0
          Politics       1.00      1.00      1.00        14
Science/Technology       0.00      0.00      0.00         1
    Policy/Economy       1.00      1.00      1.00        16
         Scheduled       1.00      1.00      1.00        17
     Non-Political       1.00      1.00      1.00        39
          AskIndia       0.98      1.00      0.99       126
       Coronavirus       1.00      1.00      1.00        41
       CAA-NRC-NPR       1.00      1.00      1.00         2

         micro avg       0.99      0.99      0.99       269
         macro avg       0.73      0.73      0.73       269
      weighted

In [57]:
# Evaluate all classifiers on the `feature_tbcu` column (title + body + comments + URL).
print(
    "Flair Detection using Title+Body+Comments+URL as a Feature:",
    "---------------------------------------",
    sep="\n",
)
train_test(data["feature_tbcu"], data["flair"])

Flair Detection using Title+Body+Comments+URL as a Feature:
---------------------------------------
Naive Bayes Classifier 
Accuracy: 0.910581222056632
                    precision    recall  f1-score   support

  Business/Finance       1.00      0.42      0.59        24
              Food       0.00      0.00      0.00         4
            Sports       0.00      0.00      0.00         0
          Politics       1.00      0.97      0.99        37
Science/Technology       0.00      0.00      0.00         1
    Policy/Economy       1.00      0.52      0.68        33
         Scheduled       1.00      0.92      0.96        38
     Non-Political       1.00      0.91      0.95       107
          AskIndia       0.85      1.00      0.92       336
       Coronavirus       1.00      0.91      0.95        88
       CAA-NRC-NPR       0.00      0.00      0.00         3

         micro avg       0.91      0.91      0.91       671
         macro avg       0.62      0.51      0.55       671
      

### After a detailed analysis of all the metrics:
### The combined Title + Body + URL feature performed better than the others.
### Logistic Regression is chosen as the final model.

In [89]:
# Train the final Logistic Regression model on the combined Title + Body + URL
# column (`feature_tbu`), the feature the conclusion above selects.
# NOTE(review): whatever loads the saved model must feed it the same combined
# text at prediction time - confirm against the consumer of final_model.pkl.
X = data.feature_tbu
y = data.flair

# Same 90:10 split and seed as the comparison runs above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

logreg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # C is the inverse regularisation strength, so C=1e30 makes the penalty negligible.
    ('clf', LogisticRegression(n_jobs=1, C=1e30)),
])
logreg.fit(X_train, y_train)

# Predictions on the held-out split (not used further in the visible cells).
y_pred = logreg.predict(X_test)

In [91]:
# Let's dump our model
# The context manager closes the file handle even if pickling raises,
# so the pickle is fully flushed to disk before the cell finishes.
with open('final_model.pkl', 'wb') as model_file:
    pickle.dump(logreg, model_file)