In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC 
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,\
classification_report,cohen_kappa_score,make_scorer,roc_auc_score

In [2]:
# Load the training set (df1) and the test set (df2) from their CSV files
df1, df2 = (pd.read_csv(csv_path) for csv_path in ('training.csv', 'test.csv'))

In [3]:
# Integer label encoding for the topics: each topic name maps to a single
# integer code given by its position in the list below (IRRELEVANT -> 0).
# Note: this is a label (ordinal) mapping, not a one-hot encoding.
_topic_names = [
    'IRRELEVANT',
    'ARTS CULTURE ENTERTAINMENT',
    'BIOGRAPHIES PERSONALITIES PEOPLE',
    'DEFENCE',
    'DOMESTIC MARKETS',
    'FOREX MARKETS',
    'HEALTH',
    'MONEY MARKETS',
    'SCIENCE AND TECHNOLOGY',
    'SHARE LISTINGS',
    'SPORTS',
]
encoding = {'topic': {name: code for code, name in enumerate(_topic_names)}}
encoding

{'topic': {'IRRELEVANT': 0,
  'ARTS CULTURE ENTERTAINMENT': 1,
  'BIOGRAPHIES PERSONALITIES PEOPLE': 2,
  'DEFENCE': 3,
  'DOMESTIC MARKETS': 4,
  'FOREX MARKETS': 5,
  'HEALTH': 6,
  'MONEY MARKETS': 7,
  'SCIENCE AND TECHNOLOGY': 8,
  'SHARE LISTINGS': 9,
  'SPORTS': 10}}

In [4]:
# Build a TF-IDF vectoriser over unigrams and bigrams and fit it on the
# training articles only.
ngram_range = (1, 2)
min_df = 10          # integer => absolute count: ignore terms found in fewer than 10 documents
max_df = 1.0         # float => proportion: ignore terms found in more than 100% of documents (i.e. none)
max_features = 1500  # upper bound on the vocabulary size

tfidf_params = dict(
    encoding='utf-8',
    lowercase=False,
    stop_words=None,
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    use_idf=True,
    sublinear_tf=True,
    norm='l2',
)
count = TfidfVectorizer(**tfidf_params)

# fit() returns the fitted vectoriser, so bag_of_words refers to the same object as count
bag_of_words = count.fit(df1['article_words'])

In [5]:
#Training and Testing split - X and Y
x_train = df1['article_words']
y_train = df1['topic'].to_list()
x_test = df2['article_words']
y_test = df2['topic'].to_list()

In [6]:
# Vectorise the raw article text of both splits with the vectoriser fitted on the
# training articles.
# The text is read straight from df1/df2 rather than from x_train/x_test so that this
# cell is idempotent: re-running it never feeds an already-transformed matrix back
# into transform().
x_train = bag_of_words.transform(df1['article_words'])
x_test = bag_of_words.transform(df2['article_words'])

## Cross-validation: comparing classifiers on the selected metrics

In [36]:
# Candidate classifiers to compare with cross-validation.
classifiers = [
    SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
        decision_function_shape='ovr', degree=3, gamma=0.1, kernel='sigmoid',
        max_iter=-1, probability=False, random_state=None, shrinking=True,
        tol=0.001, verbose=False),
    RFC(n_estimators=200,        # number of trees in the forest
        criterion='gini',        # impurity measure used to choose splits
        max_depth=80,            # maximum depth of each tree
        min_samples_split=9,     # minimum number of samples required to split an internal node
        min_samples_leaf=1,      # minimum number of samples required at a leaf node
        max_features='sqrt',     # number of features considered when looking for the best split
        max_leaf_nodes=None,
        bootstrap=False,
        warm_start=False,        # fit a whole new forest on every call to fit
        random_state=42),        # fixed seed so the forest (and its scores) are reproducible
    # fit_prior must be the boolean True; the string 'True' is only truthy by accident
    # and is rejected by scikit-learn versions that validate parameter types.
    MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
    KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                         metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                         weights='distance')
]

def cohen(y_true, y_predict):
    """Return Cohen's kappa between the true and the predicted labels."""
    return cohen_kappa_score(y_true, y_predict)


def accuracy(y_true, y_predict):
    """Return the fraction of predictions that match the true labels."""
    return accuracy_score(y_true, y_predict)


def precision(y_true, y_predict):
    """Return the macro-averaged precision over all classes."""
    return precision_score(y_true, y_predict, average='macro')


def recall(y_true, y_predict):
    """Return the macro-averaged recall over all classes."""
    return recall_score(y_true, y_predict, average='macro')


# A macro ROC-AUC scorer (roc_auc_score with average='macro') was sketched here
# but is left disabled.

# Scorers keyed by metric name; cross_validate reports each one as 'test_<name>'.
scoring = {
    metric_fn.__name__: make_scorer(metric_fn)
    for metric_fn in (cohen, accuracy, precision, recall)
}
# Run 5-fold cross-validation (cv=5) for every candidate classifier on the training
# data. Each entry pairs the cross_validate result dict with the classifier's class name.
classifier_accuracy_list = [
    (
        cross_validate(candidate, x_train, y_train, cv=5, scoring=scoring),
        type(candidate).__name__,
    )
    for candidate in classifiers
]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [50]:
# Print, for each classifier, the mean of every cross-validated metric.
# The message is assembled with join() rather than a backslash continuation inside
# the f-string, which embeds the next source line's indentation in the printed text.
for cv_results, classifier_name in classifier_accuracy_list:
    metric_means = ", ".join(
        f"{metric}_mean: {np.mean(cv_results['test_' + metric])}"
        for metric in ('cohen', 'accuracy', 'precision', 'recall')
    )
    print(f"classifier {classifier_name}\n {metric_means}")

classifier SVC 
 cohen_mean:0.6715813798798351,accuracy_mean: 0.7766303887840691,                                 precision_mean:0.7150691485941267,recall_mean:0.557924061423529
classifier RandomForestClassifier 
 cohen_mean:0.6213879156962081,accuracy_mean: 0.7510590966940207,                                 precision_mean:0.732372305476205,recall_mean:0.4034624129358716
classifier MultinomialNB 
 cohen_mean:0.6152752074072937,accuracy_mean: 0.7410603270410656,                                 precision_mean:0.5870507002421852,recall_mean:0.40425414539714444
classifier KNeighborsClassifier 
 cohen_mean:0.6450623282008181,accuracy_mean: 0.7536841582707059,                                 precision_mean:0.6887429887373889,recall_mean:0.5476950469432813


[({'fit_time': array([15.2449069 , 15.34216809, 15.0947392 , 15.13932395, 15.12666702]),
   'score_time': array([16.63713312, 16.85792518, 17.07731414, 17.06618714, 16.90793872]),
   'test_cohen': array([0.66805066, 0.68037011, 0.66861481, 0.67099632, 0.66987501]),
   'test_accuracy': array([0.77573529, 0.78098739, 0.77327722, 0.77689873, 0.7762533 ]),
   'test_precision': array([0.6731202 , 0.72841684, 0.69689268, 0.72329267, 0.75362335]),
   'test_recall': array([0.53541098, 0.58114242, 0.56374538, 0.55980345, 0.54951807])},
  'SVC'),
 ({'fit_time': array([23.03527713, 22.7660141 , 23.00905919, 22.60920095, 22.61260986]),
   'score_time': array([0.88352299, 0.91984105, 0.86381316, 0.83330607, 0.83615112]),
   'test_cohen': array([0.62695393, 0.61357241, 0.61211948, 0.6274649 , 0.62682885]),
   'test_accuracy': array([0.75472689, 0.74369748, 0.74539716, 0.75685654, 0.75461741]),
   'test_precision': array([0.70966642, 0.70601845, 0.74182318, 0.74271743, 0.76163605]),
   'test_recall':