In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC 
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,\
classification_report,cohen_kappa_score,make_scorer,roc_auc_score

In [2]:
# Load the training articles into df1 and the test articles into df2.
df1, df2 = (pd.read_csv(csv_name) for csv_name in ('training.csv', 'test.csv'))

In [4]:
# Integer label encoding for the 'topic' column: every topic name maps to a
# fixed class id. (This is an ordinal mapping, not a one-hot encoding.)
# IRRELEVANT is id 0; the other topics are numbered 1-10 in the order listed.
encoding = {
    'topic': {
        label: class_id
        for class_id, label in enumerate([
            'IRRELEVANT',
            'ARTS CULTURE ENTERTAINMENT',
            'BIOGRAPHIES PERSONALITIES PEOPLE',
            'DEFENCE',
            'DOMESTIC MARKETS',
            'FOREX MARKETS',
            'HEALTH',
            'MONEY MARKETS',
            'SCIENCE AND TECHNOLOGY',
            'SHARE LISTINGS',
            'SPORTS',
        ])
    }
}


In [5]:
# Build a TF-IDF feature extractor over unigrams and bigrams and learn its
# vocabulary / IDF weights from the training articles.
ngram_range = (1, 2)   # single tokens and adjacent token pairs
min_df = 10            # integer => ignore terms found in fewer than 10 documents
max_df = 1.0           # float => proportion of documents; 1.0 filters nothing out
max_features = 1500    # cap on the vocabulary size
count = TfidfVectorizer(
    # vocabulary selection
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    # text handling: no lowercasing or stop-word removal is applied here
    # (presumably 'article_words' is already pre-processed -- TODO confirm)
    encoding='utf-8',
    lowercase=False,
    stop_words=None,
    # weighting
    use_idf=True,
    sublinear_tf=True,
    norm='l2',
)
# fit() hands back the vectorizer itself, so bag_of_words is the fitted `count`.
bag_of_words = count.fit(df1['article_words'])

In [6]:
# Pull the raw article text (X) and the topic labels (Y) out of the
# training frame (df1) and the test frame (df2).
x_train, y_train = df1['article_words'], df1['topic'].tolist()
x_test, y_test = df2['article_words'], df2['topic'].tolist()

In [7]:
# Turn the article text into TF-IDF feature matrices with the fitted vectorizer.
# The text is read straight from df1/df2 instead of from x_train/x_test: this
# cell rebinds x_train/x_test to matrices, so feeding those names back into
# transform() would break the cell as soon as it is run a second time.
x_train = bag_of_words.transform(df1['article_words'])
x_test = bag_of_words.transform(df2['article_words'])

In [8]:

# Support vector classifier with a linear kernel, trained on the TF-IDF features.
classifier = SVC(
    # model
    kernel='linear',
    C=1,
    class_weight=None,
    decision_function_shape='ovr',
    # kernel-specific settings; a linear kernel does not use any of these
    degree=3,
    gamma=1,
    coef0=0.0,
    # solver
    tol=0.001,
    max_iter=-1,
    shrinking=True,
    cache_size=200,
    # misc
    probability=False,
    random_state=None,
    verbose=False,
)
# fit() returns the estimator itself, so `model` is the fitted `classifier`.
model = classifier.fit(x_train, y_train)


In [9]:
# Predict a topic label for every test article with the fitted classifier.
y_predict = model.predict(x_test)

## Cosine similarity

In [10]:
#ds = pd.read_csv("/home/nikita/Downloads/sample-data.csv")
def recommendation(test,y_predict,train_data,topic_dict,topic_list,top_n=10):
    """Recommend test articles per topic and return their true/predicted topics.

    For every topic in ``topic_list`` (except ``'IRRELEVANT'``) the test
    articles *predicted* to belong to that topic are scored by their highest
    cosine similarity to any training article labelled with the same topic.
    The ``top_n`` best-scoring articles are recommended and one line per
    topic is printed with their article numbers.

    Parameters
    ----------
    test : pandas.DataFrame
        Test articles; must have 'article_words', 'article_number' and
        'topic' (true label) columns. It is not modified.
    y_predict : sequence
        Predicted topic for each row of ``test``, in row order.
    train_data : pandas.DataFrame
        Training articles with 'article_words' and 'topic' columns.
    topic_dict : any
        Unused; kept only so existing calls keep working.
    topic_list : iterable
        Topic names to produce recommendations for (iterating a dict keyed
        by topic name works too).
    top_n : int, optional
        Maximum number of articles recommended per topic (default 10).

    Returns
    -------
    (y_rec_true, y_rec_predict) : tuple of lists
        True and predicted topic of every recommended article, aligned
        element by element, in the order the articles were recommended.
    """
    # Work on a copy that carries the predicted topic instead of the true one.
    test_data = test.assign(topic=y_predict)

    # TF-IDF space learnt from the full training corpus (default settings).
    vectorizer = TfidfVectorizer().fit(train_data['article_words'])

    article_list = []
    for t in topic_list:
        if t == 'IRRELEVANT':
            continue
        test_rows = test_data[test_data['topic'] == t]
        train_rows = train_data[train_data['topic'] == t]
        # Nothing to recommend, or nothing to compare against (the latter
        # would otherwise make cosine_similarity raise on an empty matrix).
        if test_rows.empty or train_rows.empty:
            continue

        # Keep the matrices sparse: cosine_similarity accepts sparse input and
        # returns a dense (n_test, n_train) array, so densifying first only
        # costs memory.
        tfidf_train = vectorizer.transform(train_rows['article_words'])
        tfidf_test = vectorizer.transform(test_rows['article_words'])
        best_match = cosine_similarity(tfidf_test, tfidf_train).max(axis=1)

        # Positions of the top_n highest scores, in ascending score order
        # (the closest match is listed last).
        first_kept = max(len(best_match) - top_n, 0)
        suggested_list = np.argsort(best_match)[first_kept:].tolist()

        test_article_number = test_rows['article_number'].tolist()
        article_number = [test_article_number[i] for i in suggested_list]
        article_list += article_number
        joined_numbers = ",".join([str(number) for number in article_number])
        print(f"For topic {t} recommending article {joined_numbers}")

    def first_topic_by_article(frame):
        # article_number -> topic of the first row carrying that number.
        unique_rows = frame.drop_duplicates(subset='article_number')
        return dict(zip(unique_rows['article_number'].tolist(),
                        unique_rows['topic'].tolist()))

    # One pass to build each lookup instead of filtering the whole frame once
    # per recommended article.
    true_topic = first_topic_by_article(test)
    predicted_topic = first_topic_by_article(test_data)
    y_rec_true = [true_topic[number] for number in article_list]
    y_rec_predict = [predicted_topic[number] for number in article_list]
    return y_rec_true,y_rec_predict

In [11]:
# Reverse lookup: class id -> topic name.
topic_dict = {class_id: name for name, class_id in encoding['topic'].items()}
# The name -> id mapping doubles as the topic list: iterating it yields the names.
y_rec_true, y_rec_predict = recommendation(
    df2, y_predict, df1, topic_dict, encoding['topic'])
# Dataframe of the recommendation topic and the true topic
d = dict(y_rec_true=y_rec_true, y_rec_predict=y_rec_predict)
df_rec = pd.DataFrame(d)

For topic ARTS CULTURE ENTERTAINMENT recommending article 9703,9830,9933,9952
For topic BIOGRAPHIES PERSONALITIES PEOPLE recommending article 9896,9526,9988,9940
For topic DEFENCE recommending article 9607,9770,9616,9670,9559,9759,9987,9576,9773
For topic DOMESTIC MARKETS recommending article 9640,9989
For topic FOREX MARKETS recommending article 9671,9565,9530,9529,9977,9551,9986,9682,9588,9798
For topic HEALTH recommending article 9982,9947,9810,9661,9873,9621,9929,9807,9833,9926
For topic MONEY MARKETS recommending article 9961,9808,9678,9766,9725,9555,9761,9853,9634,9586
For topic SHARE LISTINGS recommending article 9562,9666,9715,9601,9518
For topic SPORTS recommending article 9787,9992,9791,9754,9630,9752,9608,9513,9520,9800


In [12]:
# Per-topic precision / recall / F1 of the predicted topic against the true
# topic, computed only over the recommended articles (not the whole test set).
# NOTE(review): topics with no true or no predicted samples in this subset
# show 0.00 scores and appear to trigger scikit-learn's undefined-metric warnings.
print(classification_report(y_rec_true, y_rec_predict))

                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.50      1.00      0.67         2
BIOGRAPHIES PERSONALITIES PEOPLE       0.75      0.60      0.67         5
                         DEFENCE       0.78      1.00      0.88         7
                DOMESTIC MARKETS       0.00      0.00      0.00         0
                   FOREX MARKETS       0.60      0.60      0.60        10
                          HEALTH       0.80      1.00      0.89         8
                      IRRELEVANT       0.00      0.00      0.00        10
                   MONEY MARKETS       0.50      0.56      0.53         9
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         1
                  SHARE LISTINGS       0.60      1.00      0.75         3
                          SPORTS       0.90      1.00      0.95         9

                        accuracy                           0.67        64
                       macro avg    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
