In [21]:
import pandas as pd
import numpy as np
import pylab as plt
 
#this assumes one json item per line in json file
df=pd.read_json("./data/news_category_dataset.json", lines=True)

In [22]:
import re

def tokenize_url(url:str):   
    url=url.replace("https://www.huffingtonpost.com/entry/","")
    url=re.sub("(\W|_)+"," ",url)
    return url

df['tokenized_url']=df['link'].apply(lambda x:tokenize_url(x))

#just the description
df['text_desc'] = df['short_description']

#description + headline
df['text_desc_headline'] = df['short_description'] + ' '+ df['headline']

#description + headline + tokenized url
df['text_desc_headline_url'] = df['short_description'] + ' '+ df['headline']+" " + df['tokenized_url']


In [23]:
df.head(1)

Unnamed: 0,short_description,headline,date,link,authors,category,tokenized_url,text_desc,text_desc_headline,text_desc_headline_url
0,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,2018-05-26,https://www.huffingtonpost.com/entry/texas-ama...,Melissa Jeltsen,CRIME,texas amanda painter mass shooting us 5b081ab4...,She left her husband. He killed their children...,She left her husband. He killed their children...,She left her husband. He killed their children...


# Model Building

In [24]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


In [25]:
df_train, df_test = train_test_split(df,random_state = 8848)

Y_train= df_train['category'].values
Y_test = df_test['category'].values
     
vectorizer = CountVectorizer(binary=True, max_df=0.95)

field="text_desc"

vectorizer.fit_transform(df_train[field].values)
X_train = vectorizer.transform(df_train[field].values)
X_test  = vectorizer.transform(df_test[field].values)


In [26]:
print (f"X_train.shape: {X_train.shape}; X_test.shape: {X_test.shape} " )
print (f"Y_train.shape: {Y_train.shape}; Y_test.shape: {Y_test.shape} " )

X_train.shape: (93741, 47041); X_test.shape: (31248, 47041) 
Y_train.shape: (93741,); Y_test.shape: (31248,) 


In [27]:
#log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=100)
log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, penalty='l2',max_iter=10)
model = log_reg.fit(X_train,Y_train)

[LibLinear]



In [28]:

def TopKPrediction(model, X, k=3):
    # get probabilities for all the labels
    probs = model.predict_proba(X)
    #print (probs.shape)
    # find the Top k values
    # Note1: np.argsort sorts starting the smallest so pick last k values for the biggest ones
    best_n = np.argsort(probs, axis=1)[:, -k:]
    # Note2: we pick the last three in that order meaning the last one is the biggest one.
    # So reverse each item so that first prediction is the top prediction
    best_n = [ item[::-1] for item in best_n]
    #convert the numbers to class using model.classes_
    preds=[[model.classes_[predicted_cat] for predicted_cat in prediction] for prediction in best_n]
    return preds

#preds = TopKPrediction(model, X_test, k)

#print (model.classes_)

In [29]:
def ComputeAccuracy(y , y_preds_topk):
    # Check if the actual label is among the top-k prediction
    return sum( [ y[i] in y_preds_topk[i] for i in range(len(y))]  ) / (len(y)+0.)

In [30]:
# 

Y_pred_topk= TopKPrediction(model, X_train, k=3)
accuracy = ComputeAccuracy(Y_train , Y_pred_topk)
print (f"Train set accuracy: {100*np.round(accuracy, 2)} %")

Y_pred_topk= TopKPrediction(model, X_test, k=3)
accuracy = ComputeAccuracy(Y_test , Y_pred_topk)
print (f"Test set accuracy: {100*np.round(accuracy, 2)} %")

Train set accuracy: 82.0 %
Test set accuracy: 62.0 %


In [38]:

def ReciprocalRank(y_t, y_p):
    # add index to list only if true label is in predicted label 
    y_true_pos = [(idx+1) for idx, p in enumerate(y_p) if p ==y_t]
    # find the inverse of the position if y_true in y_pred
    if len(y_true_pos) >0:
        # for RR we need position of first correct item
        return 1./(y_true_pos[0])
    
    return 0.
        
def MRR(y_true, y_pred):
    rr_tot = 0.
    for i in range(len(y_true)):
        rr_tot += ReciprocalRank(y_true[i], y_pred[i])
    mrr = rr_tot / (len(y_true)+0.)    
    return mrr    

In [36]:
def _reciprocal_rank(true_labels: list, machine_preds: list):
    """Compute the reciprocal rank at cutoff k"""
    
    # add index to list only if machine predicted label exists in true labels
    tp_pos_list = [(idx + 1) for idx, r in enumerate(machine_preds) if r in true_labels]

    rr = 0
    if len(tp_pos_list) > 0:
        # for RR we need position of first correct item
        first_pos_list = tp_pos_list[0]
        rr = 1 / float(first_pos_list)
    return rr

def compute_mrr_at_k(item0, item1):
    """Compute the MRR (average RR) at cutoff k"""
    rr_total = 0
    
    for i in range(len(item0)):
        rr_at_k = _reciprocal_rank(item0[0],item1[i])
        rr_total = rr_total + rr_at_k
        mrr = rr_total / 1/float(len(item0))

    return mrr

In [39]:

mrr= compute_mrr_at_k(Y_test , Y_pred_topk)
print (mrr)

print (MRR(Y_test , Y_pred_topk))

0.039629202935654514
0.5024534903567329
