
# News Text Classification

* News Category Dataset set from [Kaggle competition](https://www.kaggle.com/rmisra/news-category-dataset).
* Nice [sklearn tutorial](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html).



## 1. Understanding the data



In [1]:
import pandas as pd
import numpy as np
import pylab as plt
 
#this assumes one json item per line in json file
df=pd.read_json("./data/news_category_dataset.json", lines=True)

In [2]:
df.columns

Index(['short_description', 'headline', 'date', 'link', 'authors', 'category'], dtype='object')

## 2. Texts for Classification

These are some of the fields we can use for the classification task. We create 3 different versions.

**Tokenize the Text**

In [3]:
def Tokenize(url):   
    import re
    url=url.replace("https://www.huffingtonpost.com/entry/","")
    url=re.sub("(\W|_)+"," ",url)
    return url

df['tokenized_url']=df['link'].apply(lambda x:Tokenize(x))

#just the description
df['text_desc'] = df['short_description']

#description + headline
df['text_desc_headline'] = df['short_description'] + ' '+ df['headline']

#description + headline + tokenized url
df['text_desc_headline_url'] = df['short_description'] + ' '+ df['headline']+" " + df['tokenized_url']


In [4]:
df.head(1)

Unnamed: 0,short_description,headline,date,link,authors,category,tokenized_url,text_desc,text_desc_headline,text_desc_headline_url
0,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,2018-05-26,https://www.huffingtonpost.com/entry/texas-ama...,Melissa Jeltsen,CRIME,texas amanda painter mass shooting us 5b081ab4...,She left her husband. He killed their children...,She left her husband. He killed their children...,She left her husband. He killed their children...


In [5]:
df['tokenized_url'][0]

'texas amanda painter mass shooting us 5b081ab4e4b0802d69caad89'

## 3. Training Logistic Regression Model

### 3.1 Extract Features, Top k prediction

In [6]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [7]:
def ExtractFeatures(df, field, feature):
    # train, test, validation split (60%, 20%, 20%)
    df_train_val, df_test  = train_test_split(df, test_size=0.2, random_state = 8848)
    df_train    , df_valid = train_test_split(df_train_val, test_size=0.25, random_state = 8848)

    """Extract features for given field and using different methods"""
    # otain vectorizer for different methods
    if feature in ["binary", "counts"]:
        binary = (feature=="binary")
        vectorizer = CountVectorizer(binary=binary, max_df=0.95)
    else:
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
    vectorizer.fit_transform(df_train[field].values)
    
    X_train = vectorizer.transform(df_train[field].values)
    X_valid = vectorizer.transform(df_valid[field].values)
    X_test  = vectorizer.transform( df_test[field].values)
    
    y_train = df_train['category'].values
    y_test  =  df_test['category'].values
    y_valid = df_valid['category'].values

    return X_train, X_valid, X_test, y_train, y_valid, y_test, vectorizer


In [8]:
def TopKPrediction(model, X, k):
    # get probabilities for all the labels
    probs = model.predict_proba(X) #; print (probs.shape)
    # find the Top k values
    # Note1: np.argsort sorts starting the smallest so pick last k values for the biggest ones
    best_n = np.argsort(probs, axis=1)[:, -k:]
    # Note2: we pick the last three in that order meaning the last one is the biggest one.
    # So reverse each item so that first prediction is the top prediction
    best_n = [ item[::-1] for item in best_n]
    #convert the numbers to class using model.classes_
    preds_topk = [[model.classes_[predicted_cat] for predicted_cat in prediction] for prediction in best_n]
    return preds_topk

def ComputeAccuracy(y , y_preds_topk):
    # Check if the actual label is among the top-k prediction
    return sum( [ y[i] in y_preds_topk[i] for i in range(len(y))]  ) / (len(y)+0.)

### Mean Reciprocal Rank

* Here is a [nice presentation on MRR](https://dibt.unimol.it/TAinSM2012/slides/dawn.pdf). Also check my notes for more on this.

* [Medium Blog](https://medium.com/swlh/rank-aware-recsys-evaluation-metrics-5191bba16832)

In [9]:
def ReciprocalRank(y_t, y_p):
    # add index to list only if true label is in predicted label 
    y_true_pos = [(ip+1) for ip, p in enumerate(y_p) if p == y_t]
    # find the inverse of the position if y_true in y_pred
    if len(y_true_pos) >0:
        return 1./(y_true_pos[0])
    return 0.

def MRR(y_true, y_pred):
    rr_tot = 0.
    for i in range(len(y_true)):
        rr_tot += ReciprocalRank(y_true[i], y_pred[i])
    mrr = rr_tot / (len(y_true)+0.)
    return mrr    

In [10]:
def TrainModel(df, field="text_desc", feature="binary", k=2):
    X_train, X_valid, X_test, y_train, y_valid, y_test, vectorizer = ExtractFeatures(df, field, feature)
    log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=8848,max_iter=50)
    model   = log_reg.fit(X_train, y_train)
    
    preds_top_k = TopKPrediction(model, X_train, k)
    accuracy_train = ComputeAccuracy(y_train, preds_top_k)
    mrr_train = MRR(y_train, preds_top_k)
    
    preds_top_k = TopKPrediction(model, X_valid, k)
    accuracy_valid = ComputeAccuracy(y_valid, preds_top_k)
    mrr_valid = MRR(y_valid, preds_top_k)
    
    return model, vectorizer, [accuracy_train, accuracy_valid], [mrr_train, mrr_valid]


### Model Evaluation: only Text description

In [11]:
field="text_desc"

results=[]

for feature in ["binary", "counts", "tfidf"]:
    for k in [1]:
        model, vectorizer, acc, mrr = TrainModel(df, field=field, feature=feature, k=k)
        
        print (f"Training set: accuracy= {100*np.round(acc[0], 2)} %,  MRR={100*np.round(mrr[0], 2)} \
        Validation set: accuracy= {100*np.round(acc[1], 2)} %,  MRR={100*np.round(mrr[1], 2)} ")
        results.append([field, feature, k, acc[0], acc[1], mrr[0], mrr[1] ])

        #print (f"feature: {feature} k={k} : Training set accuracy: {100*np.round(accuracy_train, 2)} % and Validation set accuracy: {100*np.round(accuracy_valid, 2)} %")


[LibLinear]Training set: accuracy= 69.0 %,  MRR=69.0         Validation set: accuracy= 41.0 %,  MRR=41.0 
[LibLinear]Training set: accuracy= 69.0 %,  MRR=69.0         Validation set: accuracy= 41.0 %,  MRR=41.0 
[LibLinear]Training set: accuracy= 47.0 %,  MRR=47.0         Validation set: accuracy= 40.0 %,  MRR=40.0 


In [12]:
print (results)

[['text_desc', 'binary', 1, 0.687050791407198, 0.4068725498039843, 0.687050791407198, 0.4068725498039843], ['text_desc', 'counts', 1, 0.6896510340965156, 0.4060724857988639, 0.6896510340965156, 0.4060724857988639], ['text_desc', 'tfidf', 1, 0.46727027855933223, 0.3953916313305064, 0.46727027855933223, 0.3953916313305064]]


### Model Evaluation: Text description plus headline

In [13]:

field = "text_desc_headline"

for feature in ["binary", "counts", "tfidf"]:
    for k in [1]:
        model, vectorizer, acc, mrr = TrainModel(df, field=field, feature=feature, k=k)
        
        print (f"Training set: accuracy= {100*np.round(acc[0], 2)} %,  MRR={100*np.round(mrr[0], 2)} \
        Validation set: accuracy= {100*np.round(acc[1], 2)} %,  MRR={100*np.round(mrr[1], 2)} ")
        results.append([field, feature, k, acc[0], acc[1], mrr[0], mrr[1] ])


[LibLinear]Training set: accuracy= 92.0 %,  MRR=92.0         Validation set: accuracy= 60.0 %,  MRR=60.0 
[LibLinear]Training set: accuracy= 92.0 %,  MRR=92.0         Validation set: accuracy= 60.0 %,  MRR=60.0 
[LibLinear]Training set: accuracy= 69.0 %,  MRR=69.0         Validation set: accuracy= 57.99999999999999 %,  MRR=57.99999999999999 


### Model Evaluation: Text description plus headline plus url

In [14]:
field="text_desc_headline_url"

for feature in ["binary", "counts", "tfidf"]:
    for k in [1]:
        model, vectorizer, acc, mrr = TrainModel(df, field=field, feature=feature, k=k)
        
        print (f"Training set: accuracy= {100*np.round(acc[0], 2)} %,  MRR={100*np.round(mrr[0], 2)} \
        Validation set: accuracy= {100*np.round(acc[1], 2)} %,  MRR={100*np.round(mrr[1], 2)} ")
        results.append([field, feature, k, acc[0], acc[1], mrr[0], mrr[1] ])



[LibLinear]Training set: accuracy= 97.0 %,  MRR=97.0         Validation set: accuracy= 63.0 %,  MRR=63.0 
[LibLinear]Training set: accuracy= 98.0 %,  MRR=98.0         Validation set: accuracy= 64.0 %,  MRR=64.0 
[LibLinear]Training set: accuracy= 72.0 %,  MRR=72.0         Validation set: accuracy= 62.0 %,  MRR=62.0 


In [15]:
print (results)

[['text_desc', 'binary', 1, 0.687050791407198, 0.4068725498039843, 0.687050791407198, 0.4068725498039843], ['text_desc', 'counts', 1, 0.6896510340965156, 0.4060724857988639, 0.6896510340965156, 0.4060724857988639], ['text_desc', 'tfidf', 1, 0.46727027855933223, 0.3953916313305064, 0.46727027855933223, 0.3953916313305064], ['text_desc_headline', 'binary', 1, 0.919645833611137, 0.5990079206336507, 0.919645833611137, 0.5990079206336507], ['text_desc_headline', 'counts', 1, 0.9237262144466817, 0.5970077606208497, 0.9237262144466817, 0.5970077606208497], ['text_desc_headline', 'tfidf', 1, 0.6850506047231075, 0.583966717337387, 0.6850506047231075, 0.583966717337387], ['text_desc_headline_url', 'binary', 1, 0.9695838278239303, 0.6279302344187535, 0.9695838278239303, 0.6279302344187535], ['text_desc_headline_url', 'counts', 1, 0.982478364647367, 0.6362108968717497, 0.982478364647367, 0.6362108968717497], ['text_desc_headline_url', 'tfidf', 1, 0.7214540023735548, 0.6205296423713897, 0.721454002

# Compiling the results

In [16]:
columns=['text_fields','feature','top_k','training_accuracy','validation_accuracy', 'training_mrr','validation_mrr'] 
df_results=pd.DataFrame(results,columns=columns)
df_results.sort_values(by=['text_fields','validation_accuracy'],ascending=False)

Unnamed: 0,text_fields,feature,top_k,training_accuracy,validation_accuracy,training_mrr,validation_mrr
7,text_desc_headline_url,counts,1,0.982478,0.636211,0.982478,0.636211
6,text_desc_headline_url,binary,1,0.969584,0.62793,0.969584,0.62793
8,text_desc_headline_url,tfidf,1,0.721454,0.62053,0.721454,0.62053
3,text_desc_headline,binary,1,0.919646,0.599008,0.919646,0.599008
4,text_desc_headline,counts,1,0.923726,0.597008,0.923726,0.597008
5,text_desc_headline,tfidf,1,0.685051,0.583967,0.685051,0.583967
0,text_desc,binary,1,0.687051,0.406873,0.687051,0.406873
1,text_desc,counts,1,0.689651,0.406072,0.689651,0.406072
2,text_desc,tfidf,1,0.46727,0.395392,0.46727,0.395392


## Check Predictions on Unseen Articles from CNN (not HuffPost our training data)

In [17]:
# https://www.cnn.com/2019/07/19/politics/george-nader-child-porn-sex-charges/index.html
X_features=vectorizer.transform(["George Aref Nader, who was a key witness in special counsel Robert Mueller's Russia investigation, faces new charges of transporting a minor with intent to engage in criminal sexual activity and child pornography"])
TopKPrediction(model, X_features, 2)

[['POLITICS', 'ENTERTAINMENT']]

In [18]:
# https://www.cnn.com/2019/07/18/entertainment/khloe-kardashian-true-thompson-video-trnd/index.html
X_features=vectorizer.transform(["True Thompson makes an adorable cameo in Khloe Kardashian's new makeup tutorial video"])
TopKPrediction(model, X_features, 2)

[['ENTERTAINMENT', 'STYLE']]

In [19]:
# https://www.cnn.com/2019/07/12/entertainment/heidi-klum-tom-kaulitz/
X_features=vectorizer.transform(["Heidi Klum is apparently the latest celeb to get married and not tell us"])
TopKPrediction(model, X_features, 2)

[['ENTERTAINMENT', 'POLITICS']]

In [20]:
# https://www.cnn.com/2019/07/19/investing/dow-stock-market-today/index.html
X_features=vectorizer.transform(["Stocks end lower as geopolitical fears rise. The Dow and US markets closed lower on Friday, as geopolitical worries overshadowed the hopes of interest rate cuts by the Federal Reserve."])
TopKPrediction(model, X_features, 2)

[['POLITICS', 'BUSINESS']]

In [21]:
# https://www.cnn.com/2019/07/19/health/astronaut-exercise-iv-faint-scn/index.html
X_features=vectorizer.transform(["Exercise in space keeps astronauts from fainting when they return to Earth, study says. "])
TopKPrediction(model, X_features, 2)

[['SCIENCE', 'HEALTHY LIVING']]

## Save Model

In [24]:
import os, pickle

pkl_dir = "./data/"

model_pkl=os.path.join(pkl_dir, "model_LR.pkl")

vectorizer_pkl = os.path.join(pkl_dir, "vectorizer_LR.pkl")

pickle.dump(model,open(model_pkl, 'wb'))
pickle.dump(vectorizer,open(vectorizer_pkl,'wb'))


## Use Loaded Model

In [25]:
model_loaded = pickle.load(open(model_pkl, 'rb'))
vectorizer_loaded = pickle.load(open(vectorizer_pkl, 'rb'))

X_features=vectorizer_loaded.transform(["President Trump AND THE impeachment story !!!"])
TopKPrediction(model_loaded, X_features, 2)


[['POLITICS', 'THE WORLDPOST']]

## Random Forrest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
def TrainModel(df, clf, field="text_desc", feature="binary", k=2):
    X_train, X_valid, X_test, y_train, y_valid, y_test, vectorizer = ExtractFeatures(df, field, feature)
    model   = clf.fit(X_train, y_train)
    
    preds_top_k = TopKPrediction(model, X_train, k)
    accuracy_train = ComputeAccuracy(y_train, preds_top_k)
    mrr_train = MRR(y_train, preds_top_k)
    
    preds_top_k = TopKPrediction(model, X_valid, k)
    accuracy_valid = ComputeAccuracy(y_valid, preds_top_k)
    mrr_valid = MRR(y_valid, preds_top_k)
    
    return model, vectorizer, [accuracy_train, accuracy_valid], [mrr_train, mrr_valid]


#clf = LogisticRegression(verbose=1, solver='liblinear',random_state=8848,max_iter=50)



In [30]:
field="text_desc_headline_url"
k=1;

clf = RandomForestClassifier(n_estimators=10)

model, vectorizer, acc, mrr = TrainModel(df, clf, field=field, feature=feature, k=k)

print (f"Training set: accuracy= {100*np.round(acc[0], 2)} %,  MRR={100*np.round(mrr[0], 2)} \
    Validation set: accuracy= {100*np.round(acc[1], 2)} %,  MRR={100*np.round(mrr[1], 2)} ")


Training set: accuracy= 87.0 %,  MRR=87.0     Validation set: accuracy= 33.0 %,  MRR=33.0 


## Future works
* xgboost
* May be Tensorflow 