# NLP ML-Project

## Authors

In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import string


# Read the news-aggregator CSV (path is relative to the notebook's working directory).
news = pd.read_csv(filepath_or_buffer="./data/uci-news-aggregator.csv")

2022-12-02 08:09:34.690046: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set


In [2]:
# Preview the first five raw rows to check the columns that were loaded.
news.head(n=5)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [3]:
# Translation table mapping every ASCII punctuation character to a space.
# Needed in addition to the \W regexes because "_" counts as a word character.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def normalize_text(s):
    """Normalize a headline into lowercase, space-separated word tokens.

    Steps, in order: lowercase; blank out http(s) URLs; drop a non-word
    character that touches whitespace; blank out digit runs and stand-alone
    single letters a-z; turn every ``string.punctuation`` character into a
    space; collapse whitespace runs and strip both ends.

    Parameters
    ----------
    s : str
        Raw text. Any non-string value (e.g. ``None`` or the float NaN that
        pandas uses for an empty CSV cell) is treated as missing text.

    Returns
    -------
    str
        The normalized text; empty when nothing survives the cleaning or
        when ``s`` is not a string.
    """
    # Missing titles arrive as NaN/None; return empty text instead of raising.
    if not isinstance(s, str):
        return ""

    s = s.lower()

    # Raw strings keep the regex escapes (\s, \W, \/ ...) out of Python's own
    # string-escape handling, which warns on them in recent versions.
    s = re.sub(r'(https?:\/\/)(\s)?(www\.)?(\s?)(\w+\.)*([\w\-\s]+\/)*([\w-]+)\/?', ' ', s)
    s = re.sub(r'\s\W', ' ', s)      # whitespace followed by a non-word char
    s = re.sub(r'\W\s', ' ', s)      # non-word char followed by whitespace
    s = re.sub(r"[0-9]+", " ", s)    # digit runs
    s = re.sub(r"\b[a-z]\b", " ", s) # stand-alone single letters (e.g. the "s" left by "'s")

    # Remaining punctuation (including "_") becomes a space in one pass.
    s = s.translate(_PUNCT_TO_SPACE)

    # Collapse the gaps introduced above and trim the ends.
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

# Overwrite the TITLE column with the cleaned version of every headline.
news['TITLE'] = list(map(normalize_text, news['TITLE']))


In [4]:


# Character count of each title, then the shortest / mean / longest length.
lens = list(map(len, news['TITLE']))
print(np.min(lens), np.mean(lens), np.max(lens))



0 53.148265111181075 14282


In [5]:
# Model inputs: the title text is the feature, the category is the target.
x = news['TITLE']

# Fit the label encoder on CATEGORY and keep the encoded values as the target vector.
encoder = LabelEncoder()
y = encoder.fit_transform(news['CATEGORY'])

In [6]:


# Preview the first five rows again to inspect the current TITLE values.
news.head(n=5)



Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,fed official says weak data caused by weather ...,http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,fed charles plosser sees high bar for change i...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,us open stocks fall after fed official hints a...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,fed risks falling behind the curve charles plo...,http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,fed plosser nasty weather has curbed job growth,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027



## Data splitting


In [7]:
# split into train and test sets
# First cut: hold out 20% of all rows as the final test set, stratified on the
# encoded labels; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, 
    y, 
    test_size=0.2, 
    shuffle=True, 
    stratify=y, 
    random_state=42)

# Second cut: take 20% of the remaining training rows as a validation set,
# again stratified, so the test set is never used for model selection.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, 
    y_train, 
    test_size=0.2, 
    shuffle=True, 
    stratify=y_train, 
    random_state=42)

# take a look at the shape of each of these
print("training size:", x_train.shape[0])
print("validation size:", x_val.shape[0])
print("testing size:", x_test.shape[0])

trainining size: 270348
validation size: 67587
testing size: 84484



## Model construction and validation


In [8]:
# Text featurizer: n-gram counts re-weighted by TF-IDF.
#   min_df=3             -> ignore terms that occur in fewer than 3 training titles
#   binary=False         -> keep raw term counts rather than presence flags
#   ngram_range=(1,3)    -> unigrams, bigrams and trigrams
#   stop_words='english' -> drop scikit-learn's built-in English stop words
vectorizer = Pipeline([
    ('count', 
     CountVectorizer(
         min_df=3, 
         binary=False, 
         ngram_range=(1,3), 
         stop_words='english')),
    ('tfid', TfidfTransformer())])

# Fit on the training split only and vectorize it in the same pass
# (fit_transform avoids tokenizing the training titles a second time);
# the validation split is transformed with the already-fitted pipeline.
x_train_vec = vectorizer.fit_transform(x_train)
x_val_vec = vectorizer.transform(x_val)

In [9]:
# Train multinomial naive Bayes on the vectorized training titles, then report
# the fraction of validation titles whose predicted label matches the truth.
nb = MultinomialNB(alpha=0.1)
nb.fit(x_train_vec, y_train)
val_pred = nb.predict(x_val_vec)
print('validation accuracy:', np.sum(val_pred == y_val) / len(y_val))

validation accuracy: 0.9468240933907408



## Final test


In [10]:
# Vectorize the held-out test titles with the fitted pipeline and report the
# fraction the trained classifier labels correctly.
x_test_vec = vectorizer.transform(x_test)
test_pred = nb.predict(x_test_vec)
print('test accuracy:', np.sum(test_pred == y_test) / len(y_test))

test accuracy: 0.9465816012499408
