In [1]:
# import libs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


In [2]:
# Each entry is (section name, path to its tab-separated dump, integer class label).
sections = [('arts', 'Ch4/Arts.csv', 0),
            ('business', 'Ch4/Business.csv', 1),
            ('obituaries', 'Ch4/Obituaries.csv', 2),
            ('sports', 'Ch4/Sports.csv', 3),
            ('world', 'Ch4/World.csv', 4)]

# Load every section into a module-level DataFrame named after the section
# (arts, business, ...); later cells refer to those names directly.
for section, file, encoded_section in sections:
    # Explicit encoding: open()'s default is locale-dependent, so the same
    # notebook could decode the files differently (or fail) on another machine.
    # NOTE(review): assumes the dumps are UTF-8 -- confirm against the scraper.
    with open(file, encoding='utf-8') as f:
        # Every line is split on the first two tabs into url / title / body;
        # maxsplit=2 keeps any further tabs inside the body field.
        content = [line.split(sep='\t', maxsplit=2) for line in f]
    section_df = pd.DataFrame(content, columns=['url', 'title', 'body'])
    # Same integer label for every row of this section.
    section_df['target'] = encoded_section
    globals()[section] = section_df
arts.describe()


Unnamed: 0,target
count,1000.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


In [3]:
# Strip HTML markup from the title and body columns.
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop words from NLTK, materialised as a fresh list.
# NOTE(review): not referenced by any cell visible in this section.
stop_words = [word for word in stopwords.words('english')]

def get_text(text):
    """Return the text content of an HTML string.

    The markup is parsed with BeautifulSoup's ``lxml`` parser, every
    ``<script>`` and ``<style>`` element is removed from the tree, and the
    remaining text is returned as a single string. No tokenization or
    stop-word filtering is performed here.
    """
    document = BeautifulSoup(text, 'lxml')

    # Drop script/style elements so their source does not end up in the text.
    for element in document.find_all(["script", "style"]):
        element.decompose()

    return document.get_text()

# Extract plain text from the title and body of every section's DataFrame.
# Series.apply on the single column yields the same per-row values as
# DataFrame.apply(axis=1) with a row lambda, without building a row Series
# for every record.
for t, s, target in sections:
    section_df = globals()[t]  # DataFrame registered under the section's name
    section_df['title_parsed'] = section_df['title'].apply(get_text)
    section_df['body_parsed'] = section_df['body'].apply(get_text)

    

In [4]:
# Only the last expression of a cell is rendered, so a bare `sports.describe()`
# placed before this line was computed and thrown away. Preview the first rows;
# run describe() as the final expression of its own cell if the summary is needed.
sports.head(5)

Unnamed: 0,url,title,body,target,title_parsed,body_parsed
0,https://www.nytimes.com/2018/04/26/arts/design...,10 Galleries to Visit Now on the Upper East Side,<!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...,3,10 Galleries to Visit Now on the Upper East Side,10 Galleries to Visit Now on the Upper East ...
1,https://www.nytimes.com/aponline/2018/04/26/sp...,"The Latest: Lions, Bengals Each Draft Centers",<!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...,3,"The Latest: Lions, Bengals Each Draft Centers",The Latest: Packers and Cowboys Add Defensiv...
2,https://www.nytimes.com/2018/04/26/sports/nfl-...,NFL Draft 2018 Live: Round 1 Pick-by-Pick Updates,<!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...,3,NFL Draft 2018 Live: Round 1 Pick-by-Pick Updates,NFL Draft 2018 Live: Round 1 Pick-by-Pick Up...
3,https://www.nytimes.com/2018/04/26/nyregion/la...,"The Lawyer at the Side of de Blasio, Cuomo and...",<!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...,3,"The Lawyer at the Side of de Blasio, Cuomo and...","The Lawyer at the Side of de Blasio, Cuomo a..."
4,https://www.nytimes.com/aponline/2018/04/26/sp...,"Game 6s on Tap: LeBron, Raptors, Jazz Look to ...",<!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...,3,"Game 6s on Tap: LeBron, Raptors, Jazz Look to ...","Game 6s on Tap: LeBron, Raptors, Jazz Look t..."


In [6]:
# test-train split
from sklearn.model_selection import train_test_split

# Split every section 50/50 and publish the halves as module-level frames
# named <section>_train / <section>_test (e.g. sports_train, business_test).
for df, file, target in sections:  # `df` is the section *name*, not a frame
    # A fixed random_state makes the split -- and therefore every score
    # computed from it -- reproducible across Restart & Run All.
    # 42 matches the seed already used for the SGDClassifier below.
    train, test = train_test_split(globals()[df], test_size=0.5, random_state=42)
    globals()[df + "_train"] = train
    globals()[df + "_test"] = test

print(sports_train.head(2))
print(business_test.head(2))

                                                   url  \
336  https://www.nytimes.com/2018/04/13/opinion/dep...   
112  https://www.nytimes.com/aponline/2018/04/23/sp...   

                                              title  \
336              How Sky Diving Cured My Depression   
112  White Sox Beat Mariners 10-4, Stop 7-Game Skid   

                                                  body  target  \
336  <!DOCTYPE html><html lang="en" itemId="https:/...       3   
112  <!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...       3   

                                       title_parsed  \
336              How Sky Diving Cured My Depression   
112  White Sox Beat Mariners 10-4, Stop 7-Game Skid   

                                           body_parsed  
336        Opinion | How Sky Diving Cured My Depres...  
112    White Sox Beat Mariners 10-4, Stop 7-Game Sk...  
                                                   url  \
90   https://www.nytimes.com/reuters/2018/04/25/bus...   
667  ht

In [7]:
# Stack the per-section halves into one training frame and one test frame.
# Rows keep the index labels they had in their section frames.
train = pd.concat(
    objs=(arts_train, business_train, obituaries_train, sports_train, world_train),
    axis=0,
)
test = pd.concat(
    objs=(arts_test, business_test, obituaries_test, sports_test, world_test),
    axis=0,
)


In [11]:
# Baseline: token counts -> TF-IDF weighting -> multinomial naive Bayes,
# trained on the parsed article bodies.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(train['body_parsed'], train['target'])

predicted = text_clf.predict(test['body_parsed'])

# Accuracy: fraction of test rows whose predicted label equals the true label.
(predicted == test['target']).mean()

0.65039999999999998

In [14]:
# try with SVM
from sklearn.linear_model import SGDClassifier
# Same vectorisation as the naive Bayes run, but with a hinge-loss,
# L2-regularised SGD classifier as the final step.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        max_iter=5,
        tol=None,
        random_state=42,
    )),
])
text_clf.fit(train['body_parsed'], train['target'])

predicted = text_clf.predict(test['body_parsed'])

# Accuracy: fraction of test rows whose predicted label equals the true label.
(predicted == test['target']).mean()

0.69399999999999995

In [16]:
# print metrics
from sklearn import metrics

# Per-class precision/recall/F1 for the most recent `predicted` labels.
# The display names are listed in the order of the integer targets 0..4
# assigned when the sections were loaded.
print(metrics.classification_report(
    y_true=test['target'],
    y_pred=predicted,
    target_names=['Arts', 'Business', 'Obituaries', 'Sports', 'World'],
))

# Confusion matrix for the same true and predicted labels.
metrics.confusion_matrix(y_true=test['target'], y_pred=predicted)

             precision    recall  f1-score   support

       Arts       0.61      0.80      0.69       500
   Business       0.66      0.72      0.69       500
 Obituaries       0.86      0.88      0.87       500
     Sports       0.67      0.63      0.65       500
      World       0.68      0.44      0.53       500

avg / total       0.70      0.69      0.69      2500



array([[399,  11,  36,  40,  14],
       [ 48, 362,   5,  36,  49],
       [ 38,   1, 439,  17,   5],
       [ 77,  52,  21, 317,  33],
       [ 87, 120,  12,  63, 218]])