In [1]:
#download file from web

import requests, zipfile, io, os

zip_file_url = "http://groups.di.unipi.it/~gulli/newsspace200.xml.bz"

filename = zip_file_url.split("/")[-1]
with open(filename, "wb") as f:
    r = requests.get(zip_file_url)
    f.write(r.content)

In [2]:
#create data folder, decompress data

import bz2,shutil

dirName = 'data'
try:
    # Create target Directory
    os.mkdir(dirName)
    print("Directory " , dirName ,  " Created ") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")
    
    
with bz2.BZ2File("newsspace200.xml.bz") as fr, open("./data/input.xml","wb") as fw:
    shutil.copyfileobj(fr,fw)

Directory  data  Created 


In [3]:
import xml.etree.ElementTree as ET

tree = ET.parse("./data/input.xml")
root = tree.getroot()

titles = []
categories = []
descriptions = []


for title in tree.findall('title'):
    titles.append(title.text)
    
for category in tree.findall('category'):
        categories.append(category.text)

for description in tree.findall('description'):
    descriptions.append(description.text)


In [4]:
import numpy as np

In [5]:
titles = np.array(titles).T
categories = np.array(categories).T
descriptions = np.array(descriptions).T

In [6]:
data = np.dstack((titles,descriptions,categories)).reshape(len(titles),3)

In [7]:
data.shape

(496835, 3)

In [10]:
import pandas as pd

df = pd.DataFrame(data)
df.columns = ['title', 'desc', 'cat']

In [11]:
print(f"Total unique categories are: {len(df['cat'].value_counts())}")
print(f"Count of occurance of each category:")
df['cat'].value_counts()

Total unique categories are: 17
Count of occurance of each category:


World                                                                                                                                                                                       81456
Entertainment                                                                                                                                                                               70892
Sports                                                                                                                                                                                      62163
Business                                                                                                                                                                                    56656
Top Stories                                                                                                                                                                                 56045
Sci/Tech                      

In [12]:
selected_cats= ['World','Entertainment','Sports','Business','Top Stories']

df_selected = df.loc[df['cat'].isin(selected_cats)]

df_selected

Unnamed: 0,title,desc,cat
0,Wall St. Pullback Reflects Tech Blowout (Reuters),"Reuters - Wall Street's long-playing drama,\""W...",Business
1,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Business
2,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Business
3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Business
4,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Business
...,...,...,...
496829,High on priority list: Home improvement,Doc Rivers knows any postseason plans hinge on...,Sports
496830,Compromise seals climate meeting,A climate conference overcomes last-minute obj...,World
496831,Eisley enjoying his point of view,Howard Eisley has fond memories of Boston. He ...,Sports
496832,Iraqi judges quiz 'Chemical Ali',Ali Hassan al-Majid - widely known as Chemical...,World


In [13]:
print(f"Total unique categories are: {len(df_selected['cat'].value_counts())}")
print(f"Count of occurance of each category:")
df_selected['cat'].value_counts()

Total unique categories are: 5
Count of occurance of each category:


World            81456
Entertainment    70892
Sports           62163
Business         56656
Top Stories      56045
Name: cat, dtype: int64

In [14]:
df_selected.isnull().sum()

title       0
desc     2415
cat         0
dtype: int64

In [15]:
df_selected = df_selected.dropna()

In [16]:
df_selected.isnull().sum()

title    0
desc     0
cat      0
dtype: int64

In [17]:
# Check of spaces in column headline - using enumerate
spaces = []
for i, x in enumerate(df_selected['title']):
    if type(x) == str:
        if x.isspace():
            spaces.append(i)
        
print(len(spaces), 'spaces in index: ', spaces)

0 spaces in index:  []


In [18]:
# Check of spaces in column short desc - using itertuples
blanks = []  # start with an empty list

for i,tit,desc,cat in df_selected.itertuples():  # iterate over the DataFrame
    if type(desc)==str:            # avoid NaN values
        if desc.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list
        
print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


In [19]:
# Since the goal of this exercise if to identify category based on headline and short description, 
# we choose to merge them, as the vectorizer functions can't process multiple columns
X = df_selected['title']+df_selected['desc']
y = df_selected['cat']

In [20]:
X.describe()

count                                                324797
unique                                               221234
top       ADV: Try Currency Trading Risk-Free 30 Days24-...
freq                                                     60
dtype: object

In [21]:
from sklearn.model_selection import train_test_split

# Split the data into 70-30 i.e. test size of 30% to check the accuracy of the training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle = True)

#Let's check the shape of the splitted data
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")

Training Data Shape: (227357,)
Testing Data Shape: (97440,)


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
# Let's first try with Count Vectorizer from scikit learn
cv = CountVectorizer()

X_train_cv = cv.fit_transform(X_train)
X_train_cv.shape

(227357, 160531)

Doing a test training with a non-deep learning model, as a test

In [23]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_cv,y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [24]:
# Let's test it for the first 2 articles in the Test dataset
X_test1 = X_test[0:2]
print(X_test1)

470005    At Least 13 Killed in Philippines BlastPolice ...
268333    Wine sparkles in dull bourseSHARES in Australi...
dtype: object


In [25]:
X_test1_cv = cv.transform(X_test1)
clf.predict(X_test1_cv)

array(['Entertainment', 'Business'], dtype=object)

In [26]:
# Transform the test data before predicting
X_test_cv = cv.transform(X_test)

In [27]:
# Form a prediction set
predictions = clf.predict(X_test_cv)

In [30]:
import sklearn.metrics as metrics
# Report the confusion matrix
print(metrics.confusion_matrix(y_test,predictions))
# Print a classification report
print(metrics.classification_report(y_test,predictions))
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

[[12922  1407   113  1178   827]
 [ 1661  5585  1838  8191  3852]
 [  137  1398 15316  1321   372]
 [ 1465  8344  1769  1679  3570]
 [  849  2891   479  2820 17456]]
               precision    recall  f1-score   support

     Business       0.76      0.79      0.77     16447
Entertainment       0.28      0.26      0.27     21127
       Sports       0.78      0.83      0.80     18544
  Top Stories       0.11      0.10      0.10     16827
        World       0.67      0.71      0.69     24495

     accuracy                           0.54     97440
    macro avg       0.52      0.54      0.53     97440
 weighted avg       0.53      0.54      0.53     97440

0.5434934318555008


In [37]:
y_train.value_counts()

World            56804
Entertainment    49744
Sports           43607
Top Stories      39217
Business         37985
Name: cat, dtype: int64

In [66]:
#remove all files 
import shutil
#shutil.rmtree("data")
#os.remove('newsspace200.xml.bz')
shutil.rmtree('data')