<a href="https://colab.research.google.com/github/krisivarga/deep_learning_HW_big/blob/gym_mod/get_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
#download file from web

import requests, zipfile, io, os

zip_file_url = "http://groups.di.unipi.it/~gulli/newsspace200.xml.bz"

filename = zip_file_url.split("/")[-1]
with open(filename, "wb") as f:
    r = requests.get(zip_file_url)
    f.write(r.content)

In [24]:
#create data folder, decompress data

import bz2,shutil

dirName = 'data'
try:
    # Create target Directory
    os.mkdir(dirName)
    print("Directory " , dirName ,  " Created ") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")
    
    
with bz2.BZ2File("newsspace200.xml.bz") as fr, open("./data/input.xml","wb") as fw:
    shutil.copyfileobj(fr,fw)

Directory  data  already exists


In [25]:
import xml.etree.ElementTree as ET

tree = ET.parse("./data/input.xml")
root = tree.getroot()

titles = []
categories = []
descriptions = []
sources = []


for title in tree.findall('title'):
    titles.append(title.text)
    
for category in tree.findall('category'):
        categories.append(category.text)

for description in tree.findall('description'):
    descriptions.append(description.text)

for source in tree.findall('source'):
    sources.append(source.text)


In [26]:
import numpy as np

In [27]:
titles = np.array(titles).T
categories = np.array(categories).T
descriptions = np.array(descriptions).T
sources = np.array(sources).T

In [28]:
data = np.dstack((sources,titles,descriptions,categories)).reshape(len(titles),4)

In [29]:
data.shape

(496835, 4)

In [30]:
import pandas as pd

df = pd.DataFrame(data)
df.columns = ['source','title', 'desc', 'cat']

In [31]:
print(f"Total unique categories are: {len(df['cat'].value_counts())}")
print(f"Count of occurance of each category:")
df['cat'].value_counts()

Total unique categories are: 17
Count of occurance of each category:


World                                                                                                                                                                                       81456
Entertainment                                                                                                                                                                               70892
Sports                                                                                                                                                                                      62163
Business                                                                                                                                                                                    56656
Top Stories                                                                                                                                                                                 56045
Sci/Tech                      

In [32]:
selected_cats = df['cat'].value_counts()[:5].index.tolist()
print(selected_cats)

df_selected = df.loc[df['cat'].isin(selected_cats)]

print(df_selected)

['World', 'Entertainment', 'Sports', 'Business', 'Top Stories']
                       source  ...       cat
0              Yahoo Business  ...  Business
1              Yahoo Business  ...  Business
2              Yahoo Business  ...  Business
3              Yahoo Business  ...  Business
4              Yahoo Business  ...  Business
...                       ...  ...       ...
496829  New York Times sports  ...    Sports
496830         BBC News world  ...     World
496831  New York Times sports  ...    Sports
496832         BBC News world  ...     World
496833  New York Times sports  ...    Sports

[327212 rows x 4 columns]


In [33]:
print(f"Total unique categories are: {len(df_selected['cat'].value_counts())}")
print(f"Count of occurance of each category:")
df_selected['cat'].value_counts()

Total unique categories are: 5
Count of occurance of each category:


World            81456
Entertainment    70892
Sports           62163
Business         56656
Top Stories      56045
Name: cat, dtype: int64

In [34]:
df_selected.isnull().sum()

source       0
title        0
desc      2415
cat          0
dtype: int64

In [35]:
df_selected = df_selected.dropna()

In [36]:
df_selected.isnull().sum()

source    0
title     0
desc      0
cat       0
dtype: int64

In [37]:
# Check of spaces in column headline - using enumerate
spaces = []
for i, x in enumerate(df_selected['title']):
    if type(x) == str:
        if x.isspace():
            spaces.append(i)
        
print(len(spaces), 'spaces in index: ', spaces)

0 spaces in index:  []


In [38]:
# Check of spaces in column short desc - using itertuples
blanks = []  # start with an empty list

for i,sou,tit,desc,cat in df_selected.itertuples():  # iterate over the DataFrame
    if type(desc)==str:            # avoid NaN values
        if desc.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list
        
print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


Although it is part of the database, we are not going to use the 'source' field. When we print the value_counts for that field joined with the categories, it's obvious that the category can be found in the 'source' most of the time, which defeats the point of this homework.

In [39]:
print(df_selected['source'].value_counts())
print(df_selected.value_counts(subset=['source','cat'])[:30])
print(df_selected.value_counts(subset=['source','cat'])[100:130])

Reuters                 15935
Yahoo World             11193
RedNova general          7855
Reuters Business         6897
Yahoo Sports             6850
                        ...  
The Edge Daily              1
The Enquirer-Journal        1
GoDuke.com                  1
Invest Valley               1
Strategy Page               1
Name: source, Length: 3053, dtype: int64
source                      cat          
Yahoo World                 World            11193
RedNova general             World             7855
Reuters Business            Business          6897
Yahoo Sports                Sports            6850
Yahoo Entertainment         Entertainment     6741
Reuters World               World             6106
BBC News world              World             6038
Yahoo Politics              World             4586
New York Times sports       Sports            4361
Reuters                     Top Stories       3710
                            Business          3659
                          

Next, we define tokenization and vocabulary building. We're using the WordPiece based BERT tokenizer from huggingface.co. We also tried the SentencePie based XLNetTokenizer, but the results weren't as good as with the BERT tokenizer. The methods that are defined here will be applied to every line of the dataframe.  
  
We decided on one method, which tokenizes the title and the description and after that, it removes the tokens that contain non-alphabet characters. We do this because some descriptions contain obviously unwanted characters, like '\\' (and as we can later see from the vocabulary "39" exists more than 149000 times in the text). We also remove any token that is less than 3 characters. We build the vocabualry from these tokens.  
    
The other method uses the encode_plus function which automatically encodes the tokens too. In the function parameters we set the maximum length to 512, set the padding to extend to the max length, added special tokens and didn't take the attention mask. Depending on what modles we use, we may later need to change those settings (for example if we use BERT or XLNet models). We created a vocabulary based on the tokens too.  
  
The vocabularies can contain data multiple times, because we can see the number of appearance of tokens in the text that way if we use the value_counts function.

In [40]:
!pip install sentencepiece
!pip install transformers
from transformers import XLNetTokenizer
from transformers import BertTokenizer
import re

regex = re.compile('^[a-zA-Z]{3,}$')

xltokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

btokenizer = BertTokenizer.from_pretrained("bert-base-uncased")





In [79]:
def tr(bt,reg,title,sz):
  a = bt.tokenize(title + " " + sz)
  r = []
  for s in a:
    if reg.match(s) is not None:     
      r.append(s)      
  return r

df_selected['berttoken'] = df_selected.apply(lambda row: tr(btokenizer, regex, str(row['title']), str(row['desc'])), axis=1)
df_selected

KeyboardInterrupt: ignored

In [43]:
def bertenc(bt,voc,title,sz):
  text_to_encode = title + " " + sz
  a = bt.encode_plus(
            text_to_encode, 
            max_length= 512, 
            add_special_tokens=True,
            return_token_type_ids=False, 
            padding="max_length",
            truncation = True,
            return_attention_mask=False
        )
  ids = a['input_ids']  
  return ids

df_selected['bertencodeplus'] = df_selected.apply(lambda row: bertenc(btokenizer, str(row['title']), str(row['desc'])), axis=1)

df_selected.to_csv("df_selected_export.csv", index=False, sep="\t")
df_selected

Unnamed: 0,source,title,desc,cat,berttoken,bertencodeplus
0,Yahoo Business,Wall St. Pullback Reflects Tech Blowout (Reuters),"Reuters - Wall Street's long-playing drama,\""W...",Business,"[wall, pull, reflects, tech, blow, reuters, re...","[101, 2813, 2358, 1012, 4139, 5963, 11138, 662..."
1,Yahoo Business,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Business,"[wall, bears, claw, back, into, the, black, re...","[101, 2813, 2358, 1012, 6468, 15020, 2067, 204..."
2,Yahoo Business,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Business,"[carly, looks, toward, commercial, aerospace, ...","[101, 18431, 2571, 3504, 2646, 3293, 13395, 10..."
3,Yahoo Business,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Business,"[oil, and, economy, cloud, stocks, outlook, re...","[101, 3514, 1998, 4610, 6112, 15768, 1005, 176..."
4,Yahoo Business,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Business,"[iraq, halt, oil, exports, from, main, souther...","[101, 5712, 9190, 2015, 3514, 14338, 2013, 236..."
...,...,...,...,...,...,...
496829,New York Times sports,High on priority list: Home improvement,Doc Rivers knows any postseason plans hinge on...,Sports,"[high, priority, list, home, improvement, doc,...","[101, 2152, 2006, 9470, 2862, 1024, 2188, 7620..."
496830,BBC News world,Compromise seals climate meeting,A climate conference overcomes last-minute obj...,World,"[compromise, seals, climate, meeting, climate,...","[101, 12014, 13945, 4785, 3116, 1037, 4785, 30..."
496831,New York Times sports,Eisley enjoying his point of view,Howard Eisley has fond memories of Boston. He ...,Sports,"[enjoying, his, point, view, howard, has, fond...","[101, 1041, 2483, 3051, 9107, 2010, 2391, 1997..."
496832,BBC News world,Iraqi judges quiz 'Chemical Ali',Ali Hassan al-Majid - widely known as Chemical...,World,"[iraqi, judges, quiz, chemical, ali, ali, hass...","[101, 8956, 6794, 19461, 1005, 5072, 4862, 100..."


Here we can see the result of the encoded vocabulary with the token values.

In [56]:
bertencidvoc_dict = {}
bertvoc_dict = {}
def dict_creater(arr):
  for a in arr:
    if a != 0:
      if a in bertencidvoc_dict:
        count = bertencidvoc_dict[a] + 1
        bertencidvoc_dict.update({a:count})
      else:
        bertencidvoc_dict.update({a:1})
  return

def dict_creater2(arr):
  for a in arr:
    if a != 0:
      if a in bertvoc_dict:
        count = bertvoc_dict[a] + 1
        bertvoc_dict.update({a:count})
      else:
        bertvoc_dict.update({a:1})
  return

df_selected.apply(lambda row: dict_creater(row['bertencodeplus']), axis=1)

df_selected.apply(lambda row: dict_creater2(row['berttoken']), axis=1)

bertvoc_dict

{'wall': 3388,
 'pull': 1820,
 'reflects': 134,
 'tech': 1682,
 'blow': 1375,
 'reuters': 39445,
 'street': 3845,
 'long': 6772,
 'playing': 1529,
 'drama': 900,
 'waiting': 835,
 'for': 133895,
 'google': 2773,
 'about': 17594,
 'reach': 2367,
 'its': 43224,
 'final': 8332,
 'act': 1249,
 'but': 23801,
 'stock': 4308,
 'market': 6782,
 'debut': 1434,
 'ending': 1714,
 'more': 23353,
 'nostalgia': 30,
 'event': 1675,
 'than': 17414,
 'the': 570053,
 'catalyst': 27,
 'new': 50937,
 'era': 1154,
 'bears': 783,
 'claw': 42,
 'back': 12031,
 'into': 17641,
 'black': 2809,
 'short': 2838,
 'sellers': 76,
 'band': 1291,
 'ultra': 120,
 'are': 23633,
 'seeing': 321,
 'green': 2510,
 'again': 4971,
 'carly': 126,
 'looks': 1331,
 'toward': 2296,
 'commercial': 987,
 'aerospace': 174,
 'private': 1859,
 'investment': 1591,
 'firm': 3156,
 'group': 12423,
 'which': 11381,
 'has': 51516,
 'reputation': 369,
 'making': 3040,
 'well': 3024,
 'timed': 71,
 'and': 184938,
 'occasionally': 50,
 'contr

We encoded the categories with numbers, so they're easier to work with.

In [57]:
i = 0
category_dictionary = {}
cat_rev_dict= {}
for s in selected_cats:
  category_dictionary.update({i:s})
  cat_rev_dict.update({s:i})
  i = i + 1
print(category_dictionary)

df_selected['cat_enc'] = df_selected.apply(lambda row: cat_rev_dict[str(row['cat'])] , axis=1)
df_selected


{0: 'World', 1: 'Entertainment', 2: 'Sports', 3: 'Business', 4: 'Top Stories'}


Unnamed: 0,source,title,desc,cat,berttoken,bertencodeplus,cat_enc
0,Yahoo Business,Wall St. Pullback Reflects Tech Blowout (Reuters),"Reuters - Wall Street's long-playing drama,\""W...",Business,"[wall, pull, reflects, tech, blow, reuters, re...","[101, 2813, 2358, 1012, 4139, 5963, 11138, 662...",3
1,Yahoo Business,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Business,"[wall, bears, claw, back, into, the, black, re...","[101, 2813, 2358, 1012, 6468, 15020, 2067, 204...",3
2,Yahoo Business,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Business,"[carly, looks, toward, commercial, aerospace, ...","[101, 18431, 2571, 3504, 2646, 3293, 13395, 10...",3
3,Yahoo Business,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Business,"[oil, and, economy, cloud, stocks, outlook, re...","[101, 3514, 1998, 4610, 6112, 15768, 1005, 176...",3
4,Yahoo Business,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Business,"[iraq, halt, oil, exports, from, main, souther...","[101, 5712, 9190, 2015, 3514, 14338, 2013, 236...",3
...,...,...,...,...,...,...,...
496829,New York Times sports,High on priority list: Home improvement,Doc Rivers knows any postseason plans hinge on...,Sports,"[high, priority, list, home, improvement, doc,...","[101, 2152, 2006, 9470, 2862, 1024, 2188, 7620...",2
496830,BBC News world,Compromise seals climate meeting,A climate conference overcomes last-minute obj...,World,"[compromise, seals, climate, meeting, climate,...","[101, 12014, 13945, 4785, 3116, 1037, 4785, 30...",0
496831,New York Times sports,Eisley enjoying his point of view,Howard Eisley has fond memories of Boston. He ...,Sports,"[enjoying, his, point, view, howard, has, fond...","[101, 1041, 2483, 3051, 9107, 2010, 2391, 1997...",2
496832,BBC News world,Iraqi judges quiz 'Chemical Ali',Ali Hassan al-Majid - widely known as Chemical...,World,"[iraqi, judges, quiz, chemical, ali, ali, hass...","[101, 8956, 6794, 19461, 1005, 5072, 4862, 100...",0


In [67]:
# Since the goal of this exercise if to identify category based on headline and short description, 
# we choose to merge them, as the vectorizer functions can't process multiple columns

df_selected['joined'] = df_selected.apply(lambda row: " ".join(row['berttoken']) , axis=1)
X = df_selected['joined']
y = df_selected['cat_enc']

['wall', 'pull', 'reflects', 'tech', 'blow', 'reuters', 'reuters', 'wall', 'street', 'long', 'playing', 'drama', 'waiting', 'for', 'google', 'about', 'reach', 'its', 'final', 'act', 'but', 'its', 'stock', 'market', 'debut', 'ending', 'more', 'nostalgia', 'event', 'than', 'the', 'catalyst', 'for', 'new', 'era']


In [69]:
X.describe()
X.head()

0    wall pull reflects tech blow reuters reuters w...
1    wall bears claw back into the black reuters re...
2    carly looks toward commercial aerospace reuter...
3    oil and economy cloud stocks outlook reuters r...
4    iraq halt oil exports from main southern pipel...
Name: joined, dtype: object

In [70]:
from sklearn.model_selection import train_test_split

# Split the data into 70-30 i.e. test size of 30% to check the accuracy of the training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle = True)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=42, shuffle = True)

#Let's check the shape of the splitted data
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")

Training Data Shape: (194877,)
Testing Data Shape: (64960,)


In [71]:
from sklearn.feature_extraction.text import CountVectorizer
# Let's first try with Count Vectorizer from scikit learn
cv = CountVectorizer()

X_train_cv = cv.fit_transform(X_train)
X_train_cv.shape

(194877, 20014)

Doing a test training with a non-deep learning model, as a test

In [72]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_cv,y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [73]:
# Let's test it for the first 2 articles in the Test dataset
X_test1 = X_test[0:2]
print(X_test1)

470005    least killed philippines blast police investig...
268333    wine sparkle dull shares australian wine produ...
Name: joined, dtype: object


In [74]:
X_test1_cv = cv.transform(X_test1)
clf.predict(X_test1_cv)

array([1, 3])

In [75]:
# Transform the test data before predicting
X_test_cv = cv.transform(X_test)

In [76]:
# Form a prediction set
predictions = clf.predict(X_test_cv)

In [77]:
import sklearn.metrics as metrics
# Report the confusion matrix
print(metrics.confusion_matrix(y_test,predictions))
# Print a classification report
print(metrics.classification_report(y_test,predictions))
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

[[12532  1479   497   824  1024]
 [ 3710  4471  1495  1491  2917]
 [  255   628 10904   100   483]
 [  520   702   118  9092   466]
 [ 3515  3489  1414  1288  1546]]
              precision    recall  f1-score   support

           0       0.61      0.77      0.68     16356
           1       0.42      0.32      0.36     14084
           2       0.76      0.88      0.81     12370
           3       0.71      0.83      0.77     10898
           4       0.24      0.14      0.17     11252

    accuracy                           0.59     64960
   macro avg       0.55      0.59      0.56     64960
weighted avg       0.55      0.59      0.56     64960

0.5933651477832512


In [78]:
y_train.value_counts()

0    48831
1    42547
2    37328
4    33523
3    32648
Name: cat_enc, dtype: int64

In [None]:
#remove all files 
import shutil
#shutil.rmtree("data")
#os.remove('newsspace200.xml.bz')
shutil.rmtree('data')