<a href="https://colab.research.google.com/github/krisivarga/deep_learning_HW_big/blob/Kristof_solve/Copy_of_get_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

As the first task, we download the dataset from <http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html>. We chose the XML format, as it seems to be better structured and has fewer invalid rows.

In [None]:
#download file from web

import requests, zipfile, io, os

zip_file_url = "http://groups.di.unipi.it/~gulli/newsspace200.xml.bz"

# The archive is stored locally under the name it has on the server
# (the last path segment of the URL).
filename = zip_file_url.split("/")[-1]

# Issue the request first and fail loudly on an HTTP error, so that an error
# page (or an empty file) is never saved under the archive's name. The timeout
# keeps the cell from hanging forever, and streaming avoids holding the whole
# archive in memory.
with requests.get(zip_file_url, stream=True, timeout=60) as r:
    r.raise_for_status()
    with open(filename, "wb") as f:
        for chunk in r.iter_content(chunk_size=1 << 20):
            f.write(chunk)

In [None]:
#create data folder, decompress data

import bz2,shutil

dirName = 'data'

# Make sure the output folder is there; an already existing folder is only
# reported, it is not an error.
try:
    os.mkdir(dirName)
except FileExistsError:
    print("Directory " , dirName ,  " already exists")
else:
    print("Directory " , dirName ,  " Created ")

# Stream-decompress the downloaded bz2 archive into ./data/input.xml.
with bz2.open("newsspace200.xml.bz", "rb") as fr:
    with open("./data/input.xml","wb") as fw:
        shutil.copyfileobj(fr,fw)

Directory  data  already exists


In [None]:
import xml.etree.ElementTree as ET

tree = ET.parse("./data/input.xml")
root = tree.getroot()

# Collect the text of every element of each kind, in document order.
# `findall` with a bare tag name is evaluated relative to the root element.
titles = [node.text for node in tree.findall('title')]
categories = [node.text for node in tree.findall('category')]
descriptions = [node.text for node in tree.findall('description')]
sources = [node.text for node in tree.findall('source')]


In [None]:
import numpy as np

In [None]:
# Turn each collected list into a NumPy array (the transpose is kept from the
# original code).
titles, categories, descriptions, sources = (
    np.array(column).T
    for column in (titles, categories, descriptions, sources)
)

In [None]:
# Stack the four field arrays depth-wise, then flatten the result into one row
# per article with the columns (source, title, description, category).
stacked_fields = np.dstack((sources, titles, descriptions, categories))
data = stacked_fields.reshape(len(titles), 4)

In [None]:
# Sanity check: expect (number of articles, 4), one column per extracted field.
data.shape

(496835, 4)

In [None]:
import pandas as pd

# Wrap the stacked array in a DataFrame with a readable name per column.
df = pd.DataFrame(data, columns=['source','title', 'desc', 'cat'])

In [None]:
# Count the articles per category once and reuse the result for both the
# summary line and the displayed table.
cat_counts = df['cat'].value_counts()
print(f"Total unique categories are: {len(cat_counts)}")
print(f"Count of occurance of each category:")
cat_counts

Total unique categories are: 17
Count of occurance of each category:


World                                                                                                                                                                                       81456
Entertainment                                                                                                                                                                               70892
Sports                                                                                                                                                                                      62163
Business                                                                                                                                                                                    56656
Top Stories                                                                                                                                                                                 56045
Sci/Tech                      

In [None]:
# Keep only the five most frequent categories.
top_counts = df['cat'].value_counts().iloc[:5]
selected_cats = top_counts.index.tolist()
print(selected_cats)

# Restrict the data to the articles that belong to one of those categories.
keep_mask = df['cat'].isin(selected_cats)
df_selected = df.loc[keep_mask]

print(df_selected)

['World', 'Entertainment', 'Sports', 'Business', 'Top Stories']
                       source  ...       cat
0              Yahoo Business  ...  Business
1              Yahoo Business  ...  Business
2              Yahoo Business  ...  Business
3              Yahoo Business  ...  Business
4              Yahoo Business  ...  Business
...                       ...  ...       ...
496829  New York Times sports  ...    Sports
496830         BBC News world  ...     World
496831  New York Times sports  ...    Sports
496832         BBC News world  ...     World
496833  New York Times sports  ...    Sports

[327212 rows x 4 columns]


In [None]:
# Same summary as before, now on the filtered data.
selected_cat_counts = df_selected['cat'].value_counts()
print(f"Total unique categories are: {len(selected_cat_counts)}")
print(f"Count of occurance of each category:")
selected_cat_counts

Total unique categories are: 5
Count of occurance of each category:


World            81456
Entertainment    70892
Sports           62163
Business         56656
Top Stories      56045
Name: cat, dtype: int64

In [None]:
# Number of missing values per column (`isna` is the same check as `isnull`).
df_selected.isna().sum()

source       0
title        0
desc      2415
cat          0
dtype: int64

In [None]:
# Drop the rows that have a missing value (per the check above, only 'desc' has any).
df_selected = df_selected.dropna()

In [None]:
# Re-check the number of missing values per column after the cleanup.
df_selected.isna().sum()

source    0
title     0
desc      0
cat       0
dtype: int64

In [None]:
# Check for whitespace-only titles.
# The DataFrame index label of every hit is collected (not the running
# position): after filtering and dropna the two no longer coincide, and the
# labels are what the message below reports and what the 'desc' check uses.
spaces = [
    idx
    for idx, x in df_selected['title'].items()
    if isinstance(x, str) and x.isspace()
]

print(len(spaces), 'spaces in index: ', spaces)

0 spaces in index:  []


In [None]:
# Check for whitespace-only descriptions - using itertuples.
# Each tuple is (index label, source, title, desc, cat); non-string
# descriptions (e.g. NaN) are skipped and the index label of every
# whitespace-only description is kept.
blanks = [
    idx
    for idx, _source, _title, text, _cat in df_selected.itertuples()
    if type(text) == str and text.isspace()
]

print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


Although it is part of the database, we are not going to use the 'source' field. When we print the value_counts for that field joined with the categories, it's obvious that the category can be found in the 'source' most of the time, which defeats the point of this homework.

In [None]:
# How often each source occurs, and how often each (source, category) pair occurs.
source_counts = df_selected['source'].value_counts()
pair_counts = df_selected.value_counts(subset=['source','cat'])

print(source_counts)
# Two windows of the pair counts: the 30 most frequent pairs and ranks 100-129.
print(pair_counts[:30])
print(pair_counts[100:130])

Reuters                        15935
Yahoo World                    11193
RedNova general                 7855
Reuters Business                6897
Yahoo Sports                    6850
                               ...  
Mt. Vernon Register News           1
SuperMizzou                        1
Rome News-Tribune                  1
The Hammer                         1
Internet Telephony Magazine        1
Name: source, Length: 3053, dtype: int64
source                      cat          
Yahoo World                 World            11193
RedNova general             World             7855
Reuters Business            Business          6897
Yahoo Sports                Sports            6850
Yahoo Entertainment         Entertainment     6741
Reuters World               World             6106
BBC News world              World             6038
Yahoo Politics              World             4586
New York Times sports       Sports            4361
Reuters                     Top Stories       3710


Next, we define tokenization and vocabulary building. We're using the WordPiece-based BERT tokenizer from huggingface.co. We also tried the SentencePiece-based XLNetTokenizer, but the results weren't as good as with the BERT tokenizer. The methods that are defined here will be applied to every row of the dataframe.  
We decided on one method which tokenizes the title and the description and then removes the tokens that contain non-alphanumeric characters (except padding (# in BERT) or a few other characters). We do this because some descriptions contain obviously unwanted characters, like '\\'. We build the vocabulary from these tokens.  
The other method uses the encode_plus function, which automatically adds the special tokens, truncates or pads the input to 512 tokens, and returns the token ids.


In [None]:
!pip install sentencepiece
!pip install transformers
from transformers import XLNetTokenizer
from transformers import BertTokenizer
import re

# Token filters: a token is kept only if the whole token matches.
# BERT: letters and double quotes only.
bertregex = re.compile(r'^[a-zA-Z"]+$')
# XLNet: letters, digits, underscore, hyphen and both quote characters.
# Raw string: '\_' and '\-' are invalid escape sequences in a normal string
# literal and trigger a warning; the set of matched characters is unchanged.
xlregex = re.compile(r'^[a-zA-Z0-9_\-\'"]+$')

# Pretrained XLNet tokenizer ("xlnet-base-cased" checkpoint); loading it may download files.
xltokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

# Pretrained BERT tokenizer ("bert-base-uncased" checkpoint); this is the one used below.
btokenizer = BertTokenizer.from_pretrained("bert-base-uncased")





In [None]:
# Running collections filled as a side effect of `tr`:
# bertvoc   - every token that passed the regex filter (with repetitions)
# bertidvoc - the vocabulary id of every produced token (with repetitions)
bertvoc = []
bertidvoc= []

def tr(bt,reg,voc,idvoc,title,sz):
  """Tokenize one article and keep only the tokens accepted by `reg`.

  Args:
    bt: tokenizer providing `tokenize` and `convert_tokens_to_ids`.
    reg: compiled regex; a token is kept when `reg.match(token)` succeeds.
    voc: list that is extended in place with the kept tokens.
    idvoc: list that is extended in place with the ids of all produced tokens.
    title: article title (str).
    sz: article description (str).

  Returns:
    The list of kept tokens, in their original order.
  """
  a = bt.tokenize(title + " " + sz)
  # Previously `idvoc` was accepted but never written, which left `bertidvoc`
  # empty for the id/word frequency table built later in the notebook.
  # NOTE(review): the unfiltered ids are recorded because the saved output of
  # that table contains punctuation ids - confirm this is the intended content.
  idvoc.extend(bt.convert_tokens_to_ids(a))
  r = [s for s in a if reg.match(s) is not None]
  voc.extend(r)
  return r

# Tokenize every article (title plus description) row by row; the filtered
# token list of each row becomes the new 'bertencode' column.
df_selected['bertencode'] = [
    tr(btokenizer, bertregex, bertvoc, bertidvoc, str(row_title), str(row_desc))
    for row_title, row_desc in zip(df_selected['title'], df_selected['desc'])
]
print(df_selected)

                       source  ...                                         bertencode
0              Yahoo Business  ...  [wall, st, pull, reflects, tech, blow, reuters...
1              Yahoo Business  ...  [wall, st, bears, claw, back, into, the, black...
2              Yahoo Business  ...  [carly, looks, toward, commercial, aerospace, ...
3              Yahoo Business  ...  [oil, and, economy, cloud, stocks, outlook, re...
4              Yahoo Business  ...  [iraq, halt, oil, exports, from, main, souther...
...                       ...  ...                                                ...
496829  New York Times sports  ...  [high, on, priority, list, home, improvement, ...
496830         BBC News world  ...  [compromise, seals, climate, meeting, a, clima...
496831  New York Times sports  ...  [e, enjoying, his, point, of, view, howard, e,...
496832         BBC News world  ...  [iraqi, judges, quiz, chemical, ali, ali, hass...
496833  New York Times sports  ...  [nets, get, carter

In [None]:
# Running list of every id produced by `bertenc` (padding ids included).
bertencidvoc = []
def bertenc(bt,voc,title,sz):
  """Encode one article into a fixed-length list of token ids.

  The title and description are joined with a single space and passed to
  `bt.encode_plus`, asking for special tokens, truncation and padding to
  512 ids, and for neither token type ids nor an attention mask.

  Args:
    bt: tokenizer providing `encode_plus`.
    voc: list that is extended in place with the returned ids.
    title: article title (str).
    sz: article description (str).

  Returns:
    The 'input_ids' entry of the tokenizer result.
  """
  encode_options = dict(
      max_length=512,
      add_special_tokens=True,
      return_token_type_ids=False,
      padding="max_length",
      truncation=True,
      return_attention_mask=False,
  )
  encoded = bt.encode_plus(title + " " + sz, **encode_options)
  token_ids = encoded['input_ids']
  voc.extend(token_ids)
  return token_ids

# Encode every article (title plus description) into its padded id sequence;
# the result of each row becomes the new 'bertencodeplus' column.
df_selected['bertencodeplus'] = [
    bertenc(btokenizer, bertencidvoc, str(row_title), str(row_desc))
    for row_title, row_desc in zip(df_selected['title'], df_selected['desc'])
]

# The XLNet variant of the token filter is disabled:
#df_selected['xl'] = df_selected.apply(lambda row: tr(xltokenizer, xlregex,xlvoc, str(row['title']), str(row['desc'])), axis=1)
df_selected

Unnamed: 0,source,title,desc,cat,bertencode,bertencodeplus
0,Yahoo Business,Wall St. Pullback Reflects Tech Blowout (Reuters),"Reuters - Wall Street's long-playing drama,\""W...",Business,"[wall, st, pull, reflects, tech, blow, reuters...","[101, 2813, 2358, 1012, 4139, 5963, 11138, 662..."
1,Yahoo Business,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Business,"[wall, st, bears, claw, back, into, the, black...","[101, 2813, 2358, 1012, 6468, 15020, 2067, 204..."
2,Yahoo Business,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Business,"[carly, looks, toward, commercial, aerospace, ...","[101, 18431, 2571, 3504, 2646, 3293, 13395, 10..."
3,Yahoo Business,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Business,"[oil, and, economy, cloud, stocks, outlook, re...","[101, 3514, 1998, 4610, 6112, 15768, 1005, 176..."
4,Yahoo Business,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Business,"[iraq, halt, oil, exports, from, main, souther...","[101, 5712, 9190, 2015, 3514, 14338, 2013, 236..."
...,...,...,...,...,...,...
496829,New York Times sports,High on priority list: Home improvement,Doc Rivers knows any postseason plans hinge on...,Sports,"[high, on, priority, list, home, improvement, ...","[101, 2152, 2006, 9470, 2862, 1024, 2188, 7620..."
496830,BBC News world,Compromise seals climate meeting,A climate conference overcomes last-minute obj...,World,"[compromise, seals, climate, meeting, a, clima...","[101, 12014, 13945, 4785, 3116, 1037, 4785, 30..."
496831,New York Times sports,Eisley enjoying his point of view,Howard Eisley has fond memories of Boston. He ...,Sports,"[e, enjoying, his, point, of, view, howard, e,...","[101, 1041, 2483, 3051, 9107, 2010, 2391, 1997..."
496832,BBC News world,Iraqi judges quiz 'Chemical Ali',Ali Hassan al-Majid - widely known as Chemical...,World,"[iraqi, judges, quiz, chemical, ali, ali, hass...","[101, 8956, 6794, 19461, 1005, 5072, 4862, 100..."


In [None]:
# Look up the encoding stored for the row labelled 0 and report its Python type.
first_encoding = df_selected.at[0, 'bertencodeplus']
print(type(first_encoding))

<class 'torch.Tensor'>


In [None]:
# Build a one-column frame holding every collected token id.
# NOTE(review): this needs `bertidvoc` to be non-empty - verify that the
# tokenization step actually fills it before this cell runs; with an empty
# list the column assignment below fails.
wdf = pd.DataFrame(bertidvoc)
wdf.columns = ['id']
print(wdf)
# Map each id back to its token text via the BERT tokenizer (row by row).
wdf['word'] = wdf.apply(lambda row: btokenizer.convert_ids_to_tokens(row['id'].item()) , axis=1)


#print(df_selected.at[0,'bertregex'])
# The 30 most frequent (id, word) pairs.
print(wdf.value_counts()[:30])

             id
0          2813
1          2358
2          1012
3          4139
4          5963
...         ...
16120608   2461
16120609   4433
16120610  11214
16120611   7483
16120612   1012

[16120613 rows x 1 columns]
id    word
1996  the     570053
1012  .       556314
1010  ,       447633
1011  -       342588
2000  to      314317
1037  a       311593
1999  in      288246
1997  of      279441
1025  ;       199943
1998  and     184938
1055  s       167212
2006  on      155590
1001  #       152843
4464  39      149113
2005  for     133895
1006  (        88343
1007  )        87552
2015  ##s      87493
2004  as       74364
2012  at       74136
2008  that     73026
2007  with     71367
1005  '        70370
2003  is       55944
2011  by       55901
2056  said     54886
2010  his      53739
1024  :        53407
2038  has      51516
2149  us       51281
dtype: int64


In [None]:
# Number the selected categories 0..n-1, in order of decreasing frequency,
# as a list of (number, category name) tuples.
category_dictionary = list(enumerate(selected_cats))
print(category_dictionary)

[(0, 'World'), (1, 'Entertainment'), (2, 'Sports'), (3, 'Business'), (4, 'Top Stories')]


In [None]:
# Since the goal of this exercise is to identify the category based on the headline and
# the short description, we merge the two, as the vectorizer functions can't process
# multiple columns.
# A space is put between title and description; without it the last word of the title and
# the first word of the description fuse into one bogus token (e.g. "BlastPolice"). This
# also matches how the tokenizer helpers join the two fields.
X = df_selected['title'] + " " + df_selected['desc']
y = df_selected['cat']

In [None]:
# Summary of the merged text column; a 'unique' value below 'count' means duplicated articles.
X.describe()

count                                                324797
unique                                               221234
top       ADV: Try Currency Trading Risk-Free 30 Days24-...
freq                                                     60
dtype: object

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data as the test set ...
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle = True)

# ... then take 25% of the remaining 80% (i.e. 20% of the total) as the validation set,
# which leaves 60% of the data for training.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=42, shuffle = True)

# Let's check the shape of the split data
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")

Training Data Shape: (227357,)
Testing Data Shape: (97440,)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Baseline features: let's first try the CountVectorizer from scikit-learn
cv = CountVectorizer()

# Fit the vectorizer on the training texts only and transform them in the same step;
# the test texts are later transformed with this already-fitted vectorizer.
X_train_cv = cv.fit_transform(X_train)
X_train_cv.shape

(227357, 160531)

As a sanity check, we first train a non-deep-learning baseline model.

In [None]:
from sklearn.svm import LinearSVC
# Linear SVM baseline on the bag-of-words features.
# The seed is fixed so that re-running the notebook reproduces the same model;
# 42 is the seed already used for the train/test splits.
clf = LinearSVC(random_state=42)
clf.fit(X_train_cv,y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
# Try the model on the first two articles of the test set (selected by position).
X_test1 = X_test.iloc[:2]
print(X_test1)

470005    At Least 13 Killed in Philippines BlastPolice ...
268333    Wine sparkles in dull bourseSHARES in Australi...
dtype: object


In [None]:
# Vectorize the two sample texts with the vectorizer fitted on the training data,
# then show the categories the classifier predicts for them.
X_test1_cv = cv.transform(X_test1)
clf.predict(X_test1_cv)

array(['Entertainment', 'Business'], dtype=object)

In [None]:
# Transform the whole test set with the already-fitted vectorizer (no re-fitting) before predicting
X_test_cv = cv.transform(X_test)

In [None]:
# Predict a category for every article of the test set
predictions = clf.predict(X_test_cv)

In [None]:
import sklearn.metrics as metrics
# Print, in this order: the confusion matrix, the per-class classification
# report and the overall accuracy of the test-set predictions.
for scorer in (
    metrics.confusion_matrix,
    metrics.classification_report,
    metrics.accuracy_score,
):
    print(scorer(y_test, predictions))

[[12922  1407   113  1178   827]
 [ 1661  5585  1838  8191  3852]
 [  137  1398 15316  1321   372]
 [ 1465  8344  1769  1679  3570]
 [  849  2891   479  2820 17456]]
               precision    recall  f1-score   support

     Business       0.76      0.79      0.77     16447
Entertainment       0.28      0.26      0.27     21127
       Sports       0.78      0.83      0.80     18544
  Top Stories       0.11      0.10      0.10     16827
        World       0.67      0.71      0.69     24495

     accuracy                           0.54     97440
    macro avg       0.52      0.54      0.53     97440
 weighted avg       0.53      0.54      0.53     97440

0.5434934318555008


In [None]:
# Class distribution of the training labels.
y_train.value_counts()

World            56804
Entertainment    49744
Sports           43607
Top Stories      39217
Business         37985
Name: cat, dtype: int64

In [None]:
#remove all files 
import shutil

# Delete the downloaded archive, then the whole data folder with its contents.
archive_path = 'newsspace200.xml.bz'
data_dir = 'data'
os.remove(archive_path)
shutil.rmtree(data_dir)