# Text classification  on Reuters dataset

In [2]:
import re
import xml.sax.saxutils as saxutils
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop=stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karchaud\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\karchaud\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\karchaud\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Setting env variables


In [3]:
# Newsline folder and format
data_folder = 'reuters21578/'

sgml_number_of_files = 22
sgml_file_name_template = 'reut2-NNN.sgm'


# Run it First time only

## Prepare documents and categories

In [None]:
# New experiment

# Parse SGML files
document_X = {}
document_Y = {}

df_temp_load = pd.DataFrame(columns=['newid','lewissplit','topic','body'])

def strip_tags(text):
    return re.sub('<[^<]+?>', '', text).strip()

def unescape(text):
    return saxutils.unescape(text)

numberOfRows = 0
# Iterate all files
for i in range(sgml_number_of_files):
    if i < 10:
        seq = '00' + str(i)
    else:
        seq = '0' + str(i)
        
    file_name = sgml_file_name_template.replace('NNN', seq)
    print('Reading file: %s' % file_name)

    with open(data_folder + file_name, 'r') as file:
        content = BeautifulSoup(file.read().lower())
        
        for index, newsline in enumerate(content('reuters')):
            document_categories = []
            
            # News-line Id
            document_id = newsline['newid']
            lewissplit  =newsline['lewissplit']
            # News-line text
            document_body = strip_tags(str(newsline('text')[0])).replace('reuter\n &#3;', '')
            document_body = unescape(document_body)
            
            # News-line categories
#            topics = newsline.topics.contents
            topics1 = strip_tags(str(newsline.topics.contents))
            #print(numberOfRows,  index, '=', numberOfRows + index)
            df_temp_load.loc[numberOfRows + index] = [document_id]+  [lewissplit] + [topics1]+ [document_body] 
        numberOfRows = df_temp_load.index.max() + 1


In [None]:
#Storing raw parsed data in a pickle
df_temp_load.to_pickle('ParsedData.pickle')

In [None]:
df_temp_load.count()

# Start from here second time onwards

In [5]:
df_temp = pd.read_pickle('ParsedData.pickle')

In [6]:
df_temp.count()

newid         21578
lewissplit    21578
topic         21578
body          21578
dtype: int64

In [7]:
# Converting string to list, Column looks like having list but is a string
df_temp['topic'] = df_temp['topic'].str.strip('()').str.split(',')

In [8]:
df_temp.count()

newid         21578
lewissplit    21578
topic         21578
body          21578
dtype: int64

In [9]:
df_temp[df_temp['body'].str.contains("national average prices for ")]

Unnamed: 0,newid,lewissplit,topic,body
4,5,train,"[[grain, wheat, corn, barley, oat, sorghum]]",national average prices for farmer-owned reser...
13798,13799,train,[[]],national average prices for farmer-owned reser...
14485,14486,train,[[]],national average prices for farmer-owned reser...
15951,15952,test,"[[grain, wheat, corn, barley, oat, sorghum]]",national average prices for farmer-owned reser...


# Melting the Topic category

In [10]:
df_temp =   df_temp.topic.apply(pd.Series) \
            .merge(df_temp, right_index = True, left_index = True) \
            .drop(["topic"], axis = 1) \
            .melt(id_vars = ['newid', 'lewissplit', 'body'], value_name = "topic") \
            .drop("variable", axis = 1) \
            .dropna()

df_temp['topic'] = df_temp['topic'].str.replace('[','').str.replace(']','')

In [11]:
display(df_temp.head())
df_temp.count()

Unnamed: 0,newid,lewissplit,body,topic
0,1,train,"bahia cocoa review\n salvador, feb 26 - sho...",cocoa
1,2,train,standard oil <srd> to form financial unit\n ...,
2,3,train,texas commerce bancshares <tcb> files plan\n ...,
3,4,train,talking point/bankamerica <bac> equity offer\n...,
4,5,train,national average prices for farmer-owned reser...,grain


newid         24513
lewissplit    24513
body          24513
topic         24513
dtype: int64

In [12]:
#replacing blank cells with nan for removing them laters
df_temp.replace('',np.nan,inplace=True)
df_temp.isna().sum()

newid             0
lewissplit        0
body              1
topic         10211
dtype: int64

In [13]:
df_temp.dropna(inplace=True)
df_temp.count()

newid         14302
lewissplit    14302
body          14302
topic         14302
dtype: int64

# Cleaning text

In [14]:
#Stop word removal
df_temp['body'] = df_temp['body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


In [15]:
# Removing tag and text between []
df_temp['body'] =  [re.sub("[\<\[].*?[\>\]]",'', str(x)) for x in df_temp['body']]

In [16]:
# remving unwanted characters and punctuations
df_temp['body'] = df_temp['body'].str.replace(r'[^\w\s]+', '')

In [18]:
# Removing numbers.
df_temp['body'] = df_temp['body'].str.replace('\d+', '')

# Tokenize and Lemmenatize

In [19]:
def tokenize(text):
    tokenized = word_tokenize(text)
    no_punc = []
    for review in tokenized:
        line = "".join(char for char in review)
        no_punc.append(line)
    tokens = lemmatize(no_punc)
    return tokens


def lemmatize(tokens):
    lmtzr = WordNetLemmatizer()
    lemma = [lmtzr.lemmatize(t) for t in tokens]
    return lemma

In [20]:
df_temp['body'] = df_temp['body'].apply(lambda x: tokenize(x))

In [21]:
# train test split based on column lewissplit

df_temp['body'] = df_temp['body'].apply(', '.join)
train_split= df_temp[df_temp['lewissplit']=='train']
test_split= df_temp [df_temp['lewissplit']=='test']
display(train_split.count())
display(test_split.count())

newid         9656
lewissplit    9656
body          9656
topic         9656
dtype: int64

newid         3752
lewissplit    3752
body          3752
topic         3752
dtype: int64

In [22]:
train_split.tail()

Unnamed: 0,newid,lewissplit,body,topic
237592,235,train,"indonesian, agriculture, growth, expected, slo...",coffee
259170,235,train,"indonesian, agriculture, growth, expected, slo...",tea
280748,235,train,"indonesian, agriculture, growth, expected, slo...",plywood
302326,235,train,"indonesian, agriculture, growth, expected, slo...",soy-meal
323904,235,train,"indonesian, agriculture, growth, expected, slo...",cotton


# Creating pipeline for grid search with cross validation = 5 and using tf-idf and finding best hyper parameters 

In [None]:
%%time

#For all data 
pipeline = make_pipeline(TfidfVectorizer(min_df=5, norm=None),
                     LogisticRegression())
tfidf_param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}

tfidf_grid = GridSearchCV(pipeline, tfidf_param_grid, cv=5)
tfidf_grid.fit(train_split['body'], train_split['topic'])

print("Best cross-validation score: {:.3f}".format(tfidf_grid.best_score_))
print("Best parameters: ", tfidf_grid.best_params_)

In [None]:
predict_ = tfidf_grid.predict(test_split_top10['body'])
accuracy_score(test_split['topic'], predict_, normalize=True)

# Testing with top 10 frequent topics


In [23]:
x = df_temp.topic.value_counts().head(10).index.tolist()
df_temp_top10 = df_temp[df_temp.topic.isin(x)]
df_temp_top10.count()

newid         9707
lewissplit    9707
body          9707
topic         9707
dtype: int64

In [24]:
train_split_top10= df_temp_top10[df_temp_top10['lewissplit']=='train']
test_split_top10= df_temp_top10[df_temp_top10['lewissplit']=='test']
display(train_split_top10.count())
display(test_split_top10.count())

newid         6721
lewissplit    6721
body          6721
topic         6721
dtype: int64

newid         2621
lewissplit    2621
body          2621
topic         2621
dtype: int64

In [25]:
train_split_top10.head()

Unnamed: 0,newid,lewissplit,body,topic
4,5,train,"national, average, price, farmerowned, reserve...",grain
8,9,train,"champion, product, approves, stock, split, roc...",earn
9,10,train,"computer, terminal, system, completes, sale, c...",acq
10,11,train,"cobanco, inc, year, net, santa, cruz, calif, f...",earn
11,12,train,"ohio, mattress, may, lower, st, qtr, net, clev...",earn


In [47]:
%%time
pipeline_top10 = make_pipeline(TfidfVectorizer(min_df=5, norm=None),
                     LogisticRegression(multi_class='ovr'))
tfidf_param_grid_top10 = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}

tfidf_grid_top10 = GridSearchCV(pipeline_top10, tfidf_param_grid_top10, cv=5)
tfidf_grid_top10.fit(train_split_top10.body, train_split_top10.topic)

print("Best cross-validation score: {:.3f}".format(tfidf_grid_top10.best_score_))
print("Best parameters: ", tfidf_grid_top10.best_params_)

predict_top10 = tfidf_grid_top10.predict(test_split_top10['body'])
accuracy_score(test_split_top10['topic'], predict_top10, normalize=True)
print('accuracy score on test data',accuracy_score)



Best cross-validation score: 0.870
Best parameters:  {'logisticregression__C': 0.01}
accuracy score on test data <function accuracy_score at 0x0000012E31FCF840>
Wall time: 1min 11s


In [43]:
predict_top10 = pipeline_top10.predict(test_split_top10['body'])

NameError: name 'pipeline_top10' is not defined

In [None]:
test_split_top10.count()

In [None]:
accuracy_score(test_split_top10['topic'], predict_top10, normalize=True)

# Testing with top 20 frequent topics


In [39]:
x = df_temp.topic.value_counts().head(20).index.tolist()
df_temp_top20 = df_temp[df_temp.topic.isin(x)]
df_temp_top20.count()

newid         11104
lewissplit    11104
body          11104
topic         11104
dtype: int64

In [40]:
train_split_top20= df_temp_top20[df_temp_top20['lewissplit']=='train']
test_split_top20= df_temp_top20[df_temp_top20['lewissplit']=='test']
display(train_split_top20.count())
display(test_split_top20.count())

newid         7676
lewissplit    7676
body          7676
topic         7676
dtype: int64

newid         2906
lewissplit    2906
body          2906
topic         2906
dtype: int64

In [41]:
%%time
pipeline_top20 = make_pipeline(TfidfVectorizer(min_df=5, norm=None),
                     LogisticRegression())
tfidf_param_grid_top20 = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}

tfidf_grid_top20 = GridSearchCV(pipeline_top20, tfidf_param_grid_top20, cv=5)
tfidf_grid_top20.fit(train_split_top20.body, train_split_top20.topic)

print("Best cross-validation score: {:.3f}".format(tfidf_grid_top20.best_score_))
print("Best parameters: ", tfidf_grid_top20.best_params_)



Best cross-validation score: 0.805
Best parameters:  {'logisticregression__C': 0.001}
Wall time: 3min 3s


In [42]:
predict_top20 = tfidf_grid_top20.predict(test_split_top20['body'])

NameError: name 'tfidf_grid_top20' is not defined

In [None]:
accuracy_score(test_split_top20['topic'], predict_top20, normalize=True)