In [162]:
import numpy as np
import pandas as pd
import nltk
nltk.download('wordnet')
import string as s
from nltk.corpus import stopwords
# import matplotlib.pyplot as plt
%matplotlib inline
# import seaborn as sns
import re
import os

[nltk_data] Downloading package wordnet to /home/bek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [163]:
# Read only the specified number of rows
train = pd.read_csv('../../data/ag-news/train.csv', nrows=25000)
test = pd.read_csv('../../data/ag-news/test.csv', nrows=5000)

In [164]:
train.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [165]:
test.head()

Unnamed: 0,Class Index,Title,Description
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


In [166]:
print(train.shape)
print(test.shape)

(25000, 3)
(5000, 3)


In [167]:
train_x = train.Description
test_x = test.Description

train_y =  train['Class Index']
test_y = test['Class Index']

In [168]:
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)

In [169]:
train_x = train_x.apply(striphtml)
test_x = test_x.apply(striphtml)

In [170]:
def remove_url(data):
    return  re.sub(r'\s*(?:https?://)?www\.\S*\.[A-Za-z]{2,5}\s*', ' ', data).strip()

In [171]:
train_x = train_x.apply(remove_url)
test_x = test_x.apply(remove_url)

Tokenization of Data

In [172]:
# from nltk.tokenize import word_tokenize

def word_tok(data):
    tokens = re.findall("[\w']+", data)
    return tokens

In [173]:
train_x = train_x.apply(word_tok)
test_x =test_x.apply(word_tok)

Removal of stopwords

In [174]:
def remove_stopwords(data):
    stopWords = stopwords.words('english')
    new_list = []
    for i in data:
        if i.lower() not in stopWords:
            new_list.append(i)
    return new_list

In [175]:
train_x = train_x.apply(remove_stopwords)
test_x = test_x.apply(remove_stopwords)

Removal of Punctuation Symbols

In [176]:
def remove_punctuations(data):
    new_list = []
    for i in data:
        for j in s.punctuation:
            i = i.replace(j,'')
        new_list.append(i)
    return new_list

In [177]:
train_x = train_x.apply(remove_punctuations)
test_x = test_x.apply(remove_punctuations)

Removal of numbers

In [178]:
def remove_number(data):
    no_digit_list = []
    new_list = []

    for i in data:
        for j in s.digits:
            i = i.replace(j,'')
        no_digit_list.append(i)

    for i in no_digit_list:
        if i!='':
            new_list.append(i)
    return new_list

In [179]:
train_x = train_x.apply(remove_number)
test_x = test_x.apply(remove_number)

Lemmatization of data

In [180]:
def lemmatization(data):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    roots = [lemmatizer.lemmatize(i) for i in data]
    return roots

In [181]:
train_x = train_x.apply(lemmatization)
test_x = test_x.apply(lemmatization)

Remove extra words

In [182]:
def remove_extraWords(data):
    extra_words =['href','iii','lt','gt','ii','com','quot']

    new_list = []
    for i in data:
        if i not in extra_words:
            new_list.append(i)
    return new_list

In [183]:
train_x = train_x.apply(remove_extraWords)
test_x = test_x.apply(remove_extraWords)

In [184]:
train_x = [" ".join(map(str, lst)) for lst in train_x]
test_x = [" ".join(map(str, lst)) for lst in test_x]

Step 4: Feature Extraction

In [185]:
# Feature are extracted from the datset and TF-IDF (term frequency - Inverse Documents Frequence ) is used for this purpose
from sklearn.feature_extraction.text import TfidfVectorizer

TfIdf = TfidfVectorizer(min_df=8,ngram_range=(1,3))
train_1 = TfIdf.fit_transform(train_x)
test_1 = TfIdf.transform(test_x)

print("no of features extracted")
print(len(TfIdf.get_feature_names_out()))
print(TfIdf.get_feature_names_out()[:100])

no of features extracted
13742
['aa' 'aapl' 'aaron' 'aaron peirsol' 'ab' 'abandon' 'abandoned' 'abbey'
 'abbey national' 'abby' 'abby wambach' 'abc' 'abducted' 'abdul'
 'abdullah' 'ability' 'able' 'aboard' 'abortion' 'abroad' 'abrupt'
 'abruptly' 'absence' 'abu' 'abu ghraib' 'abu ghraib prison' 'abu hamza'
 'abuja' 'abuse' 'abuse scandal' 'abusing' 'abusing iraqi' 'ac' 'academic'
 'academy' 'accelerate' 'accelerated' 'accelerating' 'accept' 'accepted'
 'access' 'access brokerage' 'accessing' 'accident' 'acclaim'
 'acclaim entertainment' 'accord' 'according' 'according company'
 'according data' 'according latest' 'according new'
 'according new report' 'according new study' 'according report'
 'according report released' 'according research' 'according statement'
 'according study' 'according study released' 'according survey' 'account'
 'accounting' 'accounting firm' 'accounting practice' 'accounting scandal'
 'accurate' 'accusation' 'accuse' 'accused' 'accused plotting' 'accuser'
 'a

In [186]:
train_x = train_1.toarray()
test_x = test_1.toarray()

In [187]:
train_x.shape

(25000, 13742)

In [188]:
test_x.shape

(5000, 13742)

In [189]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)

# Convert Series to NumPy array before reshaping
train_y = encoder.fit_transform(train_y.to_numpy().reshape(-1, 1))
test_y = encoder.transform(test_y.to_numpy().reshape(-1, 1))


In [190]:
train_y.shape

(25000, 4)

In [191]:
test_y.shape

(5000, 4)

In [192]:
# Save training data
np.save('train_x.npy', train_x)
np.save('train_y.npy', train_y)

# Save test data
np.save('test_x.npy', test_x)
np.save('test_y.npy', test_y)