In [5]:
# import library
import pandas as pd
import os

# get current working directory
cwd = os.getcwd()

# set dataset directory
# (the leading and trailing slashes matter: paths are built by plain string
# concatenation, i.e. cwd + dwd + file name)
dwd = '/data/ag_dataset/'

# import AG dataset; read_csv already returns a DataFrame, so no re-wrapping
# is needed
data = pd.read_csv(cwd + dwd + 'train.csv')

# slightly clean column names: spaces -> underscores, then lower-case
data.columns = data.columns.str.replace(" ", "_").str.lower()

# create new column 'class_name' using 'class_index' to map with new description
# (a class_index that is not a key of the mapping yields NaN in 'class_name')
data['class_name'] = data['class_index'].map({
    1: 'world', 2: 'sport', 3: 'business', 4: 'sci_tech'
})

# display data
data

Unnamed: 0,class_index,title,description,class_name
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",business
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,business
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,business
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,business
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...",business
...,...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...,world
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...,sport
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...,sport
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...,sport


In [6]:
# count how many articles carry each class name
data['class_name'].value_counts()

business    30000
sci_tech    30000
sport       30000
world       30000
Name: class_name, dtype: int64

In [7]:
# print the titles of the rows labelled 0 through 9
for row in range(10):
    print("title of article", row)
    # .loc looks up by [row label, column label]
    headline = data.loc[row, "title"]
    # the extra '\n' argument leaves a blank line after each title
    print(headline, '\n')

title of article 0
Wall St. Bears Claw Back Into the Black (Reuters) 

title of article 1
Carlyle Looks Toward Commercial Aerospace (Reuters) 

title of article 2
Oil and Economy Cloud Stocks' Outlook (Reuters) 

title of article 3
Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) 

title of article 4
Oil prices soar to all-time record, posing new menace to US economy (AFP) 

title of article 5
Stocks End Up, But Near Year Lows (Reuters) 

title of article 6
Money Funds Fell in Latest Week (AP) 

title of article 7
Fed minutes show dissent over inflation (USATODAY.com) 

title of article 8
Safety Net (Forbes.com) 

title of article 9
Wall St. Bears Claw Back Into the Black 



In [8]:
# print the descriptions of the rows labelled 0 through 9
for row in range(10):
    print('description of article', row)
    summary = data.loc[row, 'description']
    # the extra '\n' argument leaves a blank line after each description
    print(summary, '\n')

description of article 0
Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again. 

description of article 1
Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market. 

description of article 2
Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums. 

description of article 3
Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday. 

description of article 4
AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic menace barely three months before the US presidential elections. 

description o

In [10]:
# preprocessing
# clean up text
cols = ['title', 'description']


# define preprocess functions
# (the short names a, b, c, d are kept so any other cell that calls them
# keeps working)
def a(x):
    """Replace every backslash in ``x`` with a space."""
    return x.replace('\\', ' ')


def b(x):
    """Replace every '#36' in ``x`` with a dollar sign."""
    # NOTE(review): if the raw text carries this as '#36;', the ';' is left
    # behind ('$;') -- confirm against the data before widening the pattern.
    return x.replace('#36', '$')


def c(x):
    """Collapse every run of two or more spaces in ``x`` into one space."""
    # a single replace('  ', ' ') only halves each run, so three or more
    # consecutive spaces would survive; repeat until none are left
    while '  ' in x:
        x = x.replace('  ', ' ')
    return x


def d(x):
    """Strip leading and trailing whitespace from ``x``."""
    return x.strip()


def clean_text(text):
    """Apply the four clean-up steps to one string, in order a, b, c, d.

    Parameters
    ----------
    text : str
        Raw title or description.

    Returns
    -------
    str
        The cleaned text.
    """
    return d(c(b(a(text))))


# apply the composed clean-up once per column instead of four full passes;
# Series.map avoids the deprecated DataFrame.applymap, and
# na_action='ignore' leaves missing values untouched instead of raising
# AttributeError on a non-string
for col in cols:
    data[col] = data[col].map(clean_text, na_action='ignore')

# write data to csv w/o index (no. 1 - n)
data.to_csv(cwd + dwd + 'train_prepared.csv', index=False)

In [14]:
# load spaCy, request CPU execution, then load the language model
import spacy

# print whatever require_cpu() returns
cpu_ready = spacy.require_cpu()
print(cpu_ready)

# 'en_core_web_sm' pipeline, kept in `nlp` for the following cells
nlp = spacy.load('en_core_web_sm')

True


In [15]:
# show the loaded model's metadata dictionary
import pprint

# printer that indents nested structures by 4 spaces
pp = pprint.PrettyPrinter(indent=4)

# format first, then print: same text as pp.pprint(nlp.meta)
print(pp.pformat(nlp.meta))

{   'author': 'Explosion',
    'components': [   'tok2vec',
                      'tagger',
                      'parser',
                      'senter',
                      'attribute_ruler',
                      'lemmatizer',
                      'ner'],
    'description': 'English pipeline optimized for CPU. Components: tok2vec, '
                   'tagger, parser, senter, ner, attribute_ruler, lemmatizer.',
    'disabled': ['senter'],
    'email': 'contact@explosion.ai',
    'labels': {   'attribute_ruler': [],
                  'lemmatizer': [],
                  'ner': [   'CARDINAL',
                             'DATE',
                             'EVENT',
                             'FAC',
                             'GPE',
                             'LANGUAGE',
                             'LAW',
                             'LOC',
                             'MONEY',
                             'NORP',
                             'ORDINAL',
                    