In [31]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [32]:
# Load the topic/text dataset from CSV into a DataFrame.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# previously saved row index being read back as data; consider index_col=0 if
# nothing downstream relies on that column (confirm before changing).
df = pd.read_csv('data_files/topics_and_text.csv')

In [33]:
# Preview the first ten rows of the raw data.
df.head(10)

Unnamed: 0,topic,text
0,other,"BAHIA COCOA REVIEW SALVADOR, Feb 26 - Showers ..."
1,blank,STANDARD OIL <SRD> TO FORM FINANCIAL UNIT CLEV...
2,blank,TEXAS COMMERCE BANCSHARES <TCB> FILES PLAN HOU...
3,blank,TALKING POINT/BANKAMERICA <BAC> EQUITY OFFER b...
4,other,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESER...
5,other,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS ...
6,blank,"RED LION INNS FILES PLANS OFFERING PORTLAND, O..."
7,blank,"USX <X> DEBT DOWGRADED BY MOODY'S NEW YORK, Fe..."
8,earn,CHAMPION PRODUCTS <CH> APPROVES STOCK SPLIT RO...
9,other,COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SAL...


In [34]:
# Show the full, untruncated raw text of the row labelled 8.
df.loc[8].text

'CHAMPION PRODUCTS <CH> APPROVES STOCK SPLIT ROCHESTER, N.Y., Feb 26 - Champion Products Inc said its board of directors approved a two-for-one stock split of its common shares for shareholders of record as of April 1, 1987. The company also said its board voted to recommend to shareholders at the annual meeting April 23 an increase in the authorized capital stock from five mln to 25 mln shares. Reuter'

In [35]:
# Spot-check the topic value of the row labelled 1003.
df.loc[1003].topic

'blank'

In [36]:
# Number of rows in the dataset.
len(df)

22999

## Define helpers to tokenize, remove stopwords and lemmatize each text

In [37]:
# Tokens to discard: NLTK's English stopwords, every single ASCII punctuation
# character, and the literal ellipsis token '...'.
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    '...',
]



In [38]:
def process_text(text):
    """Tokenize ``text`` and keep only lowercased, alphabetic, non-stopword tokens.

    Parameters
    ----------
    text : str
        Raw document text; it is passed to ``nltk.word_tokenize``.

    Returns
    -------
    list of str
        The lowercased tokens, in their original order, that are not in the
        module-level ``stopwords_list`` and for which ``str.isalpha()`` is
        true (tokens containing digits or punctuation are dropped).
    """
    # Lowercase each token exactly once, then apply both filters in one pass.
    lowered_tokens = (token.lower() for token in nltk.word_tokenize(text))
    return [
        token
        for token in lowered_tokens
        if token not in stopwords_list and token.isalpha()
    ]

# function to concatenate a list of words into a single, space-separated string (used in function below)
def concat_words(list_of_words):
    """Join words into a single space-separated string.

    Parameters
    ----------
    list_of_words : iterable of str
        The words to join, in order.

    Returns
    -------
    str
        The words separated by single spaces, with leading and trailing
        whitespace stripped. An empty input yields ``''``.

    Raises
    ------
    TypeError
        If any element is not a string.
    """
    # str.join builds the result in one pass instead of repeated `+=`;
    # strip() is kept so whitespace at either end is still removed.
    return ' '.join(list_of_words).strip()

# function to lemmatize words and merge each text into a single, space-separated string

lemm = WordNetLemmatizer()

def make_lemma_and_concat(list_of_words):
    """Lemmatize each word and merge the results into one space-separated string.

    Parameters
    ----------
    list_of_words : iterable of str
        Tokens to lemmatize. Missing values (NaN, ``None``, ``pd.NA``) are
        skipped.

    Returns
    -------
    str
        The lemmatized words joined by ``concat_words``.
    """
    # pd.isna recognises every missing-value marker, whereas an identity test
    # against np.nan only catches that one singleton object and lets other
    # NaN instances through to the lemmatizer.
    lemmatized_list = [
        lemm.lemmatize(word)  # lemmatizer called with its default arguments
        for word in list_of_words
        if not pd.isna(word)
    ]

    # make the list into a single string
    return concat_words(lemmatized_list)

## Prepare dataframe for modeling

In [39]:
# Process every text, then write the whole column back in ONE assignment.
# Assigning through `df['text'].loc[i] = ...` is chained indexing: it raises
# SettingWithCopyWarning and, with pandas Copy-on-Write enabled, does not update
# `df` at all. Iterating positionally also removes the reliance on the index
# being exactly 0..len(df)-1.
processed_texts = []
for i, raw_text in enumerate(df['text']):
    processed_texts.append(make_lemma_and_concat(process_text(raw_text)))
    if i % 3000 == 0:
        print(f'Finished line number {i}')
df['text'] = processed_texts
df.head()

Finished line number 0
Finished line number 3000
Finished line number 6000
Finished line number 9000
Finished line number 12000
Finished line number 15000
Finished line number 18000
Finished line number 21000


Unnamed: 0,topic,text
0,other,bahia cocoa review salvador feb shower continu...
1,blank,standard oil srd form financial unit cleveland...
2,blank,texas commerce bancshares tcb file plan housto...
3,blank,talking bac equity offer janie gabbett reuters...
4,other,national average price reserve washington feb ...


In [40]:
# Preview the first ten rows after the text column has been processed.
df.head(10)

Unnamed: 0,topic,text
0,other,bahia cocoa review salvador feb shower continu...
1,blank,standard oil srd form financial unit cleveland...
2,blank,texas commerce bancshares tcb file plan housto...
3,blank,talking bac equity offer janie gabbett reuters...
4,other,national average price reserve washington feb ...
5,other,argentine registration buenos aire feb argenti...
6,blank,red lion inn file plan offering portland feb r...
7,blank,usx x debt dowgraded moody new york feb moody ...
8,earn,champion product ch approves stock split roche...
9,other,computer terminal system cpml completes sale c...


In [42]:
# Save the processed DataFrame to CSV; index=False keeps the row index out of the file.
df.to_csv('data_files/text_processed.csv', index=False)