In [1]:
#import standard things
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#import file/os things
from pathlib import Path
import glob
import codecs
import pickle

#import data science things
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn

#Directory the pickled dataframes are written to; used as a global by the cells below
#NOTE(review): the raw .txt files are read from './data' relative to the working directory,
#while this path is anchored at the user's home directory - confirm both point at the same place
data_dir = Path.home().joinpath("Desktop", "bah-intermediate", "CAPSTONE", "data")

In [None]:
#Download the NLTK resources used below (stop-word lists, tokenizer models, POS tagger, WordNet)
#Uncomment and run once if these resources are not already installed
#nltk.download()

In [2]:
def get_text(file_path):
    """Return the entire contents of ``file_path`` as a single string.

    The file is decoded as UTF-8; bytes that cannot be decoded are silently
    dropped (``errors='ignore'``) instead of raising an error.
    """
    with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as source:
        return source.read()

In [3]:
#Process the dataframe, takes 2 arguments: dataframe and language; language defaults to english if none passed
def process_df(df, language='english'):
    """Tokenize, stem and lemmatize the ``text`` column of ``df``.

    The frame is modified in place: the columns ``tokens``, ``stemmed_list``,
    ``stemmed``, ``lemmed_list``, ``lemmed`` and ``word_count`` are added (or
    overwritten). As a side effect the processed frame is pickled to
    ``data_dir / 'bbc_df_processed.pickle'`` (``data_dir`` is a notebook global).

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``text`` column; every value is passed through ``str()``
        and lower-cased before tokenizing.
    language : str, default 'english'
        Name handed to ``stopwords.words`` to pick the stop-word list. Only
        the stop-word list depends on it; tokenizing, tagging, stemming and
        lemmatizing are called without a language argument.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the new columns.
    """
    #set stop words based on input language
    stop_words = set(stopwords.words(language))

    #build the stemmer and lemmatizer once instead of once per word / per row
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    def keep(word):
        #shared filter for both pipelines: alphabetic, not a stop word, longer than 2 characters
        return word not in stop_words and word.isalpha() and len(word) > 2

    #'tokens' holds the lower-cased word tokens of each text (nothing is filtered out yet)
    df['tokens'] = df.text.apply(lambda x: word_tokenize(str(x).lower()))

    #stem every token that passes the filter
    df['stemmed_list'] = df.tokens.apply(lambda x: [stemmer.stem(w) for w in x if keep(w)])

    #one space-separated string per row built from 'stemmed_list'
    df['stemmed'] = df.stemmed_list.apply(' '.join)

    # WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc.
    # Any tag whose first letter is not J, V or R falls back to Noun
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    # pos_tag runs on the full token list of a row (before filtering);
    # the filter is applied afterwards, and the first letter of each tag selects the WordNet category
    df['lemmed_list'] = [
        [lemmatizer.lemmatize(word, tag_map[tag[0]])
         for word, tag in pos_tag(entry)
         if keep(word)]
        for entry in df.tokens
    ]

    #one space-separated string per row built from 'lemmed_list'
    df['lemmed'] = df.lemmed_list.apply(' '.join)

    #'word_count' is the number of entries in 'stemmed_list', i.e. words left after filtering
    df['word_count'] = df.stemmed_list.apply(len)

    #the with-block closes the pickle file even if dumping fails
    with open(data_dir / 'bbc_df_processed.pickle', 'wb') as pickle_file:
        pickle.dump(df, pickle_file)

    return df

In [4]:
#Create the (initially empty) dataframe that will hold the article data
df = pd.DataFrame(data=None)

In [5]:
#Read in files and build the dataframe
#glob finds every '.txt' file in the ./data directory tree (recursive);
#sorted() makes the row order independent of the order the filesystem lists files in
records = []
for file_path in sorted(glob.iglob('./data/**/*.txt', recursive=True)):
    #read each file once and reuse the string for both 'title' and 'text'
    text = get_text(file_path)
    records.append({
        #path component right after 'data' (the topic folder);
        #Path.parts splits on the platform's own separator, unlike split('\\')
        'topic': Path(file_path).parts[1],
        #the first line of the file is the title
        'title': text.split('\n')[0],
        'text': text,
    })

#Build the frame in one step: DataFrame.append was removed in pandas 2.0 and
#copied the whole frame on every call. The explicit column order matches the
#order shown in the saved output of this notebook.
df = pd.DataFrame(records, columns=['text', 'title', 'topic'])

In [None]:
#Quick look at the last five rows to check that the files were loaded
df.tail(n=5)

In [6]:
#Save the raw (unprocessed) dataframe; the with-block closes the file even if pickling fails
with open(data_dir / "bbc_text.pickle", "wb") as pickle_file:
    pickle.dump(df, pickle_file)

In [7]:
#Run the text-processing pipeline with the English stop-word list.
#process_df adds its columns to df in place and returns that same object,
#so processed_df and df refer to one dataframe; it also writes 'bbc_df_processed.pickle'
processed_df = process_df(df, language='english')

In [8]:
#Preview the first five processed rows
processed_df.head(n=5)

Unnamed: 0,text,title,topic,tokens,stemmed_list,stemmed,lemmed_list,lemmed,word_count
0,Ad sales boost Time Warner profit\n\nQuarterly...,Ad sales boost Time Warner profit,business,"[ad, sales, boost, time, warner, profit, quart...","[sale, boost, time, warner, profit, quarterli,...",sale boost time warner profit quarterli profit...,"[sale, boost, time, warner, profit, quarterly,...",sale boost time warner profit quarterly profit...,221
1,Dollar gains on Greenspan speech\n\nThe dollar...,Dollar gains on Greenspan speech,business,"[dollar, gains, on, greenspan, speech, the, do...","[dollar, gain, greenspan, speech, dollar, hit,...",dollar gain greenspan speech dollar hit highes...,"[dollar, gain, greenspan, speech, dollar, hit,...",dollar gain greenspan speech dollar hit high l...,212
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos unit buyer faces loan claim,business,"[yukos, unit, buyer, faces, loan, claim, the, ...","[yuko, unit, buyer, face, loan, claim, owner, ...",yuko unit buyer face loan claim owner embattl ...,"[yukos, unit, buyer, face, loan, claim, owner,...",yukos unit buyer face loan claim owner embattl...,149
3,High fuel prices hit BA's profits\n\nBritish A...,High fuel prices hit BA's profits,business,"[high, fuel, prices, hit, ba, 's, profits, bri...","[high, fuel, price, hit, profit, british, airw...",high fuel price hit profit british airway blam...,"[high, fuel, price, hit, profit, british, airw...",high fuel price hit profit british airway blam...,216
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod takeover talk lifts Domecq,business,"[pernod, takeover, talk, lifts, domecq, shares...","[pernod, takeov, talk, lift, domecq, share, dr...",pernod takeov talk lift domecq share drink foo...,"[pernod, takeover, talk, lift, domecq, share, ...",pernod takeover talk lift domecq share drink f...,152


In [None]:
#Histogram of article lengths with one bin per distinct word count
plt.hist(
    processed_df['word_count'],
    bins=len(set(processed_df['word_count'])),
)
plt.xlabel('Number of words per Article')
plt.ylabel('Frequency')

In [None]:
#Summary statistics of the per-article word counts
processed_df['word_count'].describe()

In [None]:
#Number of articles per topic
processed_df['topic'].value_counts()

In [None]:
#Non-null count per column for articles with more than 500 processed words
processed_df[processed_df['word_count'] > 500].count()

In [None]:
#Topic breakdown of the articles with more than 500 processed words
processed_df[processed_df['word_count'] > 500]['topic'].value_counts()

In [9]:
#Keep only the articles with at most 500 processed words, then summarise their lengths
trimmed_df = processed_df[processed_df['word_count'] <= 500]
trimmed_df['word_count'].describe()

count    2200.000000
mean      198.202273
std        86.017350
min        46.000000
25%       132.000000
50%       178.000000
75%       251.000000
max       499.000000
Name: word_count, dtype: float64

In [None]:
#Histogram of article lengths after trimming, with one bin per distinct word count
plt.hist(
    trimmed_df['word_count'],
    bins=len(set(trimmed_df['word_count'])),
)
plt.xlabel('Number of words per Article')
plt.ylabel('Frequency')

In [10]:
#Save the trimmed dataframe; the with-block closes the file even if pickling fails
with open(data_dir / 'bbc_df_trimmed.pickle', 'wb') as pickle_file:
    pickle.dump(trimmed_df, pickle_file)