# Natural Language Processing - Basic Text Pre-processing


## Importing libraries 

In [1]:
# Code to import libraries as you need in this assessment, e.g.,
import numpy as np
import pandas as pd
import re
import json
from itertools import chain
from sklearn.datasets import load_files  
from nltk import RegexpTokenizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import *


### 1.1 Examining and loading data
- To perform data loading, use sklearn load_files() function to load the data files.
- To examine the data folder, 
    - use len() function on dataset.filenames to count the number of loaded text files
    - use len() function on dataset.target_name to count number of loaded folders
    - use print dataset.filenames to check file path and text file name
    - use print dataset.target to check text file's categories assignment
    - use print dataset.target_name to check categories name
- Findings
    - The folder has 776 of job advertisements.
    - All job advertisements are in text file format.
    - The job advertisements are under 4 categories (Accounting_Finance, Engineering, Healthcare_Nursing, Sales)
    - The job advertisements categories distribution are as follows
        - Files in Accounting_Finance folder: 191
        - Files in Engineering folder: 231
        - Files in Healthcare_Nursing folder: 198
        - Files in Sales folder: 156
- Job Advertisments Format
    - Print out the first job advertisements to understand the format of the job advertisements file.
    - A job advertisement contain 4 pieces of information, they are Title, Webindex, Company, and Description.
    - All job advertisements were converted to lowercase format
    - All job advertisements were decoded to utf-8 format
- Store Job Advertisments  
    - All job advertisements were store to dataframe by Title, Webindex, Company, and Description using the split() function


In [2]:
# load data files
job_data = load_files(r"data/")  
print('Number of txt files: '+ str(len(job_data.filenames)))
print('Number of folders: '+ str(len(job_data.target_names)))

Number of txt files: 776
Number of folders: 4


In [3]:
# check filenames
filenames_arr = job_data['filenames']
filenames_arr

array(['data/Accounting_Finance\\Job_00382.txt',
       'data/Accounting_Finance\\Job_00354.txt',
       'data/Healthcare_Nursing\\Job_00547.txt',
       'data/Accounting_Finance\\Job_00246.txt',
       'data/Healthcare_Nursing\\Job_00543.txt',
       'data/Engineering\\Job_00089.txt',
       'data/Healthcare_Nursing\\Job_00580.txt',
       'data/Accounting_Finance\\Job_00419.txt',
       'data/Sales\\Job_00767.txt', 'data/Sales\\Job_00670.txt',
       'data/Accounting_Finance\\Job_00263.txt',
       'data/Accounting_Finance\\Job_00374.txt',
       'data/Engineering\\Job_00111.txt', 'data/Sales\\Job_00775.txt',
       'data/Engineering\\Job_00057.txt', 'data/Sales\\Job_00642.txt',
       'data/Sales\\Job_00657.txt', 'data/Engineering\\Job_00209.txt',
       'data/Sales\\Job_00746.txt',
       'data/Healthcare_Nursing\\Job_00479.txt',
       'data/Healthcare_Nursing\\Job_00491.txt',
       'data/Healthcare_Nursing\\Job_00454.txt',
       'data/Sales\\Job_00745.txt', 'data/Sales\\Job_006

In [4]:
# check target classes name
# 0 - Accounting_Finance, 1 - Engineering, 2 - Healthcare_Nursing, 3 - Sales  
job_data['target_names'] 

['Accounting_Finance', 'Engineering', 'Healthcare_Nursing', 'Sales']

In [5]:
# check target classes
target_arr = job_data['target']

print('Files in Accounting_Finance folder: ' + str(list(target_arr).count(0)))
print('Files in Engineering folder: ' + str(list(target_arr).count(1)))
print('Files in Healthcare_Nursing folder: ' + str(list(target_arr).count(2)))
print('Files in Sales folder: ' + str(list(target_arr).count(3)))

Files in Accounting_Finance folder: 191
Files in Engineering folder: 231
Files in Healthcare_Nursing folder: 198
Files in Sales folder: 156


In [6]:
# check format of the txt files
n = 0
advertisement, categoryidx = job_data.data, job_data.target  
print('Job Category Index= '+ str(categoryidx[n]))
print(advertisement[n])

Job Category Index= 0
b'Title: Finance / Accounts Asst Bromley to ****k\nWebindex: 68997528\nCompany: First Recruitment Services\nDescription: Accountant (partqualified) to **** p.a. South East London Our client, a successful manufacturing company has an immediate requirement for an Accountant for permanent role in their modern offices in South East London. The Role: Credit Control Purchase / Sales Ledger Daily collection of debts by phone, letter and email. Handling of ledger accounts Handling disputed accounts and negotiating payment terms Allocating of cash and reconciliation of accounts Adhoc administration duties within the business The Person The ideal candidate will have previous experience in a Credit Control capacity, you will possess exceptional customer service and communication skills together with IT proficiency. You will need to be a part or fully qualified Accountant to be considered for this role'


In [7]:
#decode advertisement
adv_ls = []
for x in job_data['data']:
    adv = x.decode('utf-8')
    adv_ls.append(adv)
    adv = ''

# load data to data frame
df = pd.DataFrame(data=np.c_[adv_ls, job_data['target']], 
                  columns=['advertisement', 'categoryidx'])
df['filename'] = job_data['filenames']

# split advertisement by subject
df['title'] = df['advertisement'].str.split(r'^Title:').str[1]
df[['title', 'webindex']] = df['title'].str.split(r'\nWebindex:', 1, expand=True)
df[['webindex', 'company']] = df['webindex'].str.split(r'\nCompany:', 1, expand=True)
df[['company', 'description']] = df['company'].str.split(r'\nDescription:', 1, expand=True)
df.loc[df['company'].isnull(), 'description'] = df[df['company'].isnull()]['webindex'].str.split(r'\nDescription:').str[1]
df.loc[df['company'].isnull(), 'webindex'] = df[df['company'].isnull()]['webindex'].str.split(r'\nDescription:').str[0]

df.head()

Unnamed: 0,advertisement,categoryidx,filename,title,webindex,company,description
0,Title: Finance / Accounts Asst Bromley to ****...,0,data/Accounting_Finance\Job_00382.txt,Finance / Accounts Asst Bromley to ****k,68997528,First Recruitment Services,Accountant (partqualified) to **** p.a. South...
1,Title: Fund Accountant Hedge Fund\nWebindex: ...,0,data/Accounting_Finance\Job_00354.txt,Fund Accountant Hedge Fund,68063513,Austin Andrew Ltd,One of the leading Hedge Funds in London is c...
2,Title: Deputy Home Manager\nWebindex: 68700336...,2,data/Healthcare_Nursing\Job_00547.txt,Deputy Home Manager,68700336,Caritas,An exciting opportunity has arisen to join an...
3,Title: Brokers Wanted Imediate Start\nWebindex...,0,data/Accounting_Finance\Job_00246.txt,Brokers Wanted Imediate Start,67996688,OneTwoTrade,OneTwoTrade is expanding their Sales Team and...
4,Title: RGN Nurses (Hospitals) Penarth\nWebind...,2,data/Healthcare_Nursing\Job_00543.txt,RGN Nurses (Hospitals) Penarth,71803987,Swiis Healthcare,RGN Nurses (Hospitals) Immediate fulltime and...


### 1.2 Pre-processing data
Perform the required text pre-processing steps.
- Use nltk tokenize sent_tokenize() function to tokenize the job advertisments
- To understand the work count statistic, word count, vocab count, lexical_diversity, and word length was printed out
- The tokenized list was then filtered for words with lengths less than 2 
- The tokenized list was then filtered for stopwords according to the stopwords_en.txt file
- The tokenized list was then filtered for words that appeared only once in the document collection based on term frequency. ntlk probability FreqDist() was used to locate the filter list.
- The tokenized list was then filtered for the top 50 most frequent words based on document frequency. ntlk probability FreqDist() and most_common() functions were used to locate the filter list.

In [8]:
# function - tokenize txt
def tokenizeTxt(input_txt):
    """
        This function first convert all words to lowercases, 
        it then segment the raw review into sentences and tokenize each sentences 
        and convert the review to a list of tokens.
    """
    # sent_tokenize and converted into the lower case
    sentences = sent_tokenize(input_txt.lower())
    
    # tokenize each sentence
    pattern = r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?"
    tokenizer = RegexpTokenizer(pattern) 
    token_lists = [tokenizer.tokenize(sen) for sen in sentences]
    
    # merge them into a list of tokens
    tokenised_txt = list(chain.from_iterable(token_lists))
    return tokenised_txt

In [9]:
# function - print statistic
def stats_print(input_txt):
    words = list(chain.from_iterable(input_txt)) 
    vocab = set(words) 
    lexical_diversity = len(vocab)/len(words)
    print("Vocabulary size: ",len(vocab))
    print("Total number of tokens: ", len(words))
    print("Lexical diversity: ", lexical_diversity)
    print("Total number of Job Advertisement:", len(input_txt))
    lens = [len(article) for article in input_txt]
    print("Average words length:", np.mean(lens))
    print("Maximun words length:", np.max(lens))
    print("Minimun words length:", np.min(lens))
    print("Standard deviation of words length:", np.std(lens))

In [10]:
# 2,3) execute tokenize txt
tk_adv_ls = []
for i, desc in df.description.iteritems():  
        tk_txt = tokenizeTxt(desc)
        tk_adv_ls.append(tk_txt)

# add tokenize txt to dataframe
df['tk_adv'] = tk_adv_ls

# print statistic
print('Tokenize Statistic:')
stats_print(tk_adv_ls)

Tokenize Statistic:
Vocabulary size:  9834
Total number of tokens:  186952
Lexical diversity:  0.052601737344345076
Total number of Job Advertisement: 776
Average words length: 240.91752577319588
Maximun words length: 815
Minimun words length: 13
Standard deviation of words length: 124.97750685071483


In [11]:
# 4) filter words with length less than 2

# capture words with length less than 2
st_list = [[w for w in txt if len(w) <= 1 ] \
                    for txt in tk_adv_ls] 
# remove words with length less than 2
tk_adv_ls2 = [[w for w in tk if len(w) >= 2 ] \
                    for tk in tk_adv_ls] 

# add filtered tokenize txt to dataframe
df['tk_adv2'] = tk_adv_ls2

# print statistic
print('Removal - words with length less than 2:')
print(set(list(chain.from_iterable(st_list))))
print('\n')
print("Before removal words with length less than 2:",len(list(chain.from_iterable(tk_adv_ls)))," tokens")
print("After removal words with length less than 2:",len(list(chain.from_iterable(tk_adv_ls2)))," tokens")
print('\n')
print('Tokenize Statistic:')
stats_print(tk_adv_ls2)

Removal - words with length less than 2:
{'m', 'j', 'i', 's', 'f', 'a', 't', 'z', 'q', 'o', 'n', 'r', 'u', 'x', 'y', 'p', 'h', 'w', 'l', 'd', 'k', 'c', 'g', 'v', 'e', 'b'}


Before removal words with length less than 2: 186952  tokens
After removal words with length less than 2: 180913  tokens


Tokenize Statistic:
Vocabulary size:  9808
Total number of tokens:  180913
Lexical diversity:  0.05421390392066905
Total number of Job Advertisement: 776
Average words length: 233.13530927835052
Maximun words length: 795
Minimun words length: 13
Standard deviation of words length: 121.6048654015839


In [12]:
# 5) filter stopwords

# load stopwords
stopwords_file = open("stopwords_en.txt", "r")
stopwords_data = stopwords_file.read()
stopwords_list = stopwords_data.split("\n")
print('Stopwords List:')
print(stopwords_list)

# remove stop words
tk_adv_ls3 = [[w for w in review if w not in stopwords_list] \
                      for review in tk_adv_ls2]

# add filtered tokenize txt to dataframe
df['tk_adv3'] = tk_adv_ls3

# print statistic
print('\n')
print("Before removal stopwords:",len(list(chain.from_iterable(tk_adv_ls2)))," tokens")
print("After removal stopwords:",len(list(chain.from_iterable(tk_adv_ls3)))," tokens")
print('\n')
print('Tokenize Statistic:')
stats_print(tk_adv_ls3)

Stopwords List:
['a', "a's", 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'b', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'c', "c'mon", "c's", 'came', 'can', "can't", 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'con

In [13]:
# 6) Remove the words that appear only once in the document collection, based on term frequency
# Term frequency - Number of counts of a word in the whole corpus regardless which document.

# prepare words list
tk_words_ls = list(chain.from_iterable(tk_adv_ls3))

# locate words that appear only once based on term frequency
term_fd_dict = FreqDist(tk_words_ls)
term_freq_1 = dict((k, v) for k, v in term_fd_dict.items() if v == 1)

print('Term frequency - words appear only once in the document collection:')
print(list(term_freq_1.keys()))

# remove stop words
tk_adv_ls4 = [[w for w in review if w not in term_freq_1.keys()] \
                      for review in tk_adv_ls3]

# add filtered tokenize txt to dataframe
df['tk_adv4'] = tk_adv_ls4

# print statistic
print('\n')
print("Before removal term freq words appear only once:",len(list(chain.from_iterable(tk_adv_ls3)))," tokens")
print("After removal term freq words appear only once:",len(list(chain.from_iterable(tk_adv_ls4)))," tokens")
print('\n')
print('Tokenize Statistic:')
stats_print(tk_adv_ls4)

Term frequency - words appear only once in the document collection:
['disputed', 'allocating', 'proficiency', 'equities', 'fluctuations', 'chiropodists', 'deputyhomemanager', 'onetwotrade', 'timehours', 'banding', 'referafriend', 'immunisation', 'outofhours', 'penarth', 'postqualification', 'faced', 'arriving', 'facsimile', 'retrieve', 'photocopying', 'collating', 'xp', 'solvitt', 'thoughts', 'goaloriented', 'laminar', 'sterile', 'presents', 'ortho', 'ophthalmic', 'gynae', 'exemplary', 'mixes', 'remittances', 'duplicates', 'chaps', 'currencies', 'salespurchaseledgerclerkmaternitycover', 'embarking', 'faint', 'hearted', 'recruitmentsalesexecutive', 'da', 'br', 'personalities', 'recycling', 'washroom', 'pest', 'businessdevelopmentexecutivefieldsalesdartford', 'kris', 'shortfalls', 'remediate', 'custodians', 'transacted', 'august', 'clarke', 'investmentstreasurycontroller', 'batley', 'castleford', 'morley', 'pontefract', 'porduction', 'susurface', 'promary', 'maturation', 'reservior', 'un

In [14]:
# 7) Remove the top 50 most frequent words based on document frequency.
# document frequency - the number of documents containing a word

# prepare words list
tk_words_ls = list(chain.from_iterable(tk_adv_ls4))

# locate the top 50 most common words based on document frequency
term_fd_dict = FreqDist(tk_words_ls)
doc_freq_50 = dict(term_fd_dict.most_common(50))

print('Document frequency - top 50 most frequent words:')
print(list(doc_freq_50.keys()))

# remove stop words
tk_adv_ls5 = [[w for w in review if w not in doc_freq_50.keys()] \
                      for review in tk_adv_ls4]

# add filtered tokenize txt to dataframe
df['tk_adv5'] = tk_adv_ls5

# print statistic
print('\n')
print("Before removal term freq words appear only once:",len(list(chain.from_iterable(tk_adv_ls4)))," tokens")
print("After removal term freq words appear only once:",len(list(chain.from_iterable(tk_adv_ls5)))," tokens")
print('\n')
print('Tokenize Statistic:')
stats_print(tk_adv_ls5)

Document frequency - top 50 most frequent words:
['experience', 'sales', 'role', 'work', 'business', 'team', 'working', 'job', 'care', 'skills', 'company', 'client', 'management', 'manager', 'support', 'uk', 'service', 'excellent', 'development', 'required', 'based', 'opportunity', 'services', 'knowledge', 'apply', 'successful', 'training', 'design', 'engineering', 'customer', 'recruitment', 'salary', 'candidate', 'clients', 'high', 'join', 'ability', 'strong', 'provide', 'home', 'ensure', 'leading', 'including', 'engineer', 'financial', 'good', 'staff', 'position', 'systems', 'full']


Before removal term freq words appear only once: 102975  tokens
After removal term freq words appear only once: 80068  tokens


Tokenize Statistic:
Vocabulary size:  5168
Total number of tokens:  80068
Lexical diversity:  0.06454513663386122
Total number of Job Advertisement: 776
Average words length: 103.18041237113403
Maximun words length: 390
Minimun words length: 7
Standard deviation of words length

In [15]:
df2 = df.drop(['tk_adv', 'tk_adv2','tk_adv3', 'tk_adv4'], axis=1)
df2.head(6)

Unnamed: 0,advertisement,categoryidx,filename,title,webindex,company,description,tk_adv5
0,Title: Finance / Accounts Asst Bromley to ****...,0,data/Accounting_Finance\Job_00382.txt,Finance / Accounts Asst Bromley to ****k,68997528,First Recruitment Services,Accountant (partqualified) to **** p.a. South...,"[accountant, partqualified, south, east, londo..."
1,Title: Fund Accountant Hedge Fund\nWebindex: ...,0,data/Accounting_Finance\Job_00354.txt,Fund Accountant Hedge Fund,68063513,Austin Andrew Ltd,One of the leading Hedge Funds in London is c...,"[hedge, funds, london, recruiting, fund, accou..."
2,Title: Deputy Home Manager\nWebindex: 68700336...,2,data/Healthcare_Nursing\Job_00547.txt,Deputy Home Manager,68700336,Caritas,An exciting opportunity has arisen to join an...,"[exciting, arisen, establish, provider, elderl..."
3,Title: Brokers Wanted Imediate Start\nWebindex...,0,data/Accounting_Finance\Job_00246.txt,Brokers Wanted Imediate Start,67996688,OneTwoTrade,OneTwoTrade is expanding their Sales Team and...,"[expanding, recruiting, junior, trainee, broke..."
4,Title: RGN Nurses (Hospitals) Penarth\nWebind...,2,data/Healthcare_Nursing\Job_00543.txt,RGN Nurses (Hospitals) Penarth,71803987,Swiis Healthcare,RGN Nurses (Hospitals) Immediate fulltime and...,"[rgn, nurses, hospitals, fulltime, part, swiis..."
5,Title: Production Coordinator\nWebindex: 70322...,1,data/Engineering\Job_00089.txt,Production Coordinator,70322392,,Production Coordinator Sandbach Salary: pound...,"[production, coordinator, sandbach, pound, nda..."


## Saving required outputs
Save the vocabulary, bigrams and job advertisment txt as per spectification.
- vocab.txt

In [16]:
# 8) Save all job advertisement text and information in txt file(s) 
# export job adv description
def save_adv(input_filename,input_txt):
    out_file = open(input_filename, 'w') 
    string = "\n".join([" ".join(x) for x in input_txt])
    out_file.write(string)
    out_file.close()     
save_adv('job_adv_string.txt',tk_adv_ls5)    

# export dataframe to file
df2.to_csv('job_adv.txt', index=None, sep=' ')

# export job adv description tokenize list to file
with open('job_adv_json.txt', 'w') as filehandle:
    json.dump(tk_adv_ls5, filehandle)

In [17]:
# 9) Build a vocabulary of the cleaned job advertisement descriptions, save it in a txt file (vocab.txt)
def save_vocab(input_filename,input_txt):
    words_ls = list(chain.from_iterable(input_txt)) 
    vocab_ls = list(set(words_ls)) 
    vocab_ls.sort()
    string = "\n".join("{}:{}".format(v, i) for i, v in enumerate(vocab_ls))
    out_file = open(input_filename, 'w') 
    out_file.write(string)
    out_file.close() 
    
save_vocab('vocab.txt',tk_adv_ls5)        

## Summary

After tokenizing the job advertisement list, removal of stopwords, single character words, most common words, and words that only appear one time, the vocabulary count has changed from 9834 to 5168.  The vocabulary filter distribution as follow.
   - vocab count before filtering: 9834
   - after single character words removal: 9808 (less 26 vocab)
   - after stopwords removal: 9404 (less 404 vocab)
   - after words that only appear once removal: 5218 (less 4186 vocab)
   - after most commonn 50 words removal: 5168 (less 50 vocab)    

Through the unrelated vocabulary removal, the lexical diversity ratio has increased from 0.053 to 0.064. The lexical diversity ratio distribution is as follows.
   - vocab count before filtering: 0.053
   - after single character words removal: 0.054
   - after stopwords removal: 0.088
   - after words that only appear once removal: 0.051
   - after most commonn 50 words removal: 0.065    

