In [1]:
#import the necessary packages
import numpy as np
import pandas as pd
import nltk
import string

#nltk.download('stopwords')
#nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize


### Read in EAE document
Read in EAE Data

In [2]:
# Load the application data scraped from the EAE PDF; the first CSV column is used as the index
df_merge = pd.read_csv(filepath_or_buffer='final_merged.csv', index_col=0)


### Prepare data for model scoring
Tokenize and lower-case the write-ups, remove punctuation, numbers, stopwords and common words, then stem the remaining words.

In [3]:
# Lower-case every write-up and split it into word tokens.
# The column is overwritten, so this cell only works once per load: on a second
# run the values are already lists, which have no .lower().
df_merge['Final Writeup'] = df_merge['Final Writeup'].map(
    lambda writeup: word_tokenize(writeup.lower())
)
df_merge.head()

Unnamed: 0,UID,Choice,Course Code,Final Writeup
0,C35A1W1,1,C35,"[i, believe, in, this, technological, advanced..."
1,C35A2W1,1,C35,"[the, business, and, financial, technology, co..."
2,C35A3W1,1,C35,"[i, fancy, numbers, and, i, always, want, to, ..."
3,C35A4W1,1,C35,"[i, foresee, that, there, is, a, growth, and, ..."
4,C35A5W1,1,C35,"[we, are, into, high, technology, era, where, ..."


In [4]:
# This import rebinds the name PorterStemmer to gensim's implementation,
# shadowing the NLTK class of the same name imported in the first cell.
from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()

# Common words that are treated as additional stop words
common_words = ['school', 'course', 'also', 'secondary']

# NLTK's English stop words extended with the common words above
stopwords_english = set(stopwords.words('english')) | set(common_words)

def clean(doc, stop_words=None, stemmer=None):
    """Filter and stem a list of tokens.

    A token is skipped when it is in ``stop_words`` or consists only of
    digits. Punctuation characters are then stripped from the surviving
    tokens, results shorter than two characters are discarded, and the rest
    are stemmed.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document.
    stop_words : container of str, optional
        Tokens to drop. Defaults to the module-level ``stopwords_english``.
    stemmer : object with a ``stem(str) -> str`` method, optional
        Defaults to the module-level ``porter_stemmer``.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.

    Notes
    -----
    The stop-word / digit test runs BEFORE punctuation is stripped, so a token
    such as "1,000" is kept (as "1000") and a token that only becomes a stop
    word once its punctuation is removed is kept as well.
    NOTE(review): this order is left unchanged because the saved dictionaries
    and model were presumably built with the same preprocessing - confirm
    before altering it.
    """
    if stop_words is None:
        stop_words = stopwords_english
    if stemmer is None:
        stemmer = porter_stemmer

    # Build the punctuation-deleting table once instead of scanning
    # string.punctuation for every character of every token.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    all_words_clean = []
    for word in doc:
        # remove stopwords and numbers
        if word in stop_words or word.isdigit():
            continue
        # remove punctuation
        punc_free = word.translate(strip_punctuation)
        if len(punc_free) >= 2:
            # stem word to root word
            all_words_clean.append(stemmer.stem(punc_free))

    return all_words_clean

# Replace each token list with its filtered, stemmed version
df_merge['Final Writeup'] = df_merge['Final Writeup'].apply(clean)
df_merge.head()

Unnamed: 0,UID,Choice,Course Code,Final Writeup
0,C35A1W1,1,C35,"[believ, technolog, advanc, world, busi, longe..."
1,C35A2W1,1,C35,"[busi, financi, technolog, caught, ey, wish, l..."
2,C35A3W1,1,C35,"[fanci, number, alwai, want, know, much, profi..."
3,C35A4W1,1,C35,"[forese, growth, demand, busi, near, futur, mo..."
4,C35A5W1,1,C35,"[high, technolog, era, peopl, us, comput, tabl..."


Generate Bigrams for Writeup

In [5]:
def get_bigram(text):
    """Return the bigrams of ``text`` as space-joined strings.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document, in order.

    Returns
    -------
    list of str
        One ``"first second"`` string per pair of adjacent tokens; empty when
        ``text`` has fewer than two tokens.
    """
    tokens = list(text)
    # Pair every token with its successor; zip stops at the shorter sequence,
    # so inputs with fewer than two tokens give no pairs.
    return [" ".join(pair) for pair in zip(tokens, tokens[1:])]

# Derive the bigram list of every cleaned write-up into its own column
df_merge['Bigram Writeup'] = df_merge['Final Writeup'].apply(get_bigram)
df_merge.head()

Unnamed: 0,UID,Choice,Course Code,Final Writeup,Bigram Writeup
0,C35A1W1,1,C35,"[believ, technolog, advanc, world, busi, longe...","[believ technolog, technolog advanc, advanc wo..."
1,C35A2W1,1,C35,"[busi, financi, technolog, caught, ey, wish, l...","[busi financi, financi technolog, technolog ca..."
2,C35A3W1,1,C35,"[fanci, number, alwai, want, know, much, profi...","[fanci number, number alwai, alwai want, want ..."
3,C35A4W1,1,C35,"[forese, growth, demand, busi, near, futur, mo...","[forese growth, growth demand, demand busi, bu..."
4,C35A5W1,1,C35,"[high, technolog, era, peopl, us, comput, tabl...","[high technolog, technolog era, era peopl, peo..."


### Load Dictionary 
Load the dictionaries of unique unigram and bigram terms.

In [6]:
import pickle

# NOTE: unpickling can execute arbitrary code; only load dictionary files from a trusted source.
# The context managers make sure each file handle is closed once its dictionary is loaded.
with open('unigram_dict.sav', 'rb') as unigram_file:
    unigram_dict = pickle.load(unigram_file)
with open('bigram_dict.sav', 'rb') as bigram_file:
    bigram_dict = pickle.load(bigram_file)

In [7]:
# Unigram dictionary: report its size and a small sample of its entries
unigram_token_ids = unigram_dict.token2id
print("Total unique words:")
print(len(unigram_token_ids))
print("\nSample data from dictionary:")
# Show only the first 4 (word, id) pairs, in the mapping's iteration order
for position, (token, token_id) in enumerate(unigram_token_ids.items()):
    if position >= 4:
        break
    print("Word: {} - ID: {} ".format(token, token_id))

Total unique words:
6483

Sample data from dictionary:
Word: activ - ID: 0 
Word: advanc - ID: 1 
Word: afternoonworth - ID: 2 
Word: analyt - ID: 3 


In [8]:
# Bigram dictionary: report its size and a small sample of its entries
bigram_token_ids = bigram_dict.token2id
print("Total unique words:")
print(len(bigram_token_ids))
print("\nSample data from dictionary:")
# Show only the first 4 (term, id) pairs, in the mapping's iteration order
for position, (token, token_id) in enumerate(bigram_token_ids.items()):
    if position >= 4:
        break
    print("Word: {} - ID: {} ".format(token, token_id))

Total unique words:
55183

Sample data from dictionary:
Word: activ centr - ID: 0 
Word: activ interact - ID: 1 
Word: activ member - ID: 2 
Word: advanc ecommerc - ID: 3 


### Generate BOW vectors
Each tokenized write-up is represented by an array holding the frequency of every dictionary word in that write-up.

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the course code.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2, so try the new name first and fall back to the old one.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
cse_df = pd.DataFrame(encoder.fit_transform(df_merge[['Course Code']]), index=df_merge.index)

# Build the "x0_<category>" column names from the fitted categories rather than
# get_feature_names(), which newer scikit-learn releases no longer provide.
cse_df.columns = ['x0_' + str(category) for category in encoder.categories_[0]]

# NOTE(review): the encoder is fitted on the data being scored, so the number and
# order of these columns depend on which course codes occur in this file -
# confirm they match the columns the saved model expects.

print(cse_df.shape)
display(cse_df.isnull().sum())
cse_df.head()

(1345, 6)


x0_C35    0
x0_C36    0
x0_C43    0
x0_C54    0
x0_C80    0
x0_C85    0
dtype: int64

Unnamed: 0,x0_C35,x0_C36,x0_C43,x0_C54,x0_C80,x0_C85
0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# feature_set is the same object as cse_df (no copy is made), so the columns
# added below also appear on cse_df.
feature_set = cse_df
for feature_column, source_column in (
    ('x_choice', 'Choice'),
    ('x_writeup', 'Final Writeup'),
    ('x_bigram_writeup', 'Bigram Writeup'),
):
    feature_set[feature_column] = df_merge[source_column]

x = feature_set

# Sanity checks: shape, null count per column, first rows
print(x.shape)
display(x.isnull().sum())
display(x.head())

(1345, 9)


x0_C35              0
x0_C36              0
x0_C43              0
x0_C54              0
x0_C80              0
x0_C85              0
x_choice            0
x_writeup           0
x_bigram_writeup    0
dtype: int64

Unnamed: 0,x0_C35,x0_C36,x0_C43,x0_C54,x0_C80,x0_C85,x_choice,x_writeup,x_bigram_writeup
0,1.0,0.0,0.0,0.0,0.0,0.0,1,"[believ, technolog, advanc, world, busi, longe...","[believ technolog, technolog advanc, advanc wo..."
1,1.0,0.0,0.0,0.0,0.0,0.0,1,"[busi, financi, technolog, caught, ey, wish, l...","[busi financi, financi technolog, technolog ca..."
2,1.0,0.0,0.0,0.0,0.0,0.0,1,"[fanci, number, alwai, want, know, much, profi...","[fanci number, number alwai, alwai want, want ..."
3,1.0,0.0,0.0,0.0,0.0,0.0,1,"[forese, growth, demand, busi, near, futur, mo...","[forese growth, growth demand, demand busi, bu..."
4,1.0,0.0,0.0,0.0,0.0,0.0,1,"[high, technolog, era, peopl, us, comput, tabl...","[high technolog, technolog era, era peopl, peo..."


In [16]:
import gensim

# Number of entries in the unigram dictionary; used as the length of each unigram BOW vector
vocab_len = len(unigram_dict)

def get_bow_features(X_test, stemmed_tokens, dictionary=None):
    """Build one dense bag-of-words count vector per row of ``X_test``.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Frame holding a column of token lists.
    stemmed_tokens : str
        Name of the column in ``X_test`` that holds the token lists.
    dictionary : optional
        Object providing ``doc2bow(tokens) -> [(token_id, count), ...]`` and
        ``len()``. Defaults to the module-level ``unigram_dict``.

    Returns
    -------
    list of numpy.ndarray
        One float64 vector of length ``len(dictionary)`` per row, in row
        order; position ``i`` holds the count of the token with id ``i``.
        Tokens unknown to the dictionary are not returned by ``doc2bow`` and
        so contribute nothing.
    """
    if dictionary is None:
        dictionary = unigram_dict
    num_terms = len(dictionary)

    test_features = []
    # Only the token column is needed, so iterate it directly instead of
    # building a full row object for every row with iterrows().
    for tokens in X_test[stemmed_tokens]:
        features = np.zeros(num_terms, dtype=np.float64)
        for token_id, count in dictionary.doc2bow(tokens):
            features[token_id] = count
        test_features.append(features)
    return test_features

# Comma-separated unigram tokens in id order; split again later to name the BOW columns
header = ",".join(map(str, (unigram_dict[token_id] for token_id in range(vocab_len))))

In [17]:
# Unigram BOW counts: one column per dictionary token, aligned to x's index
unigram_features = pd.DataFrame(
    get_bow_features(x, 'x_writeup'),
    columns=header.split(','),
    index=x.index,
)

# Join the counts onto the existing features by index and drop the raw token lists.
# x is rebound below, so this cell cannot be re-run without rebuilding x first.
x_features = x.merge(unigram_features, left_index=True, right_index=True).drop('x_writeup', axis=1)

x = x_features
x.head()

Unnamed: 0,x0_C35,x0_C36,x0_C43,x0_C54,x0_C80,x0_C85,x_choice,x_bigram_writeup,activ,advanc,...,capabilitiesalso,civilian,developedi,logisticsa,riffl,safekeep,squar,technologycomput,trainingi,tricksdur
0,1.0,0.0,0.0,0.0,0.0,0.0,1,"[believ technolog, technolog advanc, advanc wo...",3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1,"[busi financi, financi technolog, technolog ca...",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1,"[fanci number, number alwai, alwai want, want ...",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1,"[forese growth, growth demand, demand busi, bu...",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1,"[high technolog, technolog era, era peopl, peo...",1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Number of entries in the bigram dictionary; used as the length of each bigram BOW vector
bigram_vocab_len = len(bigram_dict)

def get_bow_bigram_features(X_test, stemmed_tokens, dictionary=None):
    """Build one dense bag-of-bigrams count vector per row of ``X_test``.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Frame holding a column of bigram lists.
    stemmed_tokens : str
        Name of the column in ``X_test`` that holds the bigram lists.
    dictionary : optional
        Object providing ``doc2bow(tokens) -> [(token_id, count), ...]`` and
        ``len()``. Defaults to the module-level ``bigram_dict``.

    Returns
    -------
    list of numpy.ndarray
        One float64 vector of length ``len(dictionary)`` per row, in row
        order; position ``i`` holds the count of the term with id ``i``.
        Terms unknown to the dictionary are not returned by ``doc2bow`` and
        so contribute nothing.
    """
    if dictionary is None:
        dictionary = bigram_dict
    num_terms = len(dictionary)

    test_features = []
    # Only the bigram column is needed, so iterate it directly instead of
    # building a full row object for every row with iterrows().
    for tokens in X_test[stemmed_tokens]:
        features = np.zeros(num_terms, dtype=np.float64)
        for token_id, count in dictionary.doc2bow(tokens):
            features[token_id] = count
        test_features.append(features)
    return test_features

# Comma-separated bigram terms in id order; split again later to name the BOW columns
header2 = ",".join(map(str, (bigram_dict[term_id] for term_id in range(bigram_vocab_len))))

In [19]:
# Bigram BOW counts for the data being scored: one column per dictionary term, aligned to x's index
bigram_features = pd.DataFrame(
    get_bow_bigram_features(x, 'x_bigram_writeup'),
    columns=header2.split(','),
    index=x.index,
)

# Join the counts onto the existing features by index and drop the raw bigram lists.
# x is rebound below, so this cell cannot be re-run without rebuilding x first.
x_features = x.merge(bigram_features, left_index=True, right_index=True).drop('x_bigram_writeup', axis=1)

x = x_features
x.head()

Unnamed: 0,x0_C35,x0_C36,x0_C43,x0_C54,x0_C80,x0_C85,x_choice,activ,advanc,afternoonworth,...,scammer fall,serangoon commun,solut developedi,spot scammer,squad on,squar drill,ssgt role,technologycomput platform,trainingi particip,tricksdur speech
0,1.0,0.0,0.0,0.0,0.0,0.0,1,3.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Load Text Classification Model and Score data
Load the finalized text classification model and score the EAE data.

In [20]:
import pickle

filename = 'finalized_model.sav'
# Load the model from disk.
# NOTE: unpickling can execute arbitrary code; only load a model file you trust.
# The context manager makes sure the file handle is closed once the model is loaded.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# One predicted label per row of the feature matrix, keeping x's index for the join below
results = pd.DataFrame(loaded_model.predict(x), columns=['predict_recommend'], index=x.index)

# NOTE(review): the predictions are joined by index onto a different CSV than the
# one the features were built from - presumably both files share the same row
# index; confirm.
final = pd.read_csv('final_merged_spellcorrect.csv', index_col=0)
final = pd.merge(final, results, left_index=True, right_index=True)
final.head()

Unnamed: 0,UID,Choice,Course Code,Recommended,Final Writeup,predict_recommend
0,C35A1W1,1,C35,N,I believe in this technological advanced world...,0
1,C35A2W1,1,C35,N,The Business and Financial Technology course h...,0
2,C35A3W1,1,C35,Y,I fancy numbers and I always want to know how ...,0
3,C35A4W1,1,C35,Y,I foresee that there is a growth and demand in...,1
4,C35A5W1,1,C35,N,We are into high technology era where people u...,0


In [21]:
# Write the merged frame, including the predict_recommend column, to CSV
final.to_csv('final_merged_predict.csv')