In [0]:
import pandas as pd
import numpy as np
import io

# Keras imports
import keras
from keras.preprocessing import text, sequence
from keras.utils import np_utils
from keras.models import Model
from keras.layers import Input, Dense

# NLTK imports
import nltk

from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [49]:
# df2 = pd.read_csv(io.BytesIO(uploaded['resume_data.csv']))
# Import raw csv from github
url = 'https://raw.githubusercontent.com/hsoJLeu/resumAI/master/data/resume_dataset.csv'               # original data
# url = 'https://raw.githubusercontent.com/hsoJLeu/resumAI/master/data/Clean-Culture.csv'                  # Only engineering and IT

df = pd.read_csv(url)
df.head()

Unnamed: 0,ID,Category,Resume
0,1,HR,"b'John H. Smith, P.H.R.\n800-991-5187 | PO Box..."
1,2,HR,b'Name Surname\nAddress\nMobile No/Email\nPERS...
2,3,HR,b'Anthony Brown\nHR Assistant\nAREAS OF EXPERT...
3,4,HR,b'www.downloadmela.com\nSatheesh\nEMAIL ID:\nC...
4,5,HR,"b""HUMAN RESOURCES DIRECTOR\n\xef\x82\xb7Expert..."


In [50]:
# Data cleaning

from io import StringIO
col = ['Category', 'Resume']
df = df[col]
df = df[pd.notnull(df['Resume'])]
df.columns = ['Category', 'Resume']
df['category_id'] = df['Category'].factorize()[0]
# clean category -> category Id
category_id_df = df[['Category', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Category']].values)
df

Unnamed: 0,Category,Resume,category_id
0,HR,"b'John H. Smith, P.H.R.\n800-991-5187 | PO Box...",0
1,HR,b'Name Surname\nAddress\nMobile No/Email\nPERS...,0
2,HR,b'Anthony Brown\nHR Assistant\nAREAS OF EXPERT...,0
3,HR,b'www.downloadmela.com\nSatheesh\nEMAIL ID:\nC...,0
4,HR,"b""HUMAN RESOURCES DIRECTOR\n\xef\x82\xb7Expert...",0
5,HR,"b'John H. Smith, P.H.R.\n800-991-5187 | PO Box...",0
6,HR,b'Resume of Satheesh\n\nwww.downlo\nSatheesh\n...,0
7,HR,"b""GM HR & ADMINISTRATION Resume Sample www.tim...",0
8,HR,"b""www.uaehrzone.com\n\nRobert Wales\nDubai\nUn...",0
9,HR,"b""Human Resources Coordinator Resume\nExample\...",0


In [51]:
# Separating rows here
filter_list = ['Information Technology', 'Engineering']
tech_list = df[df.Category.isin(filter_list)]
tech_list

# Alternative way 
# techies = df.loc[(df['category_id'] == 3 ) | (df['category_id'] == 17 )]

# Perhaps convert Category to numerical value for labeling




Unnamed: 0,Category,Resume,category_id
166,Information Technology,"b'RESUME\nAJITHA SHENOY .K.B,\nPhD student (Co...",3
167,Information Technology,b'Mason\t\r \xc2\xa0Silber\t\r \xc2\xa0\n6595\...,3
168,Information Technology,b'Pramod XXXX\nMobile: +91-99********\n\nE-mai...,3
169,Information Technology,"b""Harry M. Rohrer\n3748 Bee Street\nGrand Rapi...",3
170,Information Technology,"b""Wilson Kunnan Jose\nSr. Consultant, QA\n\nSu...",3
171,Information Technology,b'Resume\nFor\n\nInternship\nor\nCo-OP\nWithou...,3
172,Information Technology,b'Jan Stolarek\nInformatics Forum 5.10\n10 Cri...,3
173,Information Technology,"b'Stan Taylor\n19217 Kennemer Drive, Pflugervi...",3
174,Information Technology,b'SOFTWARE ENGINEER\nDEVELOPER RESUME\nDEVELOP...,3
175,Information Technology,b'Gary White\nPHP developer\nAREAS OF EXPERTIS...,3


In [65]:
# Pre processing with NLTK

# Get first resume
tokenizer = RegexpTokenizer(r'\w+')                                           # testing regex tokenizer 
# tokens = word_tokenize(tech_list.iloc[3, 1])                                   # word_tokenize does not handle punctuations sadly :( 
tokens = tokenizer.tokenize(tech_list.iloc[0, 1])                                # run tokenizer on resume @ [row, col]

# nty_list = [',', 'the', ':']
# for chars in token:
#   if(token[chars] == nty_list):
    
# Stem words so they'll be easier to capture later
porter = PorterStemmer()
stemmed = [porter.stem(word) for word in tokens]
print(stemmed[:500])

# Pre processing with Keras
text1 = tech_list.iloc[0,1]
print(text1)
# keras text_to_word_sequence
filter_text = keras.preprocessing.text.text_to_word_sequence(text1, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
print(filter_text)


# keras tokenizer
# text1 = keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)


['b', 'resum', 'najitha', 'shenoy', 'K', 'B', 'nphd', 'student', 'comput', 'scienc', 'nroom', 'No', 'CS', '106', 'ndepart', 'of', 'comput', 'scienc', 'niit', 'kanpur', 'nup', 'india', 'nE', 'mail', 'ajithshenoy2003', 'yahoo', 'com', 'or', 'najith', 'cse', 'iitk', 'ac', 'in', 'or', 'ajith', 'iitk', 'ac', 'in', 'nmobil', '91', '9305326504', '91', 'xe2', 'x80', 'x93', '9839791934', 'R', 'n', 'ncareer', 'object', 'To', 'pursu', 'a', 'challeng', 'career', 'and', 'be', 'part', 'of', 'a', 'progress', 'norgan', 'that', 'give', 'scope', 'to', 'enhanc', 'my', 'knowledg', 'skill', 'and', 'to', 'reach', 'the', 'pinnacl', 'nin', 'the', 'comput', 'and', 'research', 'field', 'with', 'sheer', 'determin', 'dedic', 'and', 'hard', 'work', 'n', 'nwork', 'experi', 'n', 'xef', 'x81', 'xac', 'n', 'n', 'xef', 'x82', 'xb7', 'n', 'xef', 'x82', 'xb7', 'n', 'npursu', 'research', 'in', 'the', 'depart', 'of', 'comput', 'scienc', 'and', 'engin', 'nat', 'iit', 'kanpur', 'nwork', 'as', 'a', 'lectur', 'in', 'the', 'dep

In [53]:
# another part of pre processing, not working currently
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

doc = tech_list.iloc[0, 1]
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)
normalize_corpus

<numpy.lib.function_base.vectorize at 0x7f7ce49aeef0>

In [61]:

# Word2Vec HERE
# build vocabulary of unique words
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(stemmed)
word2id = tokenizer.word_index

vocab_size = len(word2id)

# build vocabulary of unique words
word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in stemmed]

vocab_size = len(word2id)
embed_size = 100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])


AttributeError: ignored

In [0]:
# Split train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)


In [0]:
# example Keras model

model = Sequential()

model.add(Dense(32), input_shape(16))
model.