   
# Resume Parse
## 1. Import packages
  

In [1]:
import re
import string
import nltk
import time
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from matplotlib import pyplot
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from statsmodels.graphics.gofplots import qqplot


# supress sklearn warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

%matplotlib inline

## 2. Wrangle data
1. Read data
2. Clean data
    1. Remove stops
    2. Remove punctuation
3. Stem data
4. Add features
    1. Line length: total non-whitespace characters
    2. Verb percentage: portion of verbs per line
    3. Stop word percentage: portion of stop words per line
    4. Punctuation percentage: portion of punction per line

In [2]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()
verb_tags = ['VB','VBG','VBD', 'VBN','VBN-HL','VERB']
adj_tags = ['JJ', 'JJR','JJS']
data = pd.read_csv('../dev/raw_in.csv', sep=',')

data


Unnamed: 0,line,label
0,Bernard Foster,Name
1,"Smyrna, GA 30082 ...",Contact
2,Bernard Foster is a Solutions Architect with a...,Summary
3,Professional Experience,Heading
4,"Research 1, Inc S...",Unknown
...,...,...
1001,November 2003 - March 2007,Date
1002,Earned $2344.00 in cash awards and 16 vacation...,Bullet
1003,EDUCATION,Heading
1004,"Bachelor of Science, Business Information Syst...",Degree


## Clean and Create Features

In [3]:
def remove_stopwords(text):
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+',text)
    text = [word for word in tokens if word not in stopwords]
    return text

def get_stems(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

def count_words(text):
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+',text)
    return len(tokens)

def count_verbs(pos_tags: list) -> int:
    count = sum([1 for pair in pos_tags if pair[1] in verb_tags])
    return round(count/(len(pos_tags)),3)*100

def count_adj(pos_tags: list) -> int:
    count = sum([1 for pair in pos_tags if pair[1] in adj_tags])
    return round(count/(len(pos_tags)),3)*100

def count_nums(pos_tags: list) -> int:
    count = sum([1 for pair in pos_tags if pair[1] == 'CD'])
    return round(count/(len(pos_tags)),3)*100

def count_proper_nouns(pos_tags: list) -> int:
    count = sum([1 for pair in pos_tags if pair[1] == 'NNP'])
    return round(count/(len(pos_tags)),3)*100

def count_stopwords(line: str) -> int:
    count = sum([1 for word in nltk.word_tokenize(line) if word in nltk.corpus.stopwords.words('english')])
    return round(count/(len(line) - line.count(' ')),3)*100

def count_punct(line: str) -> int:
    count = sum([1 for char in line if char in string.punctuation])
    return round(count/(len(line) - line.count(' ')), 3)*100

# Remove stop words
data['line_nostop'] = data['line'].apply(lambda x: remove_stopwords(x.lower()))

# Collect the word stems
data['line_stemmed'] = data['line_nostop'].apply(lambda x: get_stems(x))

# Find line length
data['line_length'] = data['line'].apply(lambda x: len(x) - x.count(' '))

# Find word count per line
data['word_count'] = data['line'].apply(lambda x: count_words(x))

# Tag each line with parts of speach
data['tagged_line'] = data['line'].apply(lambda x: nltk.pos_tag(nltk.word_tokenize(x)))

# Find the percentage of verbs in each line
data['verb_percentage'] = data['tagged_line'].apply(lambda x: count_verbs(x))

# Find the percentage of adjectives in each line
data['adj_percentage'] = data['tagged_line'].apply(lambda x: count_adj(x))

# Find the percentage of stop words in each line
data['stopword_percentage'] = data['line'].apply(lambda x: count_stopwords(x))

# Find the percentage of punctuation in each line
data['punctuation_percentage'] = data['line'].apply(lambda x: count_punct(x))

# Find the percentage of numbers in each line
data['number_percentage'] = data['tagged_line'].apply(lambda x: count_nums(x))

# Find the percentage of numbers in each line
data['proper_noun_percentage'] = data['tagged_line'].apply(lambda x: count_proper_nouns(x))

# Turn muiltiple class labels into one-vs-all-type binary labels (1 or 0)
# with the label name as the column name

lb = LabelBinarizer()

data = data.join(pd.DataFrame(lb.fit_transform(data['label']),
                          columns=lb.classes_, 
                          index=data.index))

data.head(10)

Unnamed: 0,line,label,line_nostop,line_stemmed,line_length,word_count,tagged_line,verb_percentage,adj_percentage,stopword_percentage,...,Interest,Location,Name,Organization,Skill,Summary,Title,Training,Unknown,Volunteer
0,Bernard Foster,Name,"[bernard, foster]","[bernard, foster]",13,2,"[(Bernard, NNP), (Foster, NNP)]",0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
1,"Smyrna, GA 30082 ...",Contact,"[smyrna, ga, 30082, cell, 404, 6616742, bernar...","[smyrna, ga, 30082, cell, 404, 6616742, bernar...",57,7,"[(Smyrna, NNP), (,, ,), (GA, NNP), (30082, CD)...",0.0,7.7,0.0,...,0,0,0,0,0,0,0,0,0,0
2,Bernard Foster is a Solutions Architect with a...,Summary,"[bernard, foster, solutions, architect, vast, ...","[bernard, foster, solut, architect, vast, amou...",698,104,"[(Bernard, NNP), (Foster, NNP), (is, VBZ), (a,...",4.1,4.1,4.9,...,0,0,0,0,0,1,0,0,0,0
3,Professional Experience,Heading,"[professional, experience]","[profession, experi]",22,2,"[(Professional, JJ), (Experience, NN)]",0.0,50.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,"Research 1, Inc S...",Unknown,"[research, 1, inc, salesforce, administratorco...","[research, 1, inc, salesforc, administratorcon...",67,9,"[(Research, NN), (1, CD), (,, ,), (Inc, NNP), ...",10.0,0.0,1.5,...,0,0,0,0,0,0,0,0,1,0
5,As a Salesforce Administrator in Research 1's ...,Bullet,"[salesforce, administrator, research, 1s, info...","[salesforc, administr, research, 1s, inform, t...",476,66,"[(As, IN), (a, DT), (Salesforce, NNP), (Admini...",6.6,5.3,3.4,...,0,0,0,0,0,0,0,0,0,0
6,• Served as the Salesforce Administrator for R...,Bullet,"[, served, salesforce, administrator, research...","[, serv, salesforc, administr, research, 1, de...",413,69,"[(•, RB), (Served, NNP), (as, IN), (the, DT), ...",2.6,9.1,5.3,...,0,0,0,0,0,0,0,0,0,0
7,• Responsible for gathering requirements via d...,Bullet,"[, responsible, gathering, requirements, via, ...","[, respons, gather, requir, via, document, que...",305,42,"[(•, NN), (Responsible, NNP), (for, IN), (gath...",7.8,0.0,3.0,...,0,0,0,0,0,0,0,0,0,0
8,• Responsible for troubleshooting and resolvin...,Bullet,"[, responsible, troubleshooting, resolving, te...","[, respons, troubleshoot, resolv, technic, iss...",355,56,"[(•, NN), (Responsible, NNP), (for, IN), (trou...",11.7,11.7,5.1,...,0,0,0,0,0,0,0,0,0,0
9,Oracle ...,Unknown,"[oracle, solutions, architect, august, 2013, p...","[oracl, solut, architect, august, 2013, present]",43,7,"[(Oracle, NNP), (Solutions, NNP), (Architect, ...",14.3,0.0,2.3,...,0,0,0,0,0,0,0,0,1,0


In [4]:
# Data types
data.dtypes

line                       object
label                      object
line_nostop                object
line_stemmed               object
line_length                 int64
word_count                  int64
tagged_line                object
verb_percentage           float64
adj_percentage            float64
stopword_percentage       float64
punctuation_percentage    float64
number_percentage         float64
proper_noun_percentage    float64
Award                       int64
Bullet                      int64
Certification               int64
Contact                     int64
Date                        int64
Degree                      int64
Heading                     int64
Interest                    int64
Location                    int64
Name                        int64
Organization                int64
Skill                       int64
Summary                     int64
Title                       int64
Training                    int64
Unknown                     int64
Volunteer     

In [5]:
# See distrobution of target variable ('Bullet')
data['Bullet'].value_counts()

0    627
1    379
Name: Bullet, dtype: int64

In [6]:
# Drop non-numeric columns


cat_feat = ['line', 'label','line_nostop','line_stemmed','tagged_line']
numeric_only = data.drop(cat_feat, axis=1)
numeric_only

Unnamed: 0,line_length,word_count,verb_percentage,adj_percentage,stopword_percentage,punctuation_percentage,number_percentage,proper_noun_percentage,Award,Bullet,...,Interest,Location,Name,Organization,Skill,Summary,Title,Training,Unknown,Volunteer
0,13,2,0.0,0.0,0.0,0.0,0.0,100.0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,57,7,0.0,7.7,0.0,10.5,23.1,30.8,0,0,...,0,0,0,0,0,0,0,0,0,0
2,698,104,4.1,4.1,4.9,2.7,2.5,12.3,0,0,...,0,0,0,0,0,1,0,0,0,0
3,22,2,0.0,50.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,67,9,10.0,0.0,1.5,3.0,20.0,40.0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,22,4,0.0,0.0,0.0,4.5,40.0,40.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1002,62,12,7.1,0.0,4.8,4.8,14.3,0.0,0,1,...,0,0,0,0,0,0,0,0,0,0
1003,9,1,0.0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1004,73,10,0.0,0.0,1.4,5.5,14.3,42.9,0,0,...,0,0,0,0,0,0,0,0,0,0


## Write out all data sets
Make a copy of all data sets. 
Separate numeric sets for ease of use in feature engineering


In [7]:
data.to_csv('../output/clean_data.csv')
numeric_only.to_csv('../output/numeric_only.csv',index=False)