In [1]:
import re
import warnings

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn
import spacy
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

warnings.filterwarnings('ignore')

In [2]:
# Read the resume dataset and discard rows without resume text; the
# 'Resume_str' column is the only input the rest of the pipeline consumes.
df = pd.read_csv('resume.csv').dropna(subset=['Resume_str'])

In [3]:
def _clean_resume_text(text):
    """Normalise one raw resume string before tokenisation.

    Applies, in order: collapse every whitespace run to a single space,
    delete apostrophes, and replace any token containing a digit with a
    single space.

    All patterns are raw strings so that ``\\s``, ``\\w`` and ``\\d`` reach the
    regex engine unchanged instead of being parsed as invalid Python string
    escapes.
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"\'", "", text)
    text = re.sub(r'\w*\d\w*', ' ', text)
    return text


# One cleaned string per resume, produced in a single pass over the column.
data = [_clean_resume_text(sent) for sent in df.Resume_str.values.tolist()]

In [4]:
# Tokenise every cleaned resume with gensim; each entry of data_words is the
# token list for one resume, in the same order as `data`.
data_words = [
    gensim.utils.simple_preprocess(str(resume_text), deacc=True)
    for resume_text in data
]

In [5]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are re-joined with spaces, run through the
    module-level spaCy pipeline ``nlp`` (it must be assigned before this
    function is called), and reduced to the lemmas of the tokens whose
    ``pos_`` is listed in ``allowed_postags``.

    Args:
        texts: Iterable of documents, each an iterable of token strings.
        allowed_postags: Collection of ``token.pos_`` values to keep.
            The default is an immutable tuple so it cannot be altered
            between calls; lists passed by callers work as before.

    Returns:
        A list with one space-separated string of lemmas per input document.
    """
    joined_docs = (" ".join(sent) for sent in texts)

    texts_out = []
    # nlp.pipe streams the documents through the pipeline in batches rather
    # than invoking the full pipeline once per document.
    for doc in nlp.pipe(joined_docs):
        lemmas = [
            # A '-PRON-' lemma is blanked rather than dropped, so it still
            # contributes an (empty) slot to the joined output string.
            '' if token.lemma_ in ['-PRON-'] else token.lemma_
            for token in doc
            if token.pos_ in allowed_postags
        ]
        texts_out.append(" ".join(lemmas))
    return texts_out

In [6]:
# Load the small English spaCy model with the parser and NER components
# switched off; lemmatization() only reads token.pos_ and token.lemma_.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

# Reduce every tokenised resume to a string of noun and verb lemmas.
data_lemmatized = lemmatization(
    data_words,
    allowed_postags=['NOUN', 'VERB'],
)

In [7]:
# Bag-of-words model over the lemmatised resumes.
vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    # Tokens are runs of at least three ASCII letters.
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    # Document-frequency bounds passed to CountVectorizer: an upper
    # proportion (0.5) and a lower absolute count (10).
    max_df=0.5,
    min_df=10,
)

# Learn the vocabulary and build the document-term matrix in one step.
data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [None]:
# 20-topic LDA with a fixed seed for reproducibility.
lda_model = LatentDirichletAllocation(n_components=20, random_state=20)

# fit_transform() both trains the model and returns the document-topic
# matrix, so a separate fit() beforehand would only train the model a second
# time and throw the first result away.
lda_output = lda_model.fit_transform(data_vectorized)

In [None]:
# Evaluate the fitted model on the same document-term matrix it was trained on.
log_likelihood = lda_model.score(data_vectorized)
perplexity = lda_model.perplexity(data_vectorized)

print("Log Likelihood: ", log_likelihood)
print("Perplexity: ", perplexity)

# Last expression of the cell: show the estimator's parameters.
lda_model.get_params()

In [None]:
# Build the pyLDAvis payload from the fitted model, the document-term matrix
# and the vectorizer that produced it.
vis_data = pyLDAvis.sklearn.prepare(
    lda_model,
    data_vectorized,
    vectorizer,
)

# Render the visualisation inline as the cell output.
# To write it to a standalone page instead:
#     pyLDAvis.save_html(vis_data, 'lda.html')
pyLDAvis.display(vis_data)

In [None]:
# Resolve the vocabulary once instead of once per keyword. Prefer
# get_feature_names_out() where the installed scikit-learn provides it and
# fall back to the older get_feature_names() otherwise.
_get_feature_names = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
feature_names = _get_feature_names()

# Number of highest-weighted words kept per topic.
n_top_words = 10

# Maps "Topic<i>" to that topic's top words, highest weight first.
topics_dict = {}
for topic_idx, topic in enumerate(lda_model.components_):
    # argsort is ascending, so walk the last n_top_words indices backwards.
    top_word_ids = topic.argsort()[:-n_top_words - 1:-1]
    topics_dict["Topic" + str(topic_idx)] = [feature_names[i] for i in top_word_ids]

In [None]:
# topics_dict is keyed by topic, so the frame is built column-per-topic and
# then transposed to get one row per topic and one column per keyword rank.
topics_to_df = pd.DataFrame(topics_dict).transpose()
topics_to_df

In [None]:
# Hand-assigned labels, one per LDA topic, matched to topics by position.
# NOTE(review): the labels presumably come from inspecting the keywords of one
# particular fit, so they need re-checking whenever the model is retrained.
# NOTE(review): 'Engineerer' and 'Heathcare' look like typos; left as-is
# because these strings are emitted at runtime.
topics = [
    'Politics',
    'Aviation',
    'Fitness',
    'Services',
    'Teacher',
    'ProductControl',
    'IT',
    'Finance',
    'Construction',
    'HR',
    'Arts',
    'Developer',
    'Engineerer',
    'Accounting',
    'Chefs',
    'Heathcare',
    'Warehouse',
    'Research',
    'Sales',
    'Marketing',
]

# Attach the labels as an extra column next to the keyword columns.
topics_to_df["Topics"] = topics
topics_to_df

In [None]:
# Cluster the resumes in topic space; 20 clusters mirrors the 20 LDA topics.
# (KMeans is imported in the notebook's top import cell.)
clusters = KMeans(n_clusters=20, random_state=20).fit_predict(lda_output)

# Project the document-topic matrix onto two components for plotting.
# random_state is pinned so the projection is reproducible across runs, in
# line with the seeded LDA and KMeans steps.
svd_model = TruncatedSVD(n_components=2, random_state=20)
lda_output_svd = svd_model.fit_transform(lda_output)

# Scatter-plot coordinates: first and second SVD component.
x = lda_output_svd[:, 0]
y = lda_output_svd[:, 1]

print("Weights:", np.round(svd_model.components_, 2))

print("Variance:", np.round(svd_model.explained_variance_ratio_, 2))

In [None]:
# Scatter of the two SVD components, coloured by KMeans cluster id.
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(x, y, c=clusters)
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_title("SVD of Topic Clusters")

In [None]:
# Not used below in this cell; kept because it is a module-level name that
# other cells may reference.
topic_names = ['Topic' + str(i) for i in range(lda_model.n_components)]

# Look the vocabulary up once. Prefer get_feature_names_out() where the
# installed scikit-learn provides it and fall back to get_feature_names().
_get_feature_names = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
keywords = np.array(list(_get_feature_names()))

# Number of highest-weighted words kept per topic.
n_top_words = 10

# One array of top words per topic; negating the weights makes the ascending
# argsort return the largest weights first.
top_words_per_topic = [
    keywords.take((-topic_weights).argsort()[:n_top_words])
    for topic_weights in lda_model.components_
]

# Rows are topics, columns are keyword ranks, plus a final "Topics" column
# holding the hand-assigned label for each topic.
topic_keywords = pd.DataFrame(top_words_per_topic)
topic_keywords.columns = ['Word ' + str(i) for i in range(topic_keywords.shape[1])]
topic_keywords.index = ['Topic ' + str(i) for i in range(topic_keywords.shape[0])]

topic_keywords["Topics"] = topics

In [None]:
# Same pipeline configuration as the one used to lemmatise the training
# corpus: only token.pos_ / token.lemma_ are read, so parser and NER stay off.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def predict_topic(text, nlp=nlp):
    """Infer the dominant LDA topic for one or more pieces of text.

    The input goes through the same steps as the training corpus
    (gensim tokenisation, noun/verb lemmatisation, count vectorisation)
    before being scored by the fitted ``lda_model``.

    Args:
        text: A single string, or an iterable of strings. Each string is
            scored as its own document.
        nlp: Kept for backward compatibility. ``lemmatization()`` reads the
            module-level ``nlp`` pipeline, so this argument is not consulted.

    Returns:
        A tuple ``(infer_topic, topic, topic_scores)``:
        ``infer_topic`` is the label stored in the last column of
        ``topic_keywords`` for the winning topic; ``topic`` is that topic's
        keyword list (every column except the label); ``topic_scores`` is the
        matrix returned by ``lda_model.transform`` with one row per input
        string.

    Raises:
        ValueError: If ``text`` contains no documents.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(text, str):
        text = [text]
    text = list(text)
    if not text:
        raise ValueError("text must contain at least one document")

    words = [
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in text
    ]

    # Keep the same POS tags that were used to build the training corpus so
    # inference sees the same kind of tokens the vocabulary was learned from.
    lemm = lemmatization(words, allowed_postags=['NOUN', 'VERB'])
    vector = vectorizer.transform(lemm)

    topic_scores = lda_model.transform(vector)

    # topic_scores is 2-D (documents x topics). A plain argmax returns an index
    # into the flattened matrix, so convert it back to a topic (column) index;
    # this picks the topic with the single highest score over all documents.
    _, best_topic = np.unravel_index(np.argmax(topic_scores), topic_scores.shape)

    best_row = topic_keywords.iloc[best_topic]
    # Every column except the trailing label column is a keyword.
    topic = best_row.iloc[:-1].values.tolist()
    infer_topic = best_row.iloc[-1]

    return infer_topic, topic, topic_scores

In [None]:
# Sample inputs for a quick sanity check; each entry is a one-element list
# because predict_topic() iterates over the strings it is given.
mytext = [
    ["Managing schedules for interviews and deep search for potential worker."],
    ["Head chef for 8 years and cook over 100 dishes a day under fast paced environment."],
    ["Developing software on daily bases with tight deadlines weekly."],
    ["Objective : Competent, compassionate, and empathetic Staff Nurse with background experience in mental health. Meets responsibility in patient safety, medication safety, coordination of care, and prioritization. Works well under pressure, persistent, determined, and goal oriented. Emotional stability to cope with human suffering, emergencies and other stresses. Offering leadership qualities with a positive attitude. Motivated, hardworking, organized, focused and dedicated. To embrace a career opportunity where my healthcare background and education would be conducive to achieving all goals."],
]

# For every sample print a separator, the winning topic's keywords and its
# label, each on its own line.
for text in mytext:
    infer_topic, topic, prob_scores = predict_topic(text)
    print("------------", topic, infer_topic, sep="\n")