In [1]:
import pandas as pd
import os
from bs4 import BeautifulSoup
import requests
import re
import nltk
import textcleaner
import pickle
import spacy
import jieba

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob
import textcleaner as tc
from nltk.stem import WordNetLemmatizer 
import re
import spacy
from gensim.parsing.preprocessing import preprocess_string

from src.models import display_topics

In [2]:
# Re-import edited modules automatically before each cell is executed,
# so changes to the project package (src.*) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
! pip install --editable ..

Obtaining file:///Users/greenapple/project4
Installing collected packages: src
  Found existing installation: src 0.1.0
    Uninstalling src-0.1.0:
      Successfully uninstalled src-0.1.0
  Running setup.py develop for src
Successfully installed src


## Load data 

In [4]:
# Load tokenized text
# The context manager closes the file handle as soon as the frame has been read.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.
with open('/Users/greenapple/project4/data/processed/jobs_tokenized.pkl', 'rb') as jobs_pickle:
    jobs = pickle.load(jobs_pickle)

In [5]:
# (rows, columns) of the loaded jobs frame
jobs.shape

(8939, 12)

In [7]:
# Column labels of the loaded jobs frame
jobs.columns

Index(['company_name', 'description', 'job_title', 'link', 'location',
       'salary', 'type', 'clean_text', 'noun', 'noun_lemma', 'lemma', 'word'],
      dtype='object')

## Topic modeling

In [6]:
# Data scientist/analyst jobs only: keep the rows whose `type` label is 'positive'
jobs_d = jobs.loc[jobs['type'].eq('positive')]

# Subset size next to the full frame size
(jobs_d.shape, jobs.shape)

((5058, 12), (8939, 12))

In [7]:
# Column labels of the jobs frame (the token columns used by the models below are among them)
jobs.columns

Index(['company_name', 'description', 'job_title', 'link', 'location',
       'salary', 'type', 'clean_text', 'noun', 'noun_lemma', 'lemma', 'word'],
      dtype='object')

### Noun as a token

#### TF-IDF and LSA with Noun as token

In [55]:
from src.models.stop_words_list_1 import stop_words_1

In [60]:
# Model name: ***noun_tf_idf_LSA_1***
# Tokenizer: noun
# Vectorizer: TF-IDF
# Dimensionality reduction: LSA (TruncatedSVD)
# Corpus: jobs_d (rows of `jobs` whose type is 'positive')

# Unigrams + bigrams; terms found in more than 70% or fewer than 0.5% of the documents are ignored.
tf_idf_vc_1 = TfidfVectorizer(ngram_range=(1, 2),
                              max_df=0.7,
                              min_df=0.005,
                              stop_words=stop_words_1)

components = 20  # number of LSA components
lsa_1 = TruncatedSVD(n_components=components, random_state=5)

# TruncatedSVD works on the sparse TF-IDF matrix directly, so the matrix is not
# converted to a dense array first (that only costs memory).
topics = lsa_1.fit_transform(tf_idf_vc_1.fit_transform(jobs_d.noun))  # reduce dimensionality

print(len(tf_idf_vc_1.vocabulary_))       # number of terms in the vocabulary
print(lsa_1.explained_variance_ratio_)    # explained variance per component

# Top 5 terms per component, one row per component.
# NOTE: get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() after upgrading.
noun_tf_idf_LSA_1 = display_topics.topics_to_df(lsa_1, tf_idf_vc_1.get_feature_names(), 5, 'noun_tf_idf_LSA_1')

pd.set_option('max_colwidth', 200)  # show the full topic strings
noun_tf_idf_LSA_1

  'stop_words.' % sorted(inconsistent))


2689
[0.07273745 0.11360044 0.07922137 0.07281772 0.07521817 0.0727262
 0.06721486 0.06567128 0.04586941 0.03427465 0.02969788 0.02876408
 0.02479424 0.02367738 0.01656202 0.01502156 0.01387867 0.01330501
 0.00869666 0.00440473]


Unnamed: 0,noun_tf_idf_LSA_1
0,"statistics, machine, modeling, engineers, warehouse"
1,"warehouse, database, azure, databricks, pipelines"
2,"game, visualization, sql, visualization taxonomy, experimentation breadth"
3,"clients, transportation, management, recommendations, splunk"
4,"production, splunk, ididata, resolution algorithms, waterfront views"
5,"airlines, information, splunk, visualization, energy"
6,"query, knowledge operation, system problem, questions information, errors engineers"
7,"splunk, concepts splunk, deployment management, telecommunications, communications"
8,"energy, energy energy, forecasting, cloud, simulation"
9,"documentation, training, awc, documentation training, security"


In [14]:
# Save


#### TF-IDF and NMF with Noun as token

In [22]:
# Model name: ***noun_tf_idf_NMF_1***
# Tokenizer: noun
# Vectorizer: TF-IDF
# Dimensionality reduction: NMF
# NOTE(review): this cell fits on the full `jobs` frame, while the noun LSA cell fits on
# `jobs_d` -- confirm which corpus is intended.
# NOTE(review): `tf_idf_vc_1` and `topics` are re-bound here; the noun LSA cell assigns the same names.

# Unigrams + bigrams; terms found in more than 50% or fewer than 0.5% of the documents are ignored.
tf_idf_vc_1 = TfidfVectorizer(ngram_range=(1, 2),
                              max_df=0.5,
                              min_df=0.005,
                              stop_words=stop_words_1)

components = 10  # number of NMF components
nmf_1 = NMF(n_components=components, random_state=5)  # use the variable rather than a second literal

# NMF accepts the sparse TF-IDF matrix directly; no dense conversion is needed.
topics = nmf_1.fit_transform(tf_idf_vc_1.fit_transform(jobs.noun))  # reduce dimensionality

print(len(tf_idf_vc_1.vocabulary_))   # number of terms in the vocabulary

# Top 5 terms per component, one row per component.
# NOTE: get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() after upgrading.
noun_tf_idf_NMF_1 = display_topics.topics_to_df(nmf_1, tf_idf_vc_1.get_feature_names(), 5, 'noun_tf_idf_NMF_1')

pd.set_option('max_colwidth', 200)  # show the full topic strings
noun_tf_idf_NMF_1

  'stop_words.' % sorted(inconsistent))


5777


Unnamed: 0,noun_tf_idf_NMF_1
0,"statistics, scalability, modeling, machine, techniques"
1,"warehouse, database, databricks, azure, pipelines"
2,"engineering, water, traffic, drawings, construction"
3,"player, game, visualization, sql, techniques sql"
4,"cell, patients, medicine, cell therapy, therapy"
5,"clients, transportation, recommendations, member blend, market landscape"
6,"platform, production, ididata, offerings capabilities, hood apps"
7,"model, machine, visualization, airlines, tools"
8,"query, entities, problem concise, information entities, guideline ambiguities"
9,"clients, autocad, engineer, building, project management"


In [29]:
# Put the noun LSA and NMF topic tables side by side (nothing is written to disk here).
# NOTE(review): `columns` is not used by any visible cell -- presumably meant as shorter
# column labels for `noun_topics`; confirm.
# NOTE(review): the noun LSA cell sets 20 components and the NMF cell 10, so the two
# frames may differ in length; rows without a counterpart will hold NaN.
columns = ['LSA', 'NMF']
noun_topics = pd.concat(
    [noun_tf_idf_LSA_1, noun_tf_idf_NMF_1],
    axis='columns',
)

In [30]:
# Show the combined noun topic table
noun_topics

Unnamed: 0,noun_tf_idf_LSA_1,noun_tf_idf_NMF_1
0,"statistics, machine, modeling, techniques, results","statistics, scalability, modeling, machine, techniques"
1,"warehouse, database, azure, databricks, pipelines","warehouse, database, databricks, azure, pipelines"
2,"engineering, transportation, clients, water, traffic","engineering, water, traffic, drawings, construction"
3,"player, game, visualization, sql, sql transform","player, game, visualization, sql, techniques sql"
4,"cell, patients, medicine, cell therapy, therapy","cell, patients, medicine, cell therapy, therapy"
5,"clients, transportation, recommendations, businesses playerposition, clients competitors","clients, transportation, recommendations, member blend, market landscape"
6,"platform, production, splunk, points, businesses","platform, production, ididata, offerings capabilities, hood apps"
7,"information, model, airlines, study, tools","model, machine, visualization, airlines, tools"
8,"query, entities, ambiguities engineers, holidayswe, judge","query, entities, problem concise, information entities, guideline ambiguities"
9,"building, project management, initiative, autocad, matching profit","clients, autocad, engineer, building, project management"


### Lemma as a token

#### TF-IDF and LSA

In [44]:
# Model name: ***lemma_tf_idf_LSA***
# Tokenizer: lemma
# Vectorizer: TF-IDF
# Dimensionality reduction: LSA (TruncatedSVD, ARPACK solver)
# NOTE(review): fitted on the full `jobs` frame, not on the `jobs_d` subset -- confirm this is intended.

# Unigrams + bigrams; terms found in more than 50% or fewer than 0.5% of the documents are ignored.
tf_idf_vc_2 = TfidfVectorizer(ngram_range=(1, 2),
                              max_df=0.5,
                              min_df=0.005,
                              stop_words=stop_words_1)

components = 10  # number of LSA components
lsa_2 = TruncatedSVD(n_components=components, random_state=5, algorithm='arpack')

# TruncatedSVD works on the sparse TF-IDF matrix directly, so the matrix is not
# converted to a dense array first (that only costs memory).
topics = lsa_2.fit_transform(tf_idf_vc_2.fit_transform(jobs.lemma))  # reduce dimensionality

print(len(tf_idf_vc_2.vocabulary_))       # number of terms in the vocabulary
print(lsa_2.explained_variance_ratio_)    # explained variance per component

# Top 5 terms per component, one row per component.
# NOTE: get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() after upgrading.
lemma_tf_idf_LSA = display_topics.topics_to_df(lsa_2, tf_idf_vc_2.get_feature_names(), 5, 'lemma_tf_idf_LSA')

pd.set_option('max_colwidth', 200)  # show the full topic strings
lemma_tf_idf_LSA

  'stop_words.' % sorted(inconsistent))


11026
[0.03420972 0.06213072 0.04395861 0.0493622  0.04790312 0.04327154
 0.04178091 0.0403453  0.03834841 0.03721194]


Unnamed: 0,lemma_tf_idf_LSA
0,"technical, model, broad, big, statistic"
1,"excell, warehouse, big, database, mart report"
2,"excell, warehouse, technical, broad, mart report"
3,"game, big game, player, visualization, big"
4,"patient, clinical, cell, celgene, therapy"
5,"savvy, clear, interpret, drive, transportation"
6,"red, proprietary, analytic, will, distribute"
7,"model, air, alaska, azure, horizon air"
8,"judge, ambiguity, query, guideline, bellevue"
9,"porter, lundeen, porter lundeen, university, sector"


### Word as a token

In [45]:
# Model name: ***word_tf_idf_LSA***
# Tokenizer: word
# Vectorizer: TF-IDF
# Dimensionality reduction: LSA (TruncatedSVD, ARPACK solver)
# NOTE(review): fitted on the full `jobs` frame, not on the `jobs_d` subset -- confirm this is intended.

# Unigrams + bigrams; terms found in more than 50% or fewer than 0.5% of the documents are ignored.
tf_idf_vc_3 = TfidfVectorizer(ngram_range=(1, 2),
                              max_df=0.5,
                              min_df=0.005,
                              stop_words=stop_words_1)

components = 10  # number of LSA components
lsa_3 = TruncatedSVD(n_components=components, random_state=5, algorithm='arpack')

# TruncatedSVD works on the sparse TF-IDF matrix directly, so the matrix is not
# converted to a dense array first (that only costs memory).
topics = lsa_3.fit_transform(tf_idf_vc_3.fit_transform(jobs.word))  # reduce dimensionality

print(len(tf_idf_vc_3.vocabulary_))       # number of terms in the vocabulary
print(lsa_3.explained_variance_ratio_)    # explained variance per component

# Top 5 terms per component, one row per component.
# NOTE: get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() after upgrading.
word_tf_idf_LSA = display_topics.topics_to_df(lsa_3, tf_idf_vc_3.get_feature_names(), 5, 'word_tf_idf_LSA')

pd.set_option('max_colwidth', 200)  # show the full topic strings
word_tf_idf_LSA

  'stop_words.' % sorted(inconsistent))


11592
[0.03429688 0.06279066 0.04170626 0.04886452 0.04723526 0.04296753
 0.04146705 0.04031894 0.03780808 0.03676268]


Unnamed: 0,word_tf_idf_LSA
0,"technical, broad, modeling, big, machine"
1,"excell, big, warehouse, database, mart reports"
2,"excell, warehouse, database, mart reports, mart"
3,"gaming, big, player, play, game"
4,"clinical, cell, celgene, patient, patients"
5,"savvy, clients, clear, interpret, collect interpret"
6,"red, proprietary, distributed, ll, highly"
7,"alaska airlines, airlines, air, alaska, azure"
8,"judge, analyze, good, bellevue, query"
9,"lundeen, porter, porter lundeen, university, paid"


### Noun-lemma as a token

In [74]:
# Model name: ***noun_lemma_tf_idf_LSA***
# Tokenizer: noun_lemma
# Vectorizer: TF-IDF
# Dimensionality reduction: LSA (TruncatedSVD, ARPACK solver)
# NOTE(review): fitted on the full `jobs` frame, not on the `jobs_d` subset -- confirm this is intended.

# Unigrams + bigrams; terms found in more than 80% or fewer than 0.5% of the documents are ignored.
tf_idf_vc_4 = TfidfVectorizer(ngram_range=(1, 2),
                              max_df=0.8,
                              min_df=0.005,
                              stop_words=stop_words_1)

components = 15  # number of LSA components
lsa_4 = TruncatedSVD(n_components=components, random_state=5, algorithm='arpack')

# TruncatedSVD works on the sparse TF-IDF matrix directly, so the matrix is not
# converted to a dense array first (that only costs memory).
topics = lsa_4.fit_transform(tf_idf_vc_4.fit_transform(jobs.noun_lemma))  # reduce dimensionality

print(len(tf_idf_vc_4.vocabulary_))       # number of terms in the vocabulary
print(lsa_4.explained_variance_ratio_)    # explained variance per component

# Top 5 terms per component, one row per component.
# NOTE: get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() after upgrading.
noun_lemma_tf_idf_LSA = display_topics.topics_to_df(lsa_4, tf_idf_vc_4.get_feature_names(), 5, 'noun_lemma_tf_idf_LSA')

pd.set_option('max_colwidth', 200)  # show the full topic strings
noun_lemma_tf_idf_LSA

  'stop_words.' % sorted(inconsistent))


5458
[0.02979828 0.06304829 0.0473965  0.0499364  0.04744858 0.04241423
 0.04185172 0.03978371 0.03832084 0.03676663 0.03633249 0.03562935
 0.02994336 0.02949124 0.02751397]


Unnamed: 0,noun_lemma_tf_idf_LSA
0,"model, project, solution, statistic, product"
1,"warehouse, database, process warehouse, azure, databrick"
2,"warehouse, process warehouse, databrick, pipeline, azure"
3,"game, player, visualization, game opportunity, sql"
4,"cell, patient, therapy, medicine, cell therapy"
5,"transportation, trend, recommendation, audience, method"
6,"analytic, problem platform, platform, production, ididata"
7,"query, ambiguity, guideline, information, guideline ambiguity"
8,"model, airline, visualization, machine, process"
9,"project management, model, autocad, execution, member excellence"


### Stop words

In [9]:
# Hand-curated stop-word list (presumably for the `noun` token column -- TODO confirm).
# NOTE(review): no visible cell passes this list to a vectorizer; the models in this
# notebook use stop_words_1 imported from src.models.stop_words_list_1.
# NOTE(review): entries containing a space ('data scientist', 'job type', 'bike parking', ...)
# presumably never match if the list is given to a scikit-learn vectorizer as stop_words=,
# because stop words are compared against single tokens before n-grams are built -- confirm.
# NOTE(review): 'detail' and 'account' are listed twice; 'preduction' looks like a typo --
# verify against the corpus before changing it.
stop_words_n = [
    'yearexperience', 'year', 'years', 'porch', 'work', 'home', 'term', 'datum', 'hand', 'science', 'other', 
    're', 'sex', 'gender', 'data scientist', 'data scientists', 'scientists','scientist', 'com', 'age', 
    'analyst', 'analysts', 'race','hourexperience', 'job type', 'end', 'employee', 'employees','employers', 
    'employer', 'job', 'career', 'fish', 'violet','opportunities', 'business', 'disability', 'company', 
    'companies', 'stakeholders', 'talent', 'skill', 'skills', 'team', 'teams', 'experience', 'expert', 'hands', 
    'games', 'players', 'creativity','models', 'roles', 'application', 'specifications', 'change','issues', 
    'search', 'status', 'impact', 'changes', 'location', 'detail', 'insights', 'document', 'client', 'sets', 
    'set', 'detail', 'analytics', 'fusion','applications', 'candidates','standards', 'manager', 'assets', 
    'health', 'accommodation', 'services', 'service', 'transwest', 'program', 'commute', 'utilization',
    'technology', 'solutions', 'part', 'success', 'findings', 'notes', 'satisfaction', 'case', 'group', 'redviolet', 'trends', 'optimization',
    'others', 'account', 'strategy', 'bike parking', 'account', 'performance','preduction', 'relationship', 
    'waterfront views', 'manage', 'quality', 'type time', 'problems', 'projects', 'people', 'collaboration', 
    'strategies', 'programs', 'partner', 'core', 'operations', 'rsu', 'plans', 'methods', 'variety', 'revenue', 
    'data', 'technologies', 'visualizations', 'type', 'life', 'safety', 'analysis', 'time', 'design', 'research', 
    'relevance', 'passion', 'customer', 'learning', 'environment', 'tasks','family', 'benefits', 'development', 
    'complete', 'industry', 'user', 'organizations', 'customers', 'documentations', 'members', 'competencies',
    'homeowners', 'networking', 'apply', 'hire', 'homeowner', 'jobs', 'firm', 'position', 'entities',
    'tools', 'initiative', 'servicekey', 'intents', 'merit','bonuses', 'paid', 'asset', 'candidate', 
    'building', 'sector', 'employment', 'stakeholder', 'note', 'entity', 'finding', 'staff', 'drawing',
    'award', 'tool', 'latitude', 'view', 'button', 'matter', 'method', 'professionals', 'techniques',
    'states', 'house', 'holidayswe', 'handyman', 'requirementsperform', 'opportunity'
]

In [51]:
# Second hand-curated stop-word list, mostly singular forms (presumably for the
# lemmatised token columns -- TODO confirm).
# NOTE(review): no visible cell passes this list to a vectorizer.
# NOTE(review): entries containing a space ('data scientist', 'job type', 'waterfront view', ...)
# presumably never match if the list is given to a scikit-learn vectorizer as stop_words=,
# because stop words are compared against single tokens before n-grams are built -- confirm.
# NOTE(review): 'detail' and 'account' are listed twice; 'preduction' looks like a typo --
# verify against the corpus before changing it.
stop_words_2 = [
    'yearexperience', 'year', 'porch', 'work', 'home', 'term', 'datum', 'hand', 'science', 'other', 
    're', 'sex', 'gender', 'data scientist', 'scientist', 'com', 'age', 
    'analyst', 'race','hourexperience', 'job type', 'end', 'employee',
    'employer', 'job', 'career', 'fish', 'violet','opportunity', 'business', 'disability', 'company', 
    'stakeholders', 'talent', 'skill', 'team', 'experience', 'expert', 
    'player', 'creativity', 'role', 'application', 'specification', 'change','issue', 
    'search', 'status', 'impact', 'changes', 'location', 'detail', 'insight', 'document', 'client',
    'set', 'detail', 'analytics', 'fusion','applications', 'candidates','standards', 'manager', 'asset', 
    'health', 'accommodation', 'service', 'transwest', 'program', 'commute', 'utilization',
    'technology', 'solution', 'part', 'success', 'finding', 'satisfaction', 'case', 'group', 'redviolet', 'trends', 'optimization',
    'account', 'strategy', 'bike parking', 'account', 'performance','preduction', 'relationship', 
    'waterfront view', 'manage', 'quality', 'type time', 'people', 'collaboration', 
    'partner', 'core', 'variety', 'revenue', 
    'data', 'type', 'life', 'safety', 'analysis', 'time', 'design', 'research', 
    'relevance', 'passion', 'customer', 'learning', 'environment', 'task','family', 'benefit', 'development', 
    'complete', 'industry', 'day', 'sector'
]

In [19]:
# Empty stop-word list; not referenced by any visible cell (presumably a placeholder -- TODO confirm).
stop_words_3 = []

## Functions

In [17]:
%%writefile '/Users/greenapple/project4/src/models/display_topics.py'

import pandas as pd

def topics_to_df(model, feature_names, no_top_words, model_name):
    '''
    Summarise every component of a fitted decomposition model as a string of its top terms.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``; each entry is one component's
        array of per-feature weights.
    feature_names : sequence
        Term labels, indexable by the integer feature positions used in ``components_``.
    no_top_words : int
        Number of highest-weighted terms to keep per component.
    model_name : str
        Label of the single column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per component; the ``model_name`` column holds the component's
        top terms joined by ", ", highest weight first.
    '''
    topic_strings = []

    for weights in model.components_:
        # argsort is ascending, so the reversed tail slice yields the largest
        # (signed) weights first.
        top_positions = weights.argsort()[:-no_top_words - 1:-1]
        topic_strings.append(", ".join(feature_names[pos] for pos in top_positions))

    return pd.DataFrame({model_name: pd.Series(topic_strings)})

Overwriting /Users/greenapple/project4/src/models/display_topics.py
