In [2]:
import numpy as np
import pandas as pd
import re
import nltk 
import spacy

In [3]:
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

In [4]:
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
#1# read data
df = pd.read_csv('home_marketing_job.csv')

In [6]:
df.head()

Unnamed: 0,Uniq Id,Crawl Timestamp,Job Title,Job Salary,Job Experience Required,Key Skills,Role Category,Location,Functional Area,Industry,Role
0,9be62c49a0b7ebe982a4af1edaa7bc5f,2019-07-05 01:46:07 +0000,Digital Media Planner,Not Disclosed by Recruiter,5 - 10 yrs,Media Planning| Digital Media,Advertising,Mumbai,"Marketing , Advertising , MR , PR , Media Plan...","Advertising, PR, MR, Event Management",Media Planning Executive/Manager
1,3c52d436e39f596b22519da2612f6a56,2019-07-06 08:04:50 +0000,Online Bidding Executive,Not Disclosed by Recruiter,2 - 5 yrs,pre sales| closing| software knowledge| clien...,Retail Sales,"Pune,Pune","Sales , Retail , Business Development","IT-Software, Software Services",Sales Executive/Officer
2,ffad8a2396c60be2bf6d0e2ff47c58d4,2019-08-05 15:50:44 +0000,Trainee Research/ Research Executive- Hi- Tec...,Not Disclosed by Recruiter,0 - 1 yrs,Computer science| Fabrication| Quality check|...,R&D,Gurgaon,"Engineering Design , R&D","Recruitment, Staffing",R&D Executive
3,7b921f51b5c2fb862b4a5f7a54c37f75,2019-08-05 15:31:56 +0000,Technical Support,"2,00,000 - 4,00,000 PA.",0 - 5 yrs,Technical Support,Admin/Maintenance/Security/Datawarehousing,Mumbai,"IT Software - Application Programming , Mainte...","IT-Software, Software Services",Technical Support Engineer
4,2d8b7d44e138a54d5dc841163138de50,2019-07-05 02:48:29 +0000,Software Test Engineer -hyderabad,Not Disclosed by Recruiter,2 - 5 yrs,manual testing| test engineering| test cases|...,Programming & Design,Hyderabad,IT Software - QA & Testing,"IT-Software, Software Services",Testing Engineer


In [7]:
#2# explore tokens

s = df[0:100]
s =s.to_string()


nlp = spacy.load('en_core_web_sm')

# Create a Doc object
doc = nlp(s)

## Tokenize and Clean-up
job4 = re.findall(r"[^!.? ]+",s)
print(job4)

['Uniq', 'Id', 'Crawl', 'Timestamp', 'Job', 'Title', 'Job', 'Salary', 'Job', 'Experience', 'Required', 'Key', 'Skills', 'Role', 'Category', 'Location', 'Functional', 'Area', 'Industry', 'Role\n0', '9be62c49a0b7ebe982a4af1edaa7bc5f', '2019-07-05', '01:46:07', '+0000', 'Digital', 'Media', 'Planner', 'Not', 'Disclosed', 'by', 'Recruiter', '5', '-', '10', 'yrs', 'Media', 'Planning|', 'Digital', 'Media', 'Advertising', 'Mumbai', 'Marketing', ',', 'Advertising', ',', 'MR', ',', 'PR', ',', 'Media', 'Planning', 'Advertising,', 'PR,', 'MR,', 'Event', 'Management', 'Media', 'Planning', 'Executive/Manager\n1', '3c52d436e39f596b22519da2612f6a56', '2019-07-06', '08:04:50', '+0000', 'Online', 'Bidding', 'Executive', 'Not', 'Disclosed', 'by', 'Recruiter', '2', '-', '5', 'yrs', 'pre', 'sales|', 'closing|', 'software', 'knowledge|', 'clients|', 'requirements|', 'negotiating|', 'client|', 'online', 'bidding|', 'good', 'communication|', 'technology', 'Retail', 'Sales', 'Pune,Pune', 'Sales', ',', 'Retail'

In [8]:
print(doc)

                             Uniq Id            Crawl Timestamp                                                                 Job Title                       Job Salary Job Experience Required                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Key Skills                               Role Category                                                                                Location                                           Functional Ar

In [9]:
#3# Stemming

# Import the toolkit and the full Porter Stemmer library
import nltk
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer

# The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')
#Instead, we'll use another popular NLP tool called nltk, which stands for Natural Language Toolkit. 
for word in re.findall(r"[^!.? ]+",s):
    print(word+' --> '+s_stemmer.stem(word))

Uniq --> uniq
Id --> id
Crawl --> crawl
Timestamp --> timestamp
Job --> job
Title --> titl
Job --> job
Salary --> salari
Job --> job
Experience --> experi
Required --> requir
Key --> key
Skills --> skill
Role --> role
Category --> categori
Location --> locat
Functional --> function
Area --> area
Industry --> industri
Role
0 --> role
0
9be62c49a0b7ebe982a4af1edaa7bc5f --> 9be62c49a0b7ebe982a4af1edaa7bc5f
2019-07-05 --> 2019-07-05
01:46:07 --> 01:46:07
+0000 --> +0000
Digital --> digit
Media --> media
Planner --> planner
Not --> not
Disclosed --> disclos
by --> by
Recruiter --> recruit
5 --> 5
- --> -
10 --> 10
yrs --> yrs
Media --> media
Planning| --> planning|
Digital --> digit
Media --> media
Advertising --> advertis
Mumbai --> mumbai
Marketing --> market
, --> ,
Advertising --> advertis
, --> ,
MR --> mr
, --> ,
PR --> pr
, --> ,
Media --> media
Planning --> plan
Advertising, --> advertising,
PR, --> pr,
MR, --> mr,
Event --> event
Management --> manag
Media --> media
Planning --> pl

2019-08-05 --> 2019-08-05
22:21:30 --> 22:21:30
+0000 --> +0000
Assistant --> assist
Professor --> professor
(Grade --> (grade
I)-Theoretical --> i)-theoret
Computer --> comput
Science --> scienc
Not --> not
Disclosed --> disclos
by --> by
Recruiter --> recruit
3 --> 3
- --> -
5 --> 5
yrs --> yrs
NaN --> nan
NaN --> nan
Delhi --> delhi
Teaching --> teach
, --> ,
Education --> educ
, --> ,
Training --> train
, --> ,
Counselling --> counsel
Education, --> education,
Teaching, --> teaching,
Training --> train
NaN
32 --> nan
32
986c0566fcae55da5238c1c449083307 --> 986c0566fcae55da5238c1c449083307
2019-08-05 --> 2019-08-05
06:28:20 --> 06:28:20
+0000 --> +0000
US --> us
Based --> base
Financial --> financi
Company --> compani
Looking --> look
For --> for
Java --> java
API --> api
Engineers --> engin
For --> for
HYD --> hyd
Not --> not
Disclosed --> disclos
by --> by
Recruiter --> recruit
6 --> 6
- --> -
8 --> 8
yrs --> yrs
Java| --> java|
java --> java
api| --> api|
Banking| --> banking|
j2

, --> ,
Customer --> custom
Service --> servic
, --> ,
Operations --> oper
BPO, --> bpo,
Call --> call
Centre, --> centre,
ITeS --> ite
Associate/Senior --> associate/senior
Associate --> associ
-(NonTechnical)
68 --> -(nontechnical)
68
8a651ac4718eb479f1bd99f50cb0da03 --> 8a651ac4718eb479f1bd99f50cb0da03
2019-08-05 --> 2019-08-05
21:47:54 --> 21:47:54
+0000 --> +0000
Sales --> sale
Manager --> manag
- --> -
Education --> educ
Sales --> sale
Not --> not
Disclosed --> disclos
by --> by
Recruiter --> recruit
5 --> 5
- --> -
7 --> 7
yrs --> yrs
Market --> market
analysis| --> analysis|
Public --> public
relations| --> relations|
CV| --> cv|
Relationship --> relationship
building| --> building|
Market --> market
intelligence| --> intelligence|
Prospecting| --> prospecting|
Budgeting| --> budgeting|
Sales --> sale
process| --> process|
Branding --> brand
Retail --> retail
Sales --> sale
Gurgaon --> gurgaon
Sales --> sale
, --> ,
Retail --> retail
, --> ,
Business --> busi
Development --> de

Training --> train
Manager
91 --> manager
91
b19152b9e48560280692ea1b1916abaf --> b19152b9e48560280692ea1b1916abaf
2019-07-06 --> 2019-07-06
00:10:43 --> 00:10:43
+0000 --> +0000
DevOps --> devop
Engineer --> engin
Intern --> intern
Not --> not
Disclosed --> disclos
by --> by
Recruiter --> recruit
0 --> 0
- --> -
1 --> 1
yrs --> yrs
Intern| --> intern|
Linux| --> linux|
Windows| --> windows|
Unix| --> unix|
DNS| --> dns|
C++| --> c++|
Perl| --> perl|
IIS| --> iis|
Redhat| --> redhat|
HTTP --> http
Programming --> program
& --> &
Design --> design
Hyderabad --> hyderabad
IT --> it
Software --> softwar
- --> -
Application --> applic
Programming --> program
, --> ,
Maintenance --> mainten
IT-Software, --> it-software,
Software --> softwar
Services --> servic
Software --> softwar
Developer
92 --> developer
92
f725bad4069552f4780a50b84c29bf5e --> f725bad4069552f4780a50b84c29bf5
2019-08-04 --> 2019-08-04
05:04:36 --> 05:04:36
+0000 --> +0000
PHP --> php
Developer --> develop
(wordpress --> (

In [10]:
#4# lemmatization

def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')
        
show_lemmas(doc)

                              SPACE  141172199774263734                                  
Uniq         PROPN  113935123914199583     Uniq
I            PRON   561228191312463089     -PRON-
d            VERB   6992604926141104606    would
             SPACE  14267765035303216250              
Crawl        PROPN  17475866996521103841   Crawl
Timestamp    PROPN  3548733037921963186    Timestamp
                                                                 SPACE  1056688691105161902                                                                    
Job          PROPN  16842181437347778509   Job
Title        PROPN  10170869889483782475   Title
                       SPACE  15641042832978725572                         
Job          PROPN  16842181437347778509   Job
Salary       PROPN  13774318443957935319   Salary
Job          PROPN  16842181437347778509   Job
Experience   PROPN  13723185953024897186   Experience
Required     VERB   9094995078650759376    require
                         

by           ADP    16764210730586636600   by
Recruiter    PROPN  3978178878715706437    Recruiter
                SPACE  7953425738349384079                   
2            NUM    15180167692696242062   2
-            SYM    9153284864653046197    -
7            NUM    2462676316711722248    7
yrs          PROPN  16517339820909808447   yrs
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         SPACE  15994185036859790891                                                             

Ageny        PROPN  9970829402731332525    Ageny
             SPACE  6198647145348743278            
3,50,000     NUM    2019892225472379021    3,50,000
-            SYM    9153284864653046197    -
5,00,000     PROPN  8119051773336103423    5,00,000
PA           PROPN  17082056823169685079   PA
.            PUNCT  12646065887601541794   .
                SPACE  7953425738349384079                   
2            NUM    15180167692696242062   2
-            SYM    9153284864653046197    -
4            NUM    16743743820210141046   4
yrs          PROPN  16517339820909808447   yrs
                                                                                                                                                                                                                                                                                                                                                                                                                               

06           NUM    15480929925469224738   06
19:30:24     NUM    3134691333428595095    19:30:24
+0000        PROPN  11652292094536557755   +0000
                                                    SPACE  13376128391297292775                                                      
Tooling      PROPN  14011291064984823509   Tooling
&            CCONJ  15473034735919704609   &
amp          NOUN   636171854864380074     amp
;            PUNCT  631425121691394544     ;
Sampling     PROPN  1071801901468596395    Sampling
             SPACE  17579141535385064505        
Not          PART   447765159362469301     not
Disclosed    VERB   10594653596137719792   disclose
by           ADP    16764210730586636600   by
Recruiter    PROPN  3978178878715706437    Recruiter
                SPACE  7953425738349384079                   
2            NUM    15180167692696242062   2
-            SYM    9153284864653046197    -
7            NUM    2462676316711722248    7
yrs          PROPN  165173398209098

Software     NOUN   8212201967714533330    software
-            PUNCT  9153284864653046197    -
Application  PROPN  10647088555044889902   Application
Programming  NOUN   17860067660221736314   programming
,            PUNCT  2593208677638477497    ,
Maintenance  PROPN  14624967345569303655   Maintenance
                           SPACE  7978323633892547218                              
IT           PROPN  15566906646452856019   IT
-            PUNCT  9153284864653046197    -
Software     PROPN  5476674789249212452    Software
,            PUNCT  2593208677638477497    ,
Software     PROPN  5476674789249212452    Software
Services     PROPN  16271719928879294026   Services
                     SPACE  7673489056978492252                        
Project      PROPN  5641902175427286826    Project
Manager      PROPN  2468878231143206133    Manager
-            PUNCT  9153284864653046197    -
IT           PROPN  15566906646452856019   IT
/            SYM    2466615745961202537    /
Softwar

&            CCONJ  15473034735919704609   &
Design       PROPN  15658042274746311269   Design
                                                                                   SPACE  2788654531869237747                                                                                      
Noida        PROPN  2909370794944056140    Noida
             SPACE  5261714186835361783          
IT           PROPN  15566906646452856019   IT
Software     PROPN  5476674789249212452    Software
-            PUNCT  9153284864653046197    -
Application  PROPN  10647088555044889902   Application
Programming  NOUN   17860067660221736314   programming
,            PUNCT  2593208677638477497    ,
Maintenance  PROPN  14624967345569303655   Maintenance
                           SPACE  7978323633892547218                              
IT           PROPN  15566906646452856019   IT
-            PUNCT  9153284864653046197    -
Software     PROPN  5476674789249212452    Software
,            PUNCT  2593208677

07           NUM    8608566706881265128    07
-            PUNCT  9153284864653046197    -
06           NUM    15480929925469224738   06
17:39:24     NUM    10422815480868370444   17:39:24
+0000        PROPN  11652292094536557755   +0000
                                                              SPACE  10111964071700426315                                                                
nutanix      PROPN  9025292247852043123    nutanix
sme          NOUN   7849908647976580842    sme
             SPACE  5261714186835361783          
Not          PART   447765159362469301     not
Disclosed    VERB   10594653596137719792   disclose
by           ADP    16764210730586636600   by
Recruiter    PROPN  3978178878715706437    Recruiter
              SPACE  16915180186461182737                
15           NUM    13771760024209633521   15
-            SYM    9153284864653046197    -
20           NUM    2767521681098075859    20
yrs          PROPN  16517339820909808447   yrs
                    

Relationship PROPN  13793976295823118227   Relationship
management|  X      8283676106163268552    management|
Relationship PROPN  13793976295823118227   Relationship
building|    PUNCT  15024992079802018305   building|
Client       PROPN  224502045523635432     Client
relationship| NOUN   6020071441476548453    relationship|
Training|    PROPN  5343194628557300996    Training|
Analytical   PROPN  504203105799249677     Analytical
skills|      PUNCT  16057332744796863856   skills|
Process      NOUN   1020421249059553464    process
mapping|     PUNCT  6961290292970949908    mapping|
Standard     ADJ    4892528069537811086    standard
operating    VERB   9854540405589975553    operate
procedures|  X      6838092719103140740    procedures|
Agile|       NUM    9058917983754494303    agile|
Management|  PROPN  6254100332558245957    Management|
Visa         PROPN  11856115888332719075   Visa
processing   NOUN   10935198773122488114   processing
                                       SPACE  

                SPACE  7953425738349384079                   
1            NUM    5533571732986600803    1
-            SYM    9153284864653046197    -
4            NUM    16743743820210141046   4
yrs          PROPN  16517339820909808447   yrs
                                                                                                                                                                                                                                                                                                                                                                                                                                   SPACE  18221098391602763135                                                                                                                                                                                                                                                                                                                      

LPO          PROPN  11100355490290156305   LPO
,            PUNCT  2593208677638477497    ,
Customer     PROPN  10853472433698617114   Customer
Service      PROPN  1069327236522697900    Service
,            PUNCT  2593208677638477497    ,
Operations   PROPN  18219273888067973519   Operations
                                   SPACE  852416194066011836                                       
BPO          PROPN  5770954012531429021    BPO
,            PUNCT  2593208677638477497    ,
Call         PROPN  11104957948539619373   Call
Centre       PROPN  12488127276277555940   Centre
,            PUNCT  2593208677638477497    ,
ITeS         PROPN  13117855081430664367   ITeS
                     SPACE  7673489056978492252                        
Team         PROPN  6768431954567747701    Team
Leader       PROPN  13053554208341782404   Leader
-(NonTechnical PROPN  7793912885786141654    -(NonTechnical
)            PUNCT  3842344029291005339    )

            SPACE  962983613142996970     

53 

5            NUM    2090661578966068036    5
yrs          PROPN  16517339820909808447   yrs
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     SPACE  10961078142318150757                                                                                                                                                                                                                                                                                                                                                            

NaN          PROPN  3182084331818062233    NaN
             SPACE  8532415787641010193     
Manager|Transformation|Manager|Transformation|Manager|Manager|Manager|Manager|Transformation|Manager|Transformation|Manager|Manager|Transformation|Manager|Manager|Transformation|Transformation|Manager|Manager|Manager|Manager|Manager|Manager|Manager|Manager|Manager|Manager|Transformation|Manager|Manager|Manager|Manager|Manager|Manager|Transformation|Manager|Manager|Manager|Manager|Manager|Manager|Manager|Transformation|Manager|Transformation|Manager|Manager|Manager|Transformation|Manager|Transformation|Manager|Manager|Transformation|Manager|Manager|Manager|Manager|Manager|Manager|Manager|Manager|Transformation PROPN  7308814468445498064    Manager|Transformation|Manager|Transformation|Manager|Manager|Manager|Manager|Transformation|Manager|Transformation|Manager|Manager|Transformation|Manager|Manager|Transformation|Transformation|Manager|Manager|Manager|Manager|Manager|Manager|Manager|Manager|Mana

surgery|     PUNCT  7020495506229554396    surgery|
ed           PROPN  11179282889058780238   ed
                                       SPACE  15534150511174546965                                         
Voice        PROPN  13349562772341574791   Voice
                                                            SPACE  7113397204720723894                                                               
Chennai      PROPN  14227523447876151891   Chennai
,            PUNCT  2593208677638477497    ,
Hyderabad    PROPN  17631673017946934592   Hyderabad
,            PUNCT  2593208677638477497    ,
Trivandrum   PROPN  9307514498216942558    Trivandrum
             SPACE  6718839663412986256       
ITES         NOUN   15496362870397060443   ite
,            PUNCT  2593208677638477497    ,
BPO          PROPN  5770954012531429021    BPO
,            PUNCT  2593208677638477497    ,
KPO          PROPN  9061222202736366592    KPO
,            PUNCT  2593208677638477497    ,
LPO          PROPN  1110

Ahmedabad    PROPN  18369147185694290034   Ahmedabad
,            PUNCT  2593208677638477497    ,
Gandhinagar  PROPN  15216714595426338901   Gandhinagar
,            PUNCT  2593208677638477497    ,
Vadodara     PROPN  8321805970711895386    Vadodara
             SPACE  6718839663412986256       
ITES         PROPN  7704317096207962771    ITES
,            PUNCT  2593208677638477497    ,
BPO          PROPN  5770954012531429021    BPO
,            PUNCT  2593208677638477497    ,
KPO          PROPN  9061222202736366592    KPO
,            PUNCT  2593208677638477497    ,
LPO          PROPN  11100355490290156305   LPO
,            PUNCT  2593208677638477497    ,
Customer     PROPN  10853472433698617114   Customer
Service      PROPN  1069327236522697900    Service
,            PUNCT  2593208677638477497    ,
Operations   PROPN  18219273888067973519   Operations
                                   SPACE  852416194066011836                                       
BPO          PROPN  577095401253

Internet     PROPN  8006197966488148988    Internet
Technologies PROPN  9387358241513496499    Technologies
                           SPACE  7978323633892547218                              
IT           PROPN  15566906646452856019   IT
-            PUNCT  9153284864653046197    -
Software     PROPN  5476674789249212452    Software
,            PUNCT  2593208677638477497    ,
Software     PROPN  5476674789249212452    Software
Services     PROPN  16271719928879294026   Services
                                SPACE  17081475395910707606                                  
Technical    PROPN  17150435118023439842   Technical
Writer       PROPN  9869980309737935526    Writer

            SPACE  962983613142996970     

83           NUM    9343442437703156343    83
             SPACE  8532415787641010193     
942457ffdd881314cfc0e4b8f8c37e54 NUM    17201076646066289311   942457ffdd881314cfc0e4b8f8c37e54
             SPACE  8532415787641010193     
2019         NUM    1316061072807483091   

Generation|  PUNCT  1974362560598462591    Generation|
Accounting|  PUNCT  14915748471802212151   Accounting|
Banking|     PUNCT  16149860048224975592   Banking|
Revenue      NOUN   7201113671556944259    revenue
Generation|  PUNCT  1974362560598462591    Generation|
Current      PROPN  6237770954595417835    Current
Account      PROPN  3553856693880833784    Account
                                SPACE  17081475395910707606                                  
Retail       PROPN  11179050979078045360   Retail
Sales        PROPN  13265478253048405467   Sales
                                                                               SPACE  7140874055527140739                                                                                  
Delhi        PROPN  7055494911946032454    Delhi
NCR          PROPN  1333732137594039432    NCR
                     SPACE  7673489056978492252                        
Sales        PROPN  13265478253048405467   Sales
,            PUNCT  259320867763

In [11]:
#5# MAKE NER 
for chunk in doc.noun_chunks:
    print(chunk.text+' - '+chunk.root.text+' - '+chunk.root.dep_+' - '+chunk.root.head.text)

                             Uniq I - I - ROOT - I
Crawl Timestamp - Timestamp - conj - d
Job Title - Title - ROOT - Title
Job Salary Job Experience - Experience - nsubj - Required
Key Skills - Skills - dobj - Required
Role Category - Category - conj - Skills
Location - Location - conj - Skills
Functional Area - Area - appos - Skills
Industry - Industry - appos - Skills
Role - Role - conj - Skills
2019-07-05 01:46:07 +0000 - +0000 - dative - 9be62c49a0b7ebe982a4af1edaa7bc5f
Digital Media Planner - Planner - appos - +0000
Recruiter - Recruiter - pobj - by
Digital Media - Media - ROOT - Media
Mumbai        Marketing - Marketing - ROOT - Marketing
Media Planning - Planning - appos - Marketing
Advertising - Advertising - conj - Planning
PR - PR - conj - Advertising
MR - MR - conj - Marketing
Event Management - Management - conj - MR
Media Planning Executive/Manager - Manager - ROOT - Manager
08:04:50 +0000 - +0000 - ROOT - +0000
Online Bidding Executive - Executive - ROOT - Executive
Recru

Digital Marketing - Marketing - dobj - Online
Gurgaon - Gurgaon - conj - Marketing
Media Planning - Planning - appos - Marketing
Advertising - Advertising - conj - Planning
PR - PR - conj - Advertising
MR - MR - conj - Marketing
Event Management - Management - conj - MR
Social Media Marketing Manager - Manager - appos - Marketing
2019-08-05 04:32:34 +0000 - +0000 - nsubj - Wanted
Engineering Fresher - Fresher - dobj - Wanted
MNC Chennai - Chennai - pobj - For
- 1 yrs                                                                                                                                                                                                                                                                                                                                                                                                                                                                             instrumentation - instrumentation - ROOT - instrumentation
Other - O

In [12]:
#6# Visualizing the entity recognizer
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)

In [13]:
len([ent for ent in doc.ents ]) 

671

In [14]:
#####################################

# start work with TM:
    # second methodolgy by using NMF

In [15]:
# Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [17]:
dtm = tfidf.fit_transform(job4)

In [18]:
dtm

<4355x420 sparse matrix of type '<class 'numpy.float64'>'
	with 3544 stored elements in Compressed Sparse Row format>

In [19]:
# NMF Non-Negative Matric Factorization
from sklearn.decomposition import NMF

In [20]:
nmf_model = NMF(n_components=10,random_state=42)

In [21]:
# This can take awhile, we're dealing with a large amount of documents!
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=10, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [22]:
# Displaying Topics
len(tfidf.get_feature_names())

420

In [23]:
import random

In [24]:
for i in range(10):
    random_word_id = random.randint(0,420)
    print(tfidf.get_feature_names()[random_word_id])

office
uk
research
area
surgery
location
content
teaching
recruitment
audit


In [25]:
nmf_model.components_

array([[2.21528023e-08, 2.66534598e-07, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.13106549e-13, 6.27205004e-13, 3.16227766e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.48880979e-12, 1.62126894e-12, 0.00000000e+00, ...,
        0.00000000e+00, 3.12198564e+00, 0.00000000e+00],
       ...,
       [1.27092822e-08, 8.84713815e-09, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.78787777e-08, 6.65157673e-08, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.44466964e-08, 5.13571450e-08, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [26]:
single_topic = nmf_model.components_[0]

# Returns the indices that would sort this array.
single_topic.argsort()

array([209, 282, 281, 280, 279, 278, 277, 276, 275, 274, 272, 271, 270,
       269, 267, 266, 265, 264, 263, 262, 261, 260, 283, 284, 285, 286,
       310, 309, 308, 307, 306, 305, 304, 303, 302, 299, 259, 298, 296,
       295, 294, 293, 292, 291, 290, 289, 288, 287, 297, 258, 257, 256,
       229, 228, 227, 226, 225, 224, 223, 222, 221, 220, 230, 219, 217,
       216, 215, 214, 213, 212, 211, 210, 418, 208, 218, 311, 231, 233,
       255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 232, 245, 243,
       242, 241, 240, 239, 238, 237, 236, 235, 234, 244, 207, 312, 314,
       390, 389, 388, 387, 386, 385, 384, 383, 382, 381, 380, 379, 378,
       377, 376, 375, 374, 372, 371, 370, 369, 392, 393, 394, 395, 417,
       416, 415, 414, 413, 412, 411, 410, 409, 408, 368, 407, 405, 404,
       403, 402, 401, 400, 399, 398, 397, 396, 406, 367, 366, 365, 337,
       336, 335, 334, 333, 332, 331, 330, 329, 328, 338, 327, 325, 324,
       323, 322, 321, 320, 319, 318, 317, 316, 326, 313, 339, 34

In [47]:
# Top 10 words for this topic:
single_topic.argsort()[-10:]

array([ 13,  25,  97,  60,  20,  17, 187,  34, 268, 373], dtype=int64)

In [48]:
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['17', '02', '03', '50', '55', '28', '45', '23', '39', '43', '11', '22', 'associate', '56', '18', '15', 'executive', '31', 'manager', 'software']


THE TOP 15 WORDS FOR TOPIC #1
['31', 'business', '56', '28', '23', '45', '39', '43', '18', '11', '22', '15', 'bpo', 'associate', 'developer', 'maintenance', 'executive', 'application', 'manager', '0000']


THE TOP 15 WORDS FOR TOPIC #2
['customer', '31', 'bpo', '56', '28', '23', '45', '39', '43', '18', '11', '22', '15', 'associate', 'developer', 'maintenance', 'executive', 'application', 'manager', 'yrs']


THE TOP 15 WORDS FOR TOPIC #3
['07', 'consulting', '20', '19', '15', '02', '10', '16', '01', '43', 'planning', '52', 'technologies', 'web', 'strategy', 'aws', 'development', '46', 'business', 'sales']


THE TOP 15 WORDS FOR TOPIC #4
['47', '35', '52', '29', '26', '33', '48', '75', 'lead', '19', '25', '20', '50', '00', 'technologies', '000', 'web', 'developer', 'business', 'disclosed']


THE TOP 15 WORDS FOR 