In [None]:
# Importing the necessary modules.

import pandas as pd
import tensorflow as tf
tf.__version__
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dropout
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Importing the labeled dataset.
Labelled_data=pd.read_csv('data_to_label.csv')

Labelled_data.head()
Labelled_data=Labelled_data.dropna()

# Get the Independent Features
X=Labelled_data.drop('Skill',axis=1)

# Get the Dependent features
y=Labelled_data['Skill']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Setting the Vocabulary size
voc_size=100000
PotentialWords=X.copy()
PotentialWords.reset_index(inplace=True)

In [None]:
# Building the corpus after removing stop words in each sentence in the job description.

ps = PorterStemmer()
corpus = []
for i in range(len(PotentialWords)):
    PotentialWord = re.sub('[^a-zA-Z]', ' ', PotentialWords['Word'][i])
    PotentialWord = PotentialWord.lower()
    PotentialWord = PotentialWord.split()
    
    PotentialWord = [ps.stem(word) for word in PotentialWord if not word in stopwords.words('english')]
    PotentialWord = ' '.join(PotentialWord)
    corpus.append(PotentialWord)

print(len(corpus))
print(corpus)

1674
['look', 'selenium', 'engin', '', 'must', 'solid', 'java', 'code', 'skill', 'sever', 'open', 'month', 'hire', '', 'must', 'abl', 'go', 'perm', 'someon', '', 'year', 'experi', '', 'import', 'qualiti', 'eager', 'aptitud', '', 'posit', 'purpos', '', 'want', 'forefront', 'cut', 'edg', 'technolog', '', 'introduc', 'solut', 'problem', '', 'exist', '', 'abil', 'see', 'result', 'success', '', 'client', 'assur', 'growth', 'collabor', 'develop', 'team', 'creat', 'tool', 'aid', 'engin', 'build', '', 'test', '', 'debug', '', 'releas', 'softwar', '', 'touch', 'million', 'user', 'increas', 'rate', 'develop', 'ensur', 'product', 'method', 'test', '', 'expert', 'softwar', 'health', '', 'testabl', '', 'sustain', '', 'softwar', 'engin', 'test', 'client', '', 'expect', 'build', 'flexibl', 'scalabl', 'solut', 'work', 'complex', 'challeng', 'larg scale', 'comput', 'util', 'skill', 'data', 'structur', 'object', 'orient', 'program', '', 'major', 'task', '', 'respons', 'key', 'account', '', 'lead', 'cont

In [None]:
# Generating one-hot representations for each word in the corpus.

onehot_repr=[one_hot(words,voc_size)for words in corpus] 
print(len(onehot_repr))

1674


In [None]:
# Creating embedding docs with a vector of 5 dimensions to be passed to the LSTM. 

words_length=5
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=words_length)
print(len(embedded_docs))

1674


In [None]:
# Creating model
embedding_vector_features=10
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=words_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

print(model.summary())
print(len(embedded_docs),y.shape)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 5, 10)             1000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 1,044,501
Trainable params: 1,044,501
Non-trainable params: 0
_________________________________________________________________
None
1674 (1674,)


In [None]:
# Creating training and testing data.

X_total=np.array(embedded_docs)
y_total=np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X_total, y_total, test_size=0.20, random_state=40)

# Training the model
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=50,batch_size=64)

# Making predictions.
y_pred=model.predict_classes(X_test)

confusion_matrix(y_test,y_pred)
accuracy_score(y_test,y_pred)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50




0.8208955223880597

In [77]:
# Importing necessary modules.

import pandas as pd 
import nltk
import re
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

# Shortlisting the columns to be used for building the skill set.

columns = ['LanguageWorkedWith','LanguageDesireNextYear','DatabaseWorkedWith','DatabaseDesireNextYear',
           'PlatformWorkedWith','PlatformDesireNextYear','FrameworkWorkedWith','FrameworkDesireNextYear',
           'IDE','OperatingSystem','VersionControl']
Skills = pd.read_csv(filepath_or_buffer='survey_results_public_skills.csv',usecols=columns)
Skills = Skills.dropna()
Skills['Combined_Skills'] = Skills[Skills.columns[1:]].apply(lambda x: ';'.join(x.dropna().astype(str)),axis=1)
Skills['Skill_List'] = Skills['Combined_Skills'].str.split(";")
Unique_Skills = set()
print(Skills)

# Adding all unique skills in a set.

for i in Skills["Skill_List"]:
  for j in i:
      Unique_Skills.add(j.lower())

print(len(Unique_Skills))  
print(Unique_Skills) 

# Filtering the rows with no skills.

Jobs = pd.read_csv(filepath_or_buffer='jobs.csv')
print(Jobs)

JobDescriptions = pd.read_csv(filepath_or_buffer='jobs.csv')
JobDescriptions = JobDescriptions.loc[JobDescriptions['skills'].str.contains('See',na=False,case=False)]
JobDescriptions = JobDescriptions.loc[JobDescriptions['skills'].str.count(',')==0]
JobDescriptions = JobDescriptions.reset_index(drop=True)
print (JobDescriptions)


# Predicting the skills with the help of jobdescription column by using the model trained above.

for i in range(len(JobDescriptions)):
  JobDescriptions.loc[i,"skills"] = " "

  # Preprocessing the data
  JobDescriptions.loc[i,"jobdescription"] = re.sub(r'\[[0-9]*\]',' ',str(JobDescriptions.loc[i,"jobdescription"]))
  JobDescriptions.loc[i,"jobdescription"] = re.sub(r'\s+',' ',JobDescriptions.loc[i,"jobdescription"])
  JobDescriptions.loc[i,"jobdescription"] = JobDescriptions.loc[i,"jobdescription"].lower()
  JobDescriptions.loc[i,"jobdescription"] = re.sub(r'\d',' ',JobDescriptions.loc[i,"jobdescription"])
  JobDescriptions.loc[i,"jobdescription"] = re.sub(r'\s+',' ',JobDescriptions.loc[i,"jobdescription"])
  
  # Breaking down job description into sentences.
  sentences = nltk.sent_tokenize(JobDescriptions.loc[i,"jobdescription"])
 
  sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

  for j in range(len(sentences)):
     sentences[j] = [word for word in sentences[j] if word not in stopwords.words('english')]
    
     for k in sentences[j]:
       if k in Unique_Skills and k not in JobDescriptions.loc[i,"skills"] :
         JobDescriptions.loc[i,"skills"] += k
         JobDescriptions.loc[i,"skills"] += "," 
       elif k not in Unique_Skills:
         onehot_repr=one_hot(k,voc_size) 
         embedded_docs=pad_sequences([onehot_repr],padding='pre',maxlen=words_length)
         if model.predict_classes(embedded_docs)[0][0]==1:
           JobDescriptions.loc[i,"skills"] += k
           JobDescriptions.loc[i,"skills"] += ","  

JobDescriptions["skills"] = JobDescriptions["skills"].str.rstrip(',')
print(JobDescriptions)

Jobs = pd.concat([Jobs, JobDescriptions]).reset_index(drop=True)
Jobs = Jobs.drop_duplicates(subset=['uniq_id'], keep='last')
Jobs = Jobs.reset_index(drop=True)
print(Jobs)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
                               LanguageWorkedWith  ...                                         Skill_List
0                      JavaScript;Python;HTML;CSS  ...  [JavaScript, Python, HTML, CSS, Redis, SQL Ser...
1                    JavaScript;Python;Bash/Shell  ...  [Go, Python, Redis, PostgreSQL, Memcached, Pos...
5      Java;JavaScript;Python;TypeScript;HTML;CSS  ...  [C#, Go, Java, JavaScript, Python, SQL, TypeSc...
13                                           Java  ...  [Java, Python, MongoDB, MySQL, Oracle, MariaDB...
17                     C#;SQL;HTML;CSS;Bash/Shell  ...  [C#, F#, Haskell, SQL, Ocaml, SQL Server, Redi...
...                                           ...  ...                                                ...
92430                     PH



                                         advertiserurl  ...                           uniq_id
0    https://www.dice.com/jobs/detail/AUTOMATION-TE...  ...  418ff92580b270ef4e7c14f0ddfc36b4
1    https://www.dice.com/jobs/detail/Java-Develope...  ...  3941b2f206ae0f900c4fba4ac0b18719
2    https://www.dice.com/jobs/detail/Application-S...  ...  95c9127e2770172f454f3b83981eaa88
3    https://www.dice.com/jobs/detail/Windows-Syste...  ...  9e5704d08bc07ddb6df9ef98b223b036
4    https://www.dice.com/jobs/detail/Java-Architec...  ...  e4f57bc5366124a0a47cac27f557f9ec
..                                                 ...  ...                               ...
188  https://www.dice.com/jobs/detail/JR-Project-ma...  ...  72597cc210e6c518f05dc2cfdd6fd17f
189  https://www.dice.com/jobs/detail/PC-Support-As...  ...  38e97bc0955ee08d7fff35bf9d2c7b6b
190  https://www.dice.com/jobs/detail/Technical-Lea...  ...  ca3b2c097deffef3bef71e8e8dd7b567
191  https://www.dice.com/jobs/detail/Applications-...  ... 

In [79]:
Jobs.to_csv('jobs_modified.csv')