In [116]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer as ps
from nltk import WordNetLemmatizer as wn

from nltk.tokenize import sent_tokenize , word_tokenize
import string
import re



%matplotlib inline

In [117]:
data = pd.read_csv('jobs_data.csv', index_col = 0)
data = data.drop(['industry'], axis = 1) # removing useless features 
# data = data.drop(['Unnamed: 0', 'industry'], axis = 1) # removing useless features 
data.head()

Unnamed: 0,title,jobFunction
0,Full Stack PHP Developer,"['Engineering - Telecom/Technology', 'IT/Softw..."
1,CISCO Collaboration Specialist Engineer,"['Installation/Maintenance/Repair', 'IT/Softwa..."
2,Senior Back End-PHP Developer,"['Engineering - Telecom/Technology', 'IT/Softw..."
3,UX Designer,"['Creative/Design/Art', 'IT/Software Developme..."
4,Java Technical Lead,"['Engineering - Telecom/Technology', 'IT/Softw..."


In [118]:
data.shape

(10870, 2)

In [119]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()


In [120]:
def clean_text(text):
    text = re.sub("[^A-Za-z]", " ", text.strip())                              # Remove non english words
    text = "".join([word for word in text if word not in string.punctuation]) # Removing puncituation
    tokenize = re.split('\W+' , text)                                        # Tokenization    
    text = [word.lower() for word in tokenize if word not in stopwords]     # Removing stopwords , lower case
    text = [word for word in text if len(word)]                            # Remove the empty string
#     text = [ps.stem(word) for word in text]                             # Stemmnig words
    text = " ".join(word for word in text)                             # Make it as a sentence 
    
    return text

def clean_text_1(text):
    text = ''.join([word.lower() for word in text if word not in string.punctuation])  # Remove Punctuation
    tokens = re.split('\W+',text)                                                     # Tokenize 
    text = [ps.stem(word) for word in tokens if word not in stopwords]               # Remove stopwordsand stem
    return text


In [121]:
data['clean_title'] = data['title'].apply(lambda x : clean_text(x))
data['clean_jobFunction'] = data['jobFunction'].apply(lambda x : clean_text(x))

# delete the old cols
data = data.drop(['title', 'jobFunction'], axis=1)
data.head(10)

Unnamed: 0,clean_title,clean_jobFunction
0,full stack php developer,engineering telecom technology it software dev...
1,cisco collaboration specialist engineer,installation maintenance repair it software de...
2,senior back end php developer,engineering telecom technology it software dev...
3,ux designer,creative design art it software development
4,java technical lead,engineering telecom technology it software dev...
5,technical support engineer,it software development engineering telecom te...
6,senior ios developer,engineering telecom technology it software dev...
7,mechanical engineer,engineering mechanical electrical
8,real estate sales specialist th ramadan,sales retail
9,school principal,education teaching administration operations m...


In [122]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text_1)
x_counts = count_vect.fit_transform(data['clean_title'])
print(x_counts.shape) # (text_message , unique_word) , (row,col)
print(count_vect.get_feature_names())

(10870, 1128)
['', 'abap', 'abb', 'abbassia', 'abc', 'abl', 'abu', 'access', 'account', 'acquisit', 'act', 'activ', 'ad', 'addicta', 'adf', 'adjoint', 'admin', 'administr', 'admiss', 'admixtur', 'adult', 'advertis', 'advis', 'advisor', 'adword', 'af', 'affair', 'affili', 'afgr', 'agenc', 'agent', 'agil', 'agouza', 'agricultur', 'ai', 'air', 'ajman', 'al', 'alamein', 'alarm', 'alex', 'alexandria', 'ali', 'alsadat', 'aluminum', 'amc', 'american', 'analysi', 'analyst', 'analyt', 'android', 'angular', 'angularj', 'anim', 'ap', 'apex', 'api', 'app', 'applic', 'approv', 'ar', 'arab', 'arabia', 'architect', 'architectur', 'archiv', 'arduino', 'area', 'armor', 'art', 'artifici', 'artist', 'ash', 'asp', 'assess', 'assist', 'assiut', 'associ', 'assur', 'asu', 'aswan', 'atm', 'audienc', 'audit', 'auditor', 'autocad', 'autom', 'automot', 'avaya', 'aviat', 'ax', 'azhar', 'azur', 'b', 'back', 'backend', 'background', 'baker', 'balanc', 'banha', 'bank', 'base', 'basic', 'bathtub', 'bd', 'beauti', 'be

## Applying counterVectorizer in smaller sample


In [123]:
# data_sample = data[0:20]
# data_sample
# count_vect_sample  = CountVectorizer(analyzer = clean_text_1)
# x_count_sample = count_vect_sample.fit_transform(data_sample['clean_title'])
# print(x_count_sample.shape)
# print(count_vect_sample.get_feature_names())


In [124]:
# x_count_sample


In [125]:
# x_count_sample.toarray()


In [126]:
# x_count_sample_df = pd.DataFrame(x_count_sample.toarray())

# x_count_sample_df.columns = count_vect_sample.get_feature_names()
# x_count_sample_df

## Applying counterVectorizer all data


In [127]:
all_data= data
count_vect_all_data  = CountVectorizer(analyzer = clean_text_1)
x_count_all_data = count_vect_all_data.fit_transform(all_data['clean_title'])
print(x_count_all_data.shape)
print(count_vect_all_data.get_feature_names())



(10870, 1128)
['', 'abap', 'abb', 'abbassia', 'abc', 'abl', 'abu', 'access', 'account', 'acquisit', 'act', 'activ', 'ad', 'addicta', 'adf', 'adjoint', 'admin', 'administr', 'admiss', 'admixtur', 'adult', 'advertis', 'advis', 'advisor', 'adword', 'af', 'affair', 'affili', 'afgr', 'agenc', 'agent', 'agil', 'agouza', 'agricultur', 'ai', 'air', 'ajman', 'al', 'alamein', 'alarm', 'alex', 'alexandria', 'ali', 'alsadat', 'aluminum', 'amc', 'american', 'analysi', 'analyst', 'analyt', 'android', 'angular', 'angularj', 'anim', 'ap', 'apex', 'api', 'app', 'applic', 'approv', 'ar', 'arab', 'arabia', 'architect', 'architectur', 'archiv', 'arduino', 'area', 'armor', 'art', 'artifici', 'artist', 'ash', 'asp', 'assess', 'assist', 'assiut', 'associ', 'assur', 'asu', 'aswan', 'atm', 'audienc', 'audit', 'auditor', 'autocad', 'autom', 'automot', 'avaya', 'aviat', 'ax', 'azhar', 'azur', 'b', 'back', 'backend', 'background', 'baker', 'balanc', 'banha', 'bank', 'base', 'basic', 'bathtub', 'bd', 'beauti', 'be

In [128]:
x_count_all_data.toarray()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [129]:
x_count_all_data_df = pd.DataFrame(x_count_all_data.toarray())

x_count_all_data_df.columns = count_vect_all_data.get_feature_names()
x_count_all_data_df

Unnamed: 0,Unnamed: 1,abap,abb,abbassia,abc,abl,abu,access,account,acquisit,...,writer,xamarin,yard,year,yii,young,youtub,yt,zagazig,zay
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Apply TF-IDF Vctorizer


In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text_1)
x_tfidf = tfidf_vect.fit_transform(data['clean_title'])
print(x_tfidf.shape) # (text_message , unique_word) , (row,col)
print(tfidf_vect.get_feature_names())


(10870, 1128)
['', 'abap', 'abb', 'abbassia', 'abc', 'abl', 'abu', 'access', 'account', 'acquisit', 'act', 'activ', 'ad', 'addicta', 'adf', 'adjoint', 'admin', 'administr', 'admiss', 'admixtur', 'adult', 'advertis', 'advis', 'advisor', 'adword', 'af', 'affair', 'affili', 'afgr', 'agenc', 'agent', 'agil', 'agouza', 'agricultur', 'ai', 'air', 'ajman', 'al', 'alamein', 'alarm', 'alex', 'alexandria', 'ali', 'alsadat', 'aluminum', 'amc', 'american', 'analysi', 'analyst', 'analyt', 'android', 'angular', 'angularj', 'anim', 'ap', 'apex', 'api', 'app', 'applic', 'approv', 'ar', 'arab', 'arabia', 'architect', 'architectur', 'archiv', 'arduino', 'area', 'armor', 'art', 'artifici', 'artist', 'ash', 'asp', 'assess', 'assist', 'assiut', 'associ', 'assur', 'asu', 'aswan', 'atm', 'audienc', 'audit', 'auditor', 'autocad', 'autom', 'automot', 'avaya', 'aviat', 'ax', 'azhar', 'azur', 'b', 'back', 'backend', 'background', 'baker', 'balanc', 'banha', 'bank', 'base', 'basic', 'bathtub', 'bd', 'beauti', 'be

## Apply TF-IDF Vctorizer on smaller sample

In [131]:
# data_sample = data[0:20]

# tfidf_vect_sample = TfidfVectorizer(analyzer = clean_text_1)
# x_tfidf_sample = tfidf_vect_sample.fit_transform(data_sample['clean_title'])
# print(x_tfidf_sample.shape)
# print(tfidf_vect_sample.get_feature_names())


In [132]:
# x_tfidf_df = pd.DataFrame(x_tfidf_sample.toarray())
# x_tfidf_df.columns = tfidf_vect_sample.get_feature_names()
# x_tfidf_df


## Applying TF-IDF in all  data

In [133]:
all_data_tfidf = data
tfidf_vect_all_data = TfidfVectorizer(analyzer = clean_text_1)
x_tfidf_all_data = tfidf_vect_all_data.fit_transform(all_data_tfidf['clean_title'])
print(x_tfidf_all_data.shape)
print(tfidf_vect_all_data.get_feature_names())

(10870, 1128)
['', 'abap', 'abb', 'abbassia', 'abc', 'abl', 'abu', 'access', 'account', 'acquisit', 'act', 'activ', 'ad', 'addicta', 'adf', 'adjoint', 'admin', 'administr', 'admiss', 'admixtur', 'adult', 'advertis', 'advis', 'advisor', 'adword', 'af', 'affair', 'affili', 'afgr', 'agenc', 'agent', 'agil', 'agouza', 'agricultur', 'ai', 'air', 'ajman', 'al', 'alamein', 'alarm', 'alex', 'alexandria', 'ali', 'alsadat', 'aluminum', 'amc', 'american', 'analysi', 'analyst', 'analyt', 'android', 'angular', 'angularj', 'anim', 'ap', 'apex', 'api', 'app', 'applic', 'approv', 'ar', 'arab', 'arabia', 'architect', 'architectur', 'archiv', 'arduino', 'area', 'armor', 'art', 'artifici', 'artist', 'ash', 'asp', 'assess', 'assist', 'assiut', 'associ', 'assur', 'asu', 'aswan', 'atm', 'audienc', 'audit', 'auditor', 'autocad', 'autom', 'automot', 'avaya', 'aviat', 'ax', 'azhar', 'azur', 'b', 'back', 'backend', 'background', 'baker', 'balanc', 'banha', 'bank', 'base', 'basic', 'bathtub', 'bd', 'beauti', 'be

In [134]:
x_tfidf_all_data_df = pd.DataFrame(x_tfidf_all_data.toarray())
x_tfidf_all_data_df.columns = tfidf_vect_all_data.get_feature_names()
x_tfidf_all_data_df


Unnamed: 0,Unnamed: 1,abap,abb,abbassia,abc,abl,abu,access,account,acquisit,...,writer,xamarin,yard,year,yii,young,youtub,yt,zagazig,zay
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cosine Similarity
* I will be using the Cosine Similarity to calculate a numeric quantity that denotes the similarity between two movies. Mathematically, it is defined as follows:

* cosine(x,y)=x.y⊺||x||.||y|| 
* Since we have used the TF-IDF Vectorizer, calculating the Dot Product will directly give us the Cosine Similarity Score. Therefore, we will use sklearn's linear_kernel instead of cosine_similarities since it is much faster.



In [167]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(x_tfidf_all_data_df, x_tfidf_all_data_df)
cosine_sim[0]

array([1.        , 0.        , 0.39014129, ..., 0.        , 0.        ,
       0.11364868])

In [170]:
cosine_sim.shape

(10870, 10870)

## Train the tf-idf -Query


In [171]:

# test_set = "Machine Learning Engineer".split(' ')  # Query
# test_set
# testVectorizerArray = vectorizer.fit_transform(test_set).toarray()
# print(testVectorizerArray)
# cosine_similarities = linear_kernel(test_set, x_tfidf_all_data_df).flatten()
# cosine_similarities

In [172]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# from nltk.corpus import stopwords
# import numpy as np
# import numpy.linalg as LA

# train_set = ["The sky is blue.", "The sun is bright."]  # Documents
# test_set = ["The sun in the sky is bright."]  # Query

# vectorizer = CountVectorizer(analyzer = clean_text_1)
# #print vectorizer
# transformer = TfidfTransformer(analyzer = clean_text_1)
# #print transformer

# # trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
# testVectorizerArray = vectorizer.transform(test_set).toarray()
# # print('Fit Vectorizer to train set', trainVectorizerArray)
# print('Transform Vectorizer to test set', testVectorizerArray)

# # transformer.fit(trainVectorizerArray)
# # print(transformer.transform(trainVectorizerArray).toarray())

# transformer.fit(testVectorizerArray)
# tfidf = transformer.transform(testVectorizerArray)
# print(tfidf.todense())

In [173]:
# test_set = ["Machine Learning Engineer"]  # Query

# count_test_query  = CountVectorizer(analyzer = clean_text_1)
# tfidf_test_query = TfidfVectorizer(analyzer = clean_text_1)

# testVectorizerArray = count_test_query.fit_transform(test_set).toarray()
# print('Transform Vectorizer to test set', testVectorizerArray)

# tfidf = tfidf_test_query.fit_transform(testVectorizerArray)

In [174]:
st = "Hello my freind i'm Machine Learning Engineer!".split(" ")
st
# print(clean_text_1(st))

['Hello', 'my', 'freind', "i'm", 'Machine', 'Learning', 'Engineer!']