<a href="https://colab.research.google.com/github/kuzhuppillil/nU8E2quXIo33gksu/blob/main/Potential_talents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [151]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import re

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [146]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

tiv = TfidfVectorizer(ngram_range=(1,3),max_features = 10)
cv = CountVectorizer(ngram_range=(1,3))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [127]:
data = pd.read_csv("/content/drive/Othercomputers/My Laptop/Apziva Projects/Potential Talents/potential-talents - Aspiring human resources - seeking human resources.csv")
data.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [128]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          104 non-null    int64  
 1   job_title   104 non-null    object 
 2   location    104 non-null    object 
 3   connection  104 non-null    object 
 4   fit         0 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.2+ KB


In [129]:
len(data) - len(data[["job_title","location","connection"]].drop_duplicates())

51

Observation:



*   'fit' column is the target
*   Columns 'job_title' and 'location' are the main text columns, where NLP techniques need to be applied.
*   'connection' column contains the numerics with exepection of some '+' symbols.
* There are lot of duplicates in the data, these duplicates has the same jobtites, location and connection.
* The locations seem to be in different format and contains some spelling mistakes.





# Text Preprocessing:

In [130]:
#create a copy of data without the duplicates and with orignials indexes
data1 = data.drop_duplicates(subset=["job_title","location","connection"]).copy()

In [131]:
#removed the +symbol from the connection and changed the dtype to int
data1["connection"] = data1["connection"].str.replace("+","",regex = True).astype(int)

In [132]:
#contents in location column
data1.location.unique()

array(['Houston, Texas', 'Kanada', 'Raleigh-Durham, North Carolina Area',
       'Denton, Texas', 'İzmir, Türkiye', 'Greater New York City Area',
       'San Francisco Bay Area', 'Greater Philadelphia Area',
       'Lake Forest, California', 'Houston, Texas Area',
       'Atlanta, Georgia', 'Chicago, Illinois', 'Austin, Texas Area',
       'Jackson, Mississippi Area', 'Greater Grand Rapids, Michigan Area',
       'Virginia Beach, Virginia', 'Monroe, Louisiana Area',
       'Greater Boston Area', 'San Jose, California',
       'New York, New York', 'Dallas/Fort Worth Area',
       'Amerika Birleşik Devletleri', 'Baton Rouge, Louisiana Area',
       'Myrtle Beach, South Carolina Area', 'Chattanooga, Tennessee Area',
       'Los Angeles, California', 'Highland, California',
       'Gaithersburg, Maryland', 'Baltimore, Maryland',
       'Milpitas, California', 'Greater Atlanta Area',
       'Greater Chicago Area', 'Torrance, California',
       'Long Beach, California', 'Bridgewater, Massa

In [133]:

#fucntion to clean the location text
def cleaning(location):

  #contains unnecessary words like 'Area', 'City' and 'Greater, removing them
  location= re.sub("Area","",location)
  location= re.sub("Greater","",location)
  location= re.sub("City","",location)

  #fixing spelling and language check
  location = re.sub("Kanada","Canada",location)
  location = re.sub("Amerika Birleşik Devletleri","United States of America",location)
  location = re.sub("İzmir, Türkiye","izmir, Turkey",location)

  #fixing format
  location = re.sub("Dallas/Fort Worth","Dallas, Fort Worth",location)
  location = re.sub("New York, New York","New York",location)

  #fixing whitespace at start and end of names
  location = re.sub("^\s+|\s+$","",location)

  location = location.lower()

  return location

data1.location = data1.location.apply(lambda x: cleaning(x))
data1.location.unique()

array(['houston, texas', 'canada', 'raleigh-durham, north carolina',
       'denton, texas', 'izmir, turkey', 'new york', 'san francisco bay',
       'philadelphia', 'lake forest, california', 'atlanta, georgia',
       'chicago, illinois', 'austin, texas', 'jackson, mississippi',
       'grand rapids, michigan', 'virginia beach, virginia',
       'monroe, louisiana', 'boston', 'san jose, california',
       'dallas, fort worth', 'united states of america',
       'baton rouge, louisiana', 'myrtle beach, south carolina',
       'chattanooga, tennessee', 'los angeles, california',
       'highland, california', 'gaithersburg, maryland',
       'baltimore, maryland', 'milpitas, california', 'atlanta',
       'chicago', 'torrance, california', 'long beach, california',
       'bridgewater, massachusetts', 'lafayette, indiana',
       'kokomo, indiana', 'las vegas, nevada', 'cape girardeau, missouri',
       'los angeles', 'katy, texas'], dtype=object)

In [145]:
#function to clean the text body, tokenization and lemmatization

def process(text):

  #Filtering out the non-alphabets
  text = re.sub(r"[^a-zA-z\'\s]","",text)

  #converted all text to lowercase for consistency
  text = text.lower()

  cleaned_sentences = []
  words = nltk.word_tokenize(text)
  lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
  #stemm_words = [stemmer.stem(word) for word in words if word not in stopwords.words('english')]
  cleaned_text = ' '.join(lemmatized_words)

  return cleaned_text


data1["job_title"] = data1.job_title.apply(lambda x: process(x))



In [144]:
#Frequency count of words

def freq(text):
  word_list = []

  for word in text.str.split():
    word_list.extend(word)
  freq_word = pd.Series(word_list).value_counts()

  return freq_word

word_freq = freq(data1.job_title)


word_freq[:10]


human           34
resource        34
aspiring        13
professional    10
seeking         10
manager          7
student          6
university       6
management       5
generalist       5
dtype: int64

In [148]:
#Bag Of Words

X=cv.fit_transform(data1.job_title)
cv.vocabulary_


{'ct': 101,
 'bauer': 36,
 'college': 76,
 'business': 54,
 'graduate': 176,
 'magna': 283,
 'cum': 104,
 'laude': 262,
 'aspiring': 26,
 'human': 205,
 'resource': 407,
 'professional': 383,
 'ct bauer': 102,
 'bauer college': 37,
 'college business': 79,
 'business graduate': 55,
 'graduate magna': 177,
 'magna cum': 284,
 'cum laude': 105,
 'laude aspiring': 263,
 'aspiring human': 27,
 'human resource': 206,
 'resource professional': 436,
 'ct bauer college': 103,
 'bauer college business': 38,
 'college business graduate': 80,
 'business graduate magna': 56,
 'graduate magna cum': 178,
 'magna cum laude': 285,
 'cum laude aspiring': 106,
 'laude aspiring human': 264,
 'aspiring human resource': 28,
 'human resource professional': 220,
 'native': 327,
 'english': 144,
 'teacher': 519,
 'epik': 154,
 'program': 390,
 'korea': 260,
 'native english': 328,
 'english teacher': 147,
 'teacher epik': 520,
 'epik english': 155,
 'english program': 145,
 'program korea': 391,
 'native engl

In [153]:
#TF-IDF


keywords = ["aspiring human resources", "seeking human resources"]

X = tiv.fit_transform(data1.job_title)
keyword_vectors = tiv.transform(keywords)

similarity_scores = cosine_similarity(keyword_vectors, X)
max_similarity_scores = similarity_scores.max(axis=0)

data1['fit'] = max_similarity_scores

data1 = data1.sort_values(by='fit', ascending=False)

data1.head(5)

Unnamed: 0,id,job_title,location,connection,fit
91,92,seeking employment opportunity within customer...,"torrance, california",64,0.86653
27,28,seeking human resource opportunity,"chicago, illinois",390,0.816973
69,70,retired army national guard recruiter office m...,"virginia beach, virginia",82,0.816973
98,99,seeking human resource position,"las vegas, nevada",48,0.816973
93,94,seeking human resource opportunity open travel...,united states of america,415,0.816973
