In [58]:
import pandas as pd
import glob
import docx2txt

In [59]:
## First step read the different csv files that we extract from Indeed
def path(x):
    all_files = glob.glob(x + "/*.csv")
    return all_files

## Insert the path of the folder that contains different csv files.
all_files = path(r"C:\Users\nikos\Desktop\dataframes")

In [60]:
def create_df(files, li):
    for filename in files:
        df = pd.read_csv(filename, index_col=None, header=0)
        li.append(df)
        df = pd.concat(li, axis=0, ignore_index=True)
    return df
df = create_df(all_files, [])
df.shape

(182, 7)

In [61]:
## Remove duplicates
def Shape(df):
    shape = [df.shape]
    if ([x[0] for x in shape][0]) > len(df["Job Url"].unique()):
        print("Droped Duplicates")
        df = df.drop_duplicates(subset=['Job Url'])
        return df
    else:
        print("We don't have Duplicates")
        return df

new_df = Shape(df)
new_df.shape

Droped Duplicates


(171, 7)

In [62]:
## reading the canditate's cv
def read_cv(name):
    # read the word file
    x =  docx2txt.process(name)
    return x
    
cv = read_cv("monster-cv-template-admin-assistant.docx") 

In [63]:
## Extract candidate email, and applying some basic text preprocessing 
def text_pre(x):
    ## Applying some text preprocessing
    text = x.replace("\n", "")
    return text
def extract_email(x):
    ## Extracting the email address using regex  
    import re
    match = re.search(r'[\w\.-]+@[\w\.-]+', x)
    email = match.group(0)
    return email
    
text = text_pre(cv)
email = extract_email(text)

In [64]:
## So, now we can add these two attributes as a row to our main dataframe
def insert_row(email, text, df):
    new_row = {'Job Url':email, 'Desc': text}
    df= df.append(new_row, ignore_index=True)
    return df

new_df = insert_row(email, text, new_df)
new_df.tail()

Unnamed: 0,Job Title,Job Url,Company,Location,Summary,Posting Date,Desc
167,Order Picker Warehouse Operative,http://www.indeed.com/rc/clk?jk=8b7dd86955a721...,Premier Work Support,London,To be successful in this role you need warehou...,23 days ago,We have an exciting opportunity to join a very...
168,Warehouse Operative - Immediate Start,http://www.indeed.com/pagead/clk?mo=r&ad=-6NYl...,ABC Depot,Finchley Central Station,Must have 3-5 years Builders Merchant experien...,30+ days ago,DutiesMust have 3-5 years Builders Merchant ex...
169,Warehouse Operative,http://www.indeed.com/company/White-Van-Gentle...,White Van Gentlemen,Earlsfield,Organizing / keeping tidy the warehouse.\nPall...,30+ days ago,Job DescriptionWhite Van Gentlemen is a white ...
170,Warehouse Operative - Picker / Packer / Replen,http://www.indeed.com/company/All-Pet-Solution...,All Pet Solutions,Uxbridge,Working within a team of warehouse operatives....,3 days ago,All Pet Solutions is an online market leader a...
171,,name@hotmail.com,,,,,"Uschi BarkerAddress: Flat 0, Any Road, Any Tow..."


### Perfect the new candidate is into our main dataframe, So now we can start developing the recommendation  model.

In [67]:
## Applying text preprocessing modules to Desc column
## text preprocessing 
import nltk 

nltk.download('punkt') 

nltk.download('averaged_perceptron_tagger') 

nltk.download('wordnet') 

    
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer() 

  

from nltk.corpus import stopwords 

nltk.download('stopwords') 

stop_words = set(stopwords.words('english')) 

  

VERB_CODES = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nikos\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [93]:
def preprocess_sentences(text): 

    text = text.lower() 
    temp_sent =[] 
    words = nltk.word_tokenize(text) 
    tags = nltk.pos_tag(words) 
    
    for i, word in enumerate(words): 
        if tags[i][1] in VERB_CODES:   
            lemmatized = lemmatizer.lemmatize(word, 'v') 
        else: 
            lemmatized = lemmatizer.lemmatize(word) 
        if lemmatized not in stop_words and lemmatized.isalpha(): 
            temp_sent.append(lemmatized) 

    finalsent = ' '.join(temp_sent) 
    finalsent = finalsent.replace("n't", " not") 
    finalsent = finalsent.replace("'m", " am") 
    finalsent = finalsent.replace("'s", " is") 
    finalsent = finalsent.replace("'re", " are") 
    finalsent = finalsent.replace("'ll", " will") 
    finalsent = finalsent.replace("'ve", " have") 
    finalsent = finalsent.replace("'d", " would") 

    return finalsent 


## Creating a new column that we applied the text manipulation function 
new_df["Desc proc"] = new_df["Desc"].apply(preprocess_sentences) 
final_data = new_df[["Job Url", "Desc proc"]]


### After data preprocessing, we are ready to develop the recomandation model. I will use the Cosine similarity metric in order to determine text similarity

In [94]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

count = CountVectorizer()
count_matrix = count.fit_transform(final_data['Desc proc'])
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         0.04126661 0.18151655 ... 0.08260162 0.07620499 0.10476454]
 [0.04126661 1.         0.10875201 ... 0.14913013 0.0880522  0.09126919]
 [0.18151655 0.10875201 1.         ... 0.18950196 0.14201333 0.15494925]
 ...
 [0.08260162 0.14913013 0.18950196 ... 1.         0.26988337 0.12259438]
 [0.07620499 0.0880522  0.14201333 ... 0.26988337 1.         0.07983581]
 [0.10476454 0.09126919 0.15494925 ... 0.12259438 0.07983581 1.        ]]


In [95]:
## Finding the index of the candidate's cv.
def get_index_from_url(url):
    return final_data[final_data["Job Url"] == url].index.values[0]
    
url = get_index_from_url(email)

In [96]:
## Creating a list with similarily of our CV with the job description
similar_jobs = list(enumerate(cosine_sim[url]))

## Sort the list in descending order
sorted_similar_jobs = sorted(similar_jobs, key=lambda x:x[1], reverse=True)
sorted_similar_jobs

[(171, 0.9999999999999986),
 (109, 0.26764693139028045),
 (13, 0.2418728449756597),
 (17, 0.23551119173538854),
 (14, 0.2288626653694555),
 (149, 0.22623995411025635),
 (15, 0.22403305120122177),
 (21, 0.22403305120122177),
 (146, 0.22399601788396722),
 (23, 0.22154790046665213),
 (24, 0.21782118162804753),
 (27, 0.21782118162804753),
 (28, 0.21782118162804753),
 (29, 0.21782118162804753),
 (30, 0.21782118162804753),
 (32, 0.21782118162804753),
 (33, 0.21782118162804753),
 (36, 0.21782118162804753),
 (38, 0.21782118162804753),
 (39, 0.21782118162804753),
 (41, 0.21782118162804753),
 (42, 0.21782118162804753),
 (43, 0.21782118162804753),
 (44, 0.21782118162804753),
 (45, 0.21782118162804753),
 (52, 0.21782118162804753),
 (132, 0.21746852605578149),
 (118, 0.21670406454457042),
 (37, 0.21532761076060403),
 (25, 0.21181345437818172),
 (84, 0.21115368309936872),
 (135, 0.20586606098082774),
 (164, 0.204609985676822),
 (35, 0.2013488589318756),
 (11, 0.19818626434583295),
 (154, 0.196182886

In [97]:
## Printing some jobs that are fitting better to your CV
pd.options.display.max_colwidth = 1000

def get_title_from_url(index):
    return final_data[final_data.index == index]["Job Url"]
i=0
for job in sorted_similar_jobs:
    print(get_title_from_url(job[0]))
    i=i+1
    if i>5:
        break

171    name@hotmail.com
Name: Job Url, dtype: object
109    http://www.indeed.com/company/CentraNic-Ltd/jobs/Group-Financial-Data-Analyst-59999e06fd4b9009?fccid=004345d9813bd437&vjs=3
Name: Job Url, dtype: object
13    http://www.indeed.com/company/CriterionCapital/jobs/Office-Administration-Assistant-64da8cde37d422d9?fccid=1931435125b82129&vjs=3
Name: Job Url, dtype: object
17    http://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0BIQv-klv4x57wzcCCXZDuUs4ETBBTY7U4BZbqajjMT5rLx4iIBIgIDjvqIt6UO8LKeIOY33Wnt4_eGGFmqJeUFdqLBu7U5oyAp-J0dXDp4UiTLVL041HcriHxDT6myJ6B1t5jySkfSP0xrQ1MSGJug_oWZSIBng5uU3tgIaZmdrw1f0HFsYk5o_w5zejOWcDSmC6lgzvZJ6vOa82rSFId3FatHT_qfXMi-PufkEZX4WyY6n0oncWV21jlODJXWsKuoJYw7GGwET4yAfy66eZiJmeyDU1xf9Dt8-V27KKcbAybIWSHq7Mjgv8OjIYZvnDEPVvDV5XHAtg10Eimq-WRh4abc6WEORw6KeHRnDyo2JYUI_WLt98BE3GEtIU9J_2zOiS1fykc7VgdRV6Zb9Gbk5l3k44WDTbVg7sH7h6rRjCG67hJvDDgIMKNCQzMTdIY=&p=2&fvj=1&vjs=3
Name: Job Url, dtype: object
14    http://www.indeed.com/company/Agina-ltd/jobs/Office-Administrat

In [98]:
## Take only the links and send an email to the person that is looking for a position similar to their CV 
df = pd.DataFrame(sorted_similar_jobs[1:6], columns=["index", "similarity"])
df    

Unnamed: 0,index,similarity
0,109,0.267647
1,13,0.241873
2,17,0.235511
3,14,0.228863
4,149,0.22624


In [99]:
## Above is a dataframe with the job's index and the degree of job similarity based on the CV that I uploaded
## I need the Links in a text shape in order to create the message, thus I found the links using the data frame indexes
## and I saved them into a text file

text = []
for i in df["index"]:
    text.append(final_data["Job Url"].iloc[i])
    with open("message.txt", 'w') as f:
         f.write("\n\n".join(map(str, text)))

file = open('message.txt','r')

#read the numbers on the file
body = file.read()

#Close the the numbers file
file.close()

#Print the Links, in other words the text that I will produce the messages
print(body)

http://www.indeed.com/company/CentraNic-Ltd/jobs/Group-Financial-Data-Analyst-59999e06fd4b9009?fccid=004345d9813bd437&vjs=3

http://www.indeed.com/company/CriterionCapital/jobs/Office-Administration-Assistant-64da8cde37d422d9?fccid=1931435125b82129&vjs=3

http://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0BIQv-klv4x57wzcCCXZDuUs4ETBBTY7U4BZbqajjMT5rLx4iIBIgIDjvqIt6UO8LKeIOY33Wnt4_eGGFmqJeUFdqLBu7U5oyAp-J0dXDp4UiTLVL041HcriHxDT6myJ6B1t5jySkfSP0xrQ1MSGJug_oWZSIBng5uU3tgIaZmdrw1f0HFsYk5o_w5zejOWcDSmC6lgzvZJ6vOa82rSFId3FatHT_qfXMi-PufkEZX4WyY6n0oncWV21jlODJXWsKuoJYw7GGwET4yAfy66eZiJmeyDU1xf9Dt8-V27KKcbAybIWSHq7Mjgv8OjIYZvnDEPVvDV5XHAtg10Eimq-WRh4abc6WEORw6KeHRnDyo2JYUI_WLt98BE3GEtIU9J_2zOiS1fykc7VgdRV6Zb9Gbk5l3k44WDTbVg7sH7h6rRjCG67hJvDDgIMKNCQzMTdIY=&p=2&fvj=1&vjs=3

http://www.indeed.com/company/Agina-ltd/jobs/Office-Administrator-d3d925d98ebd3d3d?fccid=33d409a0d113e3d3&vjs=3

http://www.indeed.com/company/Builder-Depot/jobs/Packing-Dispatch-Warehouse-Assistant-4128a5380a31f2fe?fccid=9ee

### The final step is to send an email to the candidate using the smtplib module. I also attached an image at the bottom of the email.

In [100]:
## First step, open my gmail account
## Read a file with my personal info
file = open("my_personal_file.txt")
lines = file.readlines()

In [102]:
import smtplib
import imghdr
from email.message import EmailMessage
Sender_Email = "nikoskalikis@gmail.com"

# You can send the emails with two methods, 1) By sending an email to every person in your list, 
## so everyone will be able to see the other person that you have in your list
## OR 2) to every person separately which is the method that I used.

Reciever_Email = ["nikoskalikis@gmail.com", "despoina615@hotmail.com"]
Password = lines[1]
for i in Reciever_Email:
    try:
        newMessage = EmailMessage()                         
        newMessage['Subject'] = "Check some new positions for you!!!" 
        newMessage['From'] = Sender_Email                   
        newMessage['To'] = i                   
        newMessage.set_content(f"Apply to the following positions\n\n " + body) 
        with open('logo.png', 'rb') as f:
            image_data = f.read()
            image_type = imghdr.what(f.name)
            image_name = f.name
        newMessage.add_attachment(image_data, maintype='image', subtype=image_type, filename=image_name)
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as smtp:

            smtp.login(Sender_Email, Password)              
            smtp.send_message(newMessage)
            print("Successfully Sent email !!!")
    except Exception:
           print("Error: unable to send email")

Successfully Sent email !!!
Successfully Sent email !!!
