In [2]:
# If you're running locally and don't have these libraries installed, uncomment the line below.
# (scikit-learn is the PyPI package that provides the `sklearn` module.)
# %pip install pyresparser Flask numpy pandas nltk scikit-learn ftfy spacy python-docx

# Download the NLTK 'stopwords' corpus; the English list is loaded from it further down
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from pyresparser import ResumeParser
from docx import Document
import numpy as np
import pandas as pd
import re
from ftfy import fix_text
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import spacy


In [4]:
# Collect NLTK's English stopwords into a set for quick membership tests
stopw = set()
stopw.update(stopwords.words('english'))


In [5]:
# Load the job postings; 'data.csv' is expected in the working directory
df = pd.read_csv('data.csv')

# Clean job descriptions: drop English stopwords and short words (<3 characters).
# - The stopword lookup lowercases each word first, because the NLTK English
#   stopword list is lowercase and would otherwise miss "The", "And", ...
# - Missing descriptions become an empty string rather than the literal
#   token "nan" that str(NaN) would produce.
df['test'] = df['Job_Description'].fillna('').apply(
    lambda x: ' '.join(
        word
        for word in str(x).split()
        if len(word) > 2 and word.lower() not in stopw
    )
)

# Print a few rows to verify the load
print(df['Location'].head())


0    Noida
1    Noida
2    Noida
Name: Location, dtype: object


In [10]:
file_path = 'test2.pdf'

# Parse the resume (stands in for a file upload). ResumeParser is called with
# the path only, i.e. without any custom NLP models. Any failure is reported
# and replaced by an empty result so the lookup below still works.
try:
    data = ResumeParser(file_path).get_extracted_data()
except Exception as e:
    print("Error processing PDF document:", e)
    data = {}
else:
    print("PDF processed successfully")

# Skills found in the resume; empty list when the key is absent
resume = data.get('skills', [])
print("Skills extracted:", resume)




PDF processed successfully
Skills extracted: ['Tableau', 'Testing', 'Economics', 'Marketing', 'Seo', 'Experiments', 'Excel', 'Analysis', 'Reports', 'Retention', 'Pandas', 'Improvement', 'Python', 'Mathematics', 'Kpis', 'Modeling', 'Conversion', 'Analytics', 'Sql', 'R', 'Reporting']


In [11]:
# Join the resume skills into one space-separated string and wrap it in a
# one-element list (a single-document corpus for the vectorizer below).
skills = [' '.join(resume)]
org_name_clean = skills

# Define a function to generate n-grams from text (used later for vectorization)
def ngrams(string, n=3):
    """Normalise ``string`` and return its overlapping character n-grams.

    The text is passed through ``fix_text``, reduced to ASCII, lowercased,
    stripped of brackets/dots/pipes/apostrophes, has ``&`` spelled out as
    ``and`` and commas/hyphens turned into spaces, is title-cased, has runs
    of spaces collapsed, and is finally padded with one space on each side.

    Args:
        string: Text to tokenise.
        n: Length of each n-gram (default 3).

    Returns:
        A list of ``n``-character substrings, one per start position; empty
        when the padded text is shorter than ``n``.
    """
    # Let ftfy clean the text, then drop anything that is still non-ASCII
    text = fix_text(string).encode("ascii", errors="ignore").decode().lower()

    # Characters removed outright: ) ( . | [ ] { } '
    removal_pattern = '[' + re.escape(")(.|[]{}'") + ']'
    text = re.sub(removal_pattern, '', text)

    text = text.replace('&', 'and').replace(',', ' ').replace('-', ' ')
    text = text.title()

    # Collapse repeated spaces, trim, then pad so word edges get their own n-grams
    text = ' ' + re.sub(' +', ' ', text).strip() + ' '
    text = re.sub(r'[,-./]|\sBD', r'', text)

    # Zipping n progressively shifted copies yields every window of n characters
    windows = zip(*(text[offset:] for offset in range(n)))
    return [''.join(window) for window in windows]

# Fit TF-IDF on the resume corpus. `ngrams` is the analyzer, so every feature
# is one of the character n-grams it returns.
vectorizer = TfidfVectorizer(
    analyzer=ngrams,
    lowercase=False,
    min_df=1,
)
tfidf = vectorizer.fit_transform(org_name_clean)
print('Vectorizing completed...')


Vectorizing completed...


In [12]:
# Function to find nearest neighbors based on the resume's skills
def getNearestN(query):
    """Return the nearest-neighbour distances and indices for ``query``.

    ``query`` is transformed with the module-level ``vectorizer`` and the
    result is passed to the module-level ``nbrs`` model; both names are
    looked up when the function is called.

    Returns:
        The ``(distances, indices)`` pair produced by ``nbrs.kneighbors``.
    """
    neighbour_distances, neighbour_indices = nbrs.kneighbors(
        vectorizer.transform(query)
    )
    return neighbour_distances, neighbour_indices

# Fit the Nearest Neighbors model on the resume's TF-IDF matrix (`tfidf`).
# Each job description is then used as a query, so the returned distance is
# "how far is this job from the resume" - smaller means a closer match.
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)

# Prepare job descriptions for matching
unique_org = df['test'].values
distances, indices = getNearestN(unique_org)

# Distance to the single nearest neighbour for every job, rounded to 2 decimals
match_distances = np.round(distances[:, 0], 2)
matches = pd.DataFrame({'Match confidence': match_distances})

# Assign by position (plain array) so the values line up with the rows that
# were queried even if `df` does not carry a default 0..n-1 index.
df['match'] = match_distances

# Ascending sort puts the smallest distances (closest jobs) first
df1 = df.sort_values('match')
df2 = df1[['Position', 'Company', 'Location', 'url']].head(10).reset_index()

# Clean up Location column: strip every non-ASCII character. This already
# removes mojibake sequences, so no further replacement is needed.
df2['Location'] = df2['Location'].str.replace(r'[^\x00-\x7F]', '', regex=True)

# Display the top 10 matched jobs
df2


Unnamed: 0,index,Position,Company,Location,url
0,0,Android App Developer Intern,Constems-AI,Noida,https://www.glassdoor.co.in/job-listing/andoid...
1,2,ReactJS Developer Intern,Constems-AI,Noida,https://www.glassdoor.co.in/job-listing/reactj...
2,1,Backend Developer Intern,INTERNATIONAL YOUTH EDU-SKILLS FOUNDATION,Noida,https://www.glassdoor.co.in/job-listing/backen...


In [9]:
# Distinct locations among the matched jobs, sorted
# (e.g. to populate a dropdown in an application)
dropdown_locations = sorted(df2['Location'].unique())

# One dict per matched job, in the same row order as df2
job_list = [
    {
        'Position': position,
        'Company': company,
        'Location': location,
        'Apply Link': url,
    }
    for position, company, location, url in zip(
        df2['Position'], df2['Company'], df2['Location'], df2['url']
    )
]

# Display the list of job matches
job_list


[{'Position': 'Backend Developer Intern',
  'Company': 'INTERNATIONAL YOUTH EDU-SKILLS FOUNDATION',
  'Location': 'Noida',
  'Apply Link': 'https://www.glassdoor.co.in/job-listing/backend-developer-intern-international-youth-edu-skills-foundation-JV_IC4477468_KO0,24_KE25,66.htm?jl=1009377074499'},
 {'Position': 'Android App Developer Intern',
  'Company': 'Constems-AI',
  'Location': 'Noida',
  'Apply Link': 'https://www.glassdoor.co.in/job-listing/andoid-app-developer-intern-constems-ai-JV_IC4477468_KO0,27_KE28,39.htm?jl=1009485312339'},
 {'Position': 'ReactJS Developer Intern',
  'Company': 'Constems-AI',
  'Location': 'Noida',
  'Apply Link': 'https://www.glassdoor.co.in/job-listing/reactjs-developer-intern-constems-ai-JV_IC4477468_KO0,24_KE25,36.htm?jl=1009395932098'}]