In [11]:
import pandas as pd 
import re
import string

In [12]:
df=pd.read_csv("resume_job_dataset.csv")

In [13]:
df.shape

(10000, 3)

## This is how our dataset looks like 

In [15]:
df.head()

Unnamed: 0,job_description,resume,match_score
0,"Data Analyst needed with experience in SQL, Ex...","Experienced professional skilled in sql, excel...",4
1,Data Scientist needed with experience in Stati...,Experienced professional skilled in statistics...,4
2,Software Engineer needed with experience in Sy...,Experienced professional skilled in system des...,5
3,"ML Engineer needed with experience in Python, ...","Experienced professional skilled in python, co...",4
4,Software Engineer needed with experience in RE...,"Experienced professional skilled in rest apis,...",5


### Text Cleaning Function
The `clean_text(df)` function converts all text in job descriptions and resumes to lowercase, removes punctuation, and replaces multiple spaces or newlines with a single space. This ensures the text data is clean and uniform for further processing.

In [16]:
def clean_text(df):
    #extracting job descriptions and resumes from dataframe
    job_desc=df["job_description"]
    resume=df["resume"]
    
    #converting all text to lowercase 
    job_desc_lower=[t.lower() for t in job_desc]
    resume_lower=[t.lower() for t in resume]

    #removing punctuation by replacing it with spaces
    pattern = r'[{}]'.format(re.escape(string.punctuation))
    job_desc_clean = [re.sub(pattern, ' ', t) for t in job_desc_lower]
    resume_clean   = [re.sub(pattern, ' ', t) for t in resume_lower]

    #replacing  multiple spaces/tabs/newlines with a single space and trim ends
    pattern=r'\s+'
    job_desc_clean = [re.sub(pattern, ' ', t).strip() for t in job_desc_clean]
    resume_clean   = [re.sub(pattern, ' ', t).strip() for t in resume_clean]

    df["job_description"]=job_desc_clean
    df["resume"]=resume_clean
    return df 

In [17]:
df=clean_text(df)

### Tokenization and Stopword Removal
The `tokenize_and_remove_stopwords(df)` function tokenizes job descriptions and resumes into individual words, removes English stopwords, and keeps only alphabetic tokens. This step helps focus on meaningful words for text analysis or model training.


In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

import nltk
nltk.download("punkt")
nltk.download("punkt_tab")   
nltk.download("stopwords")
nltk.download("wordnet")

def tokenize_and_remove_stopwords(df):

    job_desc=df["job_description"]
    resume=df["resume"]
  
    tokens_job_desc = [word_tokenize(t) for t in job_desc]
    tokens_resume   = [word_tokenize(t) for t in resume]

    # load English stopwords
    stop_words = set(stopwords.words("english"))

    # filter out stopwords + keep only alphabetic tokens
    filtered_job_desc = [[w for w in tokens if w.isalpha() and w not in stop_words]
                         for tokens in tokens_job_desc]
    filtered_resume   = [[w for w in tokens if w.isalpha() and w not in stop_words]
                         for tokens in tokens_resume]

    return filtered_job_desc, filtered_resume

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Lemmatization of Tokens
The `lemmatize_tokens(job_tokens, resume_tokens)` function applies lemmatization to tokenized job descriptions and resumes, converting words to their base form. This reduces word variations and helps in consistent text analysis.


In [19]:
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(job_tokens, resume_tokens):
   
    lemmatized_job_desc = [[lemmatizer.lemmatize(w) for w in tokens]
                           for tokens in job_tokens]
    lemmatized_resume   = [[lemmatizer.lemmatize(w) for w in tokens]
                           for tokens in resume_tokens]
    
    return lemmatized_job_desc, lemmatized_resume

In [20]:
l1,l2=tokenize_and_remove_stopwords(df)

In [21]:
l3,l4=lemmatize_tokens(l1,l2)


In [23]:
df["job_description"]=l4
df["resume"]=l3

## final dataset after removing all stop wrods converting all text into lower case and then doing  Lemmatization

In [25]:
df.head()

Unnamed: 0,job_description,resume,match_score
0,"[experienced, professional, skilled, sql, exce...","[data, analyst, needed, experience, sql, excel...",4
1,"[experienced, professional, skilled, statistic...","[data, scientist, needed, experience, statisti...",4
2,"[experienced, professional, skilled, system, d...","[software, engineer, needed, experience, syste...",5
3,"[experienced, professional, skilled, python, c...","[ml, engineer, needed, experience, python, com...",4
4,"[experienced, professional, skilled, rest, api...","[software, engineer, needed, experience, rest,...",5


##  Since machine learning models cannot process raw text directly, we convert the cleaned text into numerical representations that the models can understand. This feature conversion is implemented in the convert_features.ipynb .