In [None]:
import pandas as pd

# Path to the CSV file on Google Drive.
# NOTE(review): no drive.mount() call is visible in this notebook — presumably
# Drive is mounted before this cell runs; confirm.
file_path = '/content/drive/MyDrive/Term 3 NLP/NLP ATS Project/db1.csv'

# Read the CSV file into the working DataFrame used by every later cell
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,ID,Resume_str,Category,job_id,title,description,ATS_score
0,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3905367422,Trademark Attorney,Junior Trademark Associate\nOur client is a to...,21
1,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3887888322,Delivery Driver / CDL A required / Seasonal,PBNA $25.75 / hour\n\nCLICK HERE to view our D...,24
2,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3905243094,Senior Recruiter,Hit a glass ceiling in your earning potential ...,47
3,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3905323971,Order Fulfillment Coordinator,Are you ready to be a crucial part of our dyna...,51
4,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3903830212,Medical Assistant Urgent Care Per Diem,"As a physician-founded and led organization, e...",38


# Data Cleaning

In [None]:
import re

def clean_text(text):
    """Strip URLs, e-mail addresses, IP addresses, hashtags and special characters.

    Args:
        text: Raw text. ``None`` and NaN (what pandas yields for an empty CSV
            cell) are treated as empty text; any other non-string value is
            converted with ``str()``.

    Returns:
        The cleaned string. Special characters are replaced by a space rather
        than deleted, so words joined by punctuation ("physician-founded",
        "and/or") stay separate tokens instead of being fused together.
    """
    # Missing values arrive as None or float NaN; NaN is the only value != itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove URLs ("https..." is already covered by the "http\S+" alternative)
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove emails
    text = re.sub(r'\S+@\S+', '', text)
    # Remove IP addresses
    text = re.sub(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Replace every other special character with a space so neighbouring words
    # are not merged into one token
    text = re.sub(r'[^A-Za-z0-9\s]+', ' ', text)
    return text

# Run clean_text over both raw text columns, storing each result in its
# cleaned_* counterpart
for raw_col, cleaned_col in (
    ('Resume_str', 'cleaned_resume'),
    ('description', 'cleaned_description'),
):
    df[cleaned_col] = df[raw_col].apply(clean_text)

# Display the cleaned columns
df[['cleaned_resume', 'cleaned_description']].head()


Unnamed: 0,cleaned_resume,cleaned_description
0,SENIOR INFORMATION TECHNOLOGY MANAGER...,Junior Trademark Associate\nOur client is a to...
1,SENIOR INFORMATION TECHNOLOGY MANAGER...,PBNA 2575 hour\n\nCLICK HERE to view our Driv...
2,SENIOR INFORMATION TECHNOLOGY MANAGER...,Hit a glass ceiling in your earning potential ...
3,SENIOR INFORMATION TECHNOLOGY MANAGER...,Are you ready to be a crucial part of our dyna...
4,SENIOR INFORMATION TECHNOLOGY MANAGER...,As a physicianfounded and led organization ens...


# Handle Slang, Emoji, and Abbreviations

In [None]:
!pip install emot
from emot.emo_unicode import UNICODE_EMOJI

# Function to handle emojis
def replace_emojis(text):
    """Substitute every emoji found in ``text`` with its ``UNICODE_EMOJI`` entry.

    Each key of ``UNICODE_EMOJI`` is looked for in turn and all of its
    occurrences are replaced by the mapped value; the resulting string is
    returned.
    """
    for symbol, label in UNICODE_EMOJI.items():
        text = text.replace(symbol, label)
    return text

# Slang and abbreviations dictionary (keys are lowercase; lookups lowercase each word)
# NOTE(review): single letters can be genuine tokens in resumes (e.g. the "R"
# language) — confirm these expansions are wanted for this corpus.
slang_dict = {
    "u": "you",
    "r": "are",
}

def replace_slang(text):
    """Expand slang words in ``text`` using ``slang_dict``.

    The text is split on whitespace, every word is looked up case-insensitively,
    and the words are re-joined with single spaces, so any run of whitespace
    (newlines included) collapses to one space.
    """
    expanded = []
    for word in text.split():
        expanded.append(slang_dict.get(word.lower(), word))
    return ' '.join(expanded)

# Apply the functions to the cleaned columns: emoji replacement first, then slang
# NOTE(review): clean_text has already removed every character outside
# A-Za-z0-9 and whitespace, so replace_emojis presumably finds nothing here — confirm.
for column in ('cleaned_resume', 'cleaned_description'):
    df[column] = df[column].apply(lambda text: replace_slang(replace_emojis(text)))

# Display the processed columns
df[['cleaned_resume', 'cleaned_description']].head()


Collecting emot
  Downloading emot-3.1-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m734.1 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emot
Successfully installed emot-3.1


Unnamed: 0,cleaned_resume,cleaned_description
0,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,Junior Trademark Associate Our client is a top...
1,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,PBNA 2575 hour CLICK HERE to view our Driver J...
2,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,Hit a glass ceiling in your earning potential ...
3,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,Are you ready to be a crucial part of our dyna...
4,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,As a physicianfounded and led organization ens...


# Named Entity Recognition (NER)

In [None]:
!pip install spacy
import spacy

# Load the spacy model; `nlp` is the shared pipeline that ner() calls below.
# NOTE(review): `pip install spacy` alone does not download 'en_core_web_sm' —
# presumably the model is preinstalled in this environment; confirm.
nlp = spacy.load('en_core_web_sm')

def ner(text):
    """Return the named entities spaCy finds in ``text``.

    The result is a list of ``(entity_text, entity_label)`` tuples, in the
    order the entities appear in the processed document.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]

# Apply NER to both cleaned text columns
for text_col, entity_col in (
    ('cleaned_resume', 'resume_entities'),
    ('cleaned_description', 'description_entities'),
):
    df[entity_col] = df[text_col].apply(ner)

# Display entities
df[['resume_entities', 'description_entities']].head()




Unnamed: 0,resume_entities,description_entities
0,"[(Information Technology, ORG), (years, DATE),...","[(New York, GPE), (Silicon Valley, LOC), (San ..."
1,"[(Information Technology, ORG), (years, DATE),...","[(2575 hour, TIME), (Driver Job Preview Video ..."
2,"[(Information Technology, ORG), (years, DATE),...","[(Break, GPE), (67 consecutive years, DATE), (..."
3,"[(Information Technology, ORG), (years, DATE),...","[(Order Fulfillment Coordinator, ORG), (daily,..."
4,"[(Information Technology, ORG), (years, DATE),...","[(120, PRODUCT), (1 million, CARDINAL), (todat..."


# Spell Correction (Avoid NER)

In [None]:
# Spell correction is kept switched off by default; set the flag to True to run it.
# This cell used to be disabled by wrapping it in a ''' string that was never
# closed, which is a SyntaxError as soon as the cell is executed.
# NOTE(review): the section heading suggests the step is skipped to avoid
# altering named entities — confirm before enabling.
# To enable, install the dependency first:  %pip install textblob
RUN_SPELL_CORRECTION = False

def correct_spelling(text):
    """Return ``text`` with TextBlob's spelling correction applied.

    TextBlob is imported inside the function so the notebook still runs without
    the package installed while spell correction is disabled.
    """
    from textblob import TextBlob
    return str(TextBlob(text).correct())

if RUN_SPELL_CORRECTION:
    # Apply spell correction
    df['corrected_resume'] = df['cleaned_resume'].apply(correct_spelling)
    df['corrected_description'] = df['cleaned_description'].apply(correct_spelling)

    # Display the corrected columns
    display(df[['corrected_resume', 'corrected_description']].head())




# Handle Contractions and Negation

In [None]:
# Contraction expansion is kept switched off by default; set the flag to True to run it.
# This cell used to be disabled by wrapping it in a ''' string that was never
# closed, which is a SyntaxError as soon as the cell is executed.
# To enable, install the dependency first:  %pip install contractions
RUN_CONTRACTION_EXPANSION = False

def expand_contractions(text):
    """Return ``text`` with contractions expanded via ``contractions.fix``.

    The package is imported inside the function so the notebook still runs
    without it installed while this step is disabled.
    """
    import contractions
    return contractions.fix(text)

if RUN_CONTRACTION_EXPANSION:
    # Prefer the spell-corrected text when that optional step has produced it;
    # otherwise fall back to the cleaned text so no missing column is accessed.
    resume_source = 'corrected_resume' if 'corrected_resume' in df.columns else 'cleaned_resume'
    description_source = (
        'corrected_description' if 'corrected_description' in df.columns else 'cleaned_description'
    )

    # Apply contraction expansion
    df['expanded_resume'] = df[resume_source].apply(expand_contractions)
    df['expanded_description'] = df[description_source].apply(expand_contractions)

    # Display the expanded columns
    display(df[['expanded_resume', 'expanded_description']].head())


# Handle Punctuation with Regex

In [None]:
def remove_punctuation(text):
    """Drop every character that is neither a word character nor whitespace.

    Word characters (letters, digits, underscore) and all whitespace are kept
    exactly as they are.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

# Apply punctuation removal to both cleaned text columns
for text_col, stripped_col in (
    ('cleaned_resume', 'no_punctuation_resume'),
    ('cleaned_description', 'no_punctuation_description'),
):
    df[stripped_col] = df[text_col].apply(remove_punctuation)

# Display the no punctuation columns
df[['no_punctuation_resume', 'no_punctuation_description']].head()


Unnamed: 0,no_punctuation_resume,no_punctuation_description
0,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,Junior Trademark Associate Our client is a top...
1,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,PBNA 2575 hour CLICK HERE to view our Driver J...
2,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,Hit a glass ceiling in your earning potential ...
3,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,Are you ready to be a crucial part of our dyna...
4,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,As a physicianfounded and led organization ens...


# Tokenization


In [None]:
import nltk
# Fetch the tokenizer data word_tokenize relies on. Recent NLTK releases look
# for the 'punkt_tab' resource instead of the older pickled 'punkt' models, so
# both are requested; nltk.download reports (rather than raises on) a resource
# the installed version does not know about.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)
from nltk.tokenize import word_tokenize

def tokenize(text):
    """Split ``text`` into a list of word tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Apply tokenization to both punctuation-free text columns
for text_col, token_col in (
    ('no_punctuation_resume', 'tokenized_resume'),
    ('no_punctuation_description', 'tokenized_description'),
):
    df[token_col] = df[text_col].apply(tokenize)

# Display the tokenized columns
df[['tokenized_resume', 'tokenized_description']].head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,tokenized_resume,tokenized_description
0,"[SENIOR, INFORMATION, TECHNOLOGY, MANAGER, Exe...","[Junior, Trademark, Associate, Our, client, is..."
1,"[SENIOR, INFORMATION, TECHNOLOGY, MANAGER, Exe...","[PBNA, 2575, hour, CLICK, HERE, to, view, our,..."
2,"[SENIOR, INFORMATION, TECHNOLOGY, MANAGER, Exe...","[Hit, a, glass, ceiling, in, your, earning, po..."
3,"[SENIOR, INFORMATION, TECHNOLOGY, MANAGER, Exe...","[Are, you, ready, to, be, a, crucial, part, of..."
4,"[SENIOR, INFORMATION, TECHNOLOGY, MANAGER, Exe...","[As, a, physicianfounded, and, led, organizati..."


In [None]:
# Save the processed DataFrame to a new CSV file (row index is not written).
# NOTE(review): the entity and token columns hold Python lists; CSV stores only
# their string form, so they presumably need parsing (e.g. ast.literal_eval)
# after being read back — confirm.
output_file_path = '/content/drive/MyDrive/Term 3 NLP/NLP ATS Project/processed_db1.csv'
df.to_csv(output_file_path, index=False)


In [None]:
import pandas as pd
from google.colab import drive


# Path to the saved processed CSV file
processed_file_path = '/content/drive/MyDrive/Term 3 NLP/NLP ATS Project/processed_db1.csv'

# Read the processed CSV file
# NOTE(review): the frame is bound to df_processed, but the later cells visible
# in this notebook keep operating on the in-memory `df` — confirm which one is
# meant to feed the remaining steps.
df_processed = pd.read_csv(processed_file_path)

# Display the first few rows of the processed dataframe
df_processed.head()


Unnamed: 0,ID,Resume_str,Category,job_id,title,description,ATS_score,cleaned_resume,cleaned_description,resume_entities,description_entities,no_punctuation_resume,no_punctuation_description,tokenized_resume,tokenized_description
0,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3905367422,Trademark Attorney,Junior Trademark Associate\nOur client is a to...,21,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,Junior Trademark Associate Our client is a top...,"[('Information Technology', 'ORG'), ('years', ...","[('New York', 'GPE'), ('Silicon Valley', 'LOC'...",SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,Junior Trademark Associate Our client is a top...,"['SENIOR', 'INFORMATION', 'TECHNOLOGY', 'MANAG...","['Junior', 'Trademark', 'Associate', 'Our', 'c..."
1,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3887888322,Delivery Driver / CDL A required / Seasonal,PBNA $25.75 / hour\n\nCLICK HERE to view our D...,24,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,PBNA 2575 hour CLICK HERE to view our Driver J...,"[('Information Technology', 'ORG'), ('years', ...","[('2575 hour', 'TIME'), ('Driver Job Preview V...",SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,PBNA 2575 hour CLICK HERE to view our Driver J...,"['SENIOR', 'INFORMATION', 'TECHNOLOGY', 'MANAG...","['PBNA', '2575', 'hour', 'CLICK', 'HERE', 'to'..."
2,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3905243094,Senior Recruiter,Hit a glass ceiling in your earning potential ...,47,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,Hit a glass ceiling in your earning potential ...,"[('Information Technology', 'ORG'), ('years', ...","[('Break', 'GPE'), ('67 consecutive years', 'D...",SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,Hit a glass ceiling in your earning potential ...,"['SENIOR', 'INFORMATION', 'TECHNOLOGY', 'MANAG...","['Hit', 'a', 'glass', 'ceiling', 'in', 'your',..."
3,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3905323971,Order Fulfillment Coordinator,Are you ready to be a crucial part of our dyna...,51,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,Are you ready to be a crucial part of our dyna...,"[('Information Technology', 'ORG'), ('years', ...","[('Order Fulfillment Coordinator', 'ORG'), ('d...",SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,Are you ready to be a crucial part of our dyna...,"['SENIOR', 'INFORMATION', 'TECHNOLOGY', 'MANAG...","['Are', 'you', 'ready', 'to', 'be', 'a', 'cruc..."
4,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3903830212,Medical Assistant Urgent Care Per Diem,"As a physician-founded and led organization, e...",38,SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,As a physicianfounded and led organization ens...,"[('Information Technology', 'ORG'), ('years', ...","[('120', 'PRODUCT'), ('1 million', 'CARDINAL')...",SENIOR INFORMATION TECHNOLOGY MANAGER Executiv...,As a physicianfounded and led organization ens...,"['SENIOR', 'INFORMATION', 'TECHNOLOGY', 'MANAG...","['As', 'a', 'physicianfounded', 'and', 'led', ..."


# Parsing (Word Substring Extraction)

In [None]:
import spacy

# Load the spacy model; rebinds `nlp`, which parse_text() and lemmatize() call.
# NOTE(review): the same model is already loaded in the NER cell earlier in the
# notebook — presumably repeated so this part can run on its own; confirm.
nlp = spacy.load('en_core_web_sm')

def parse_text(text):
    """Run the spaCy pipeline on ``text`` and describe every token.

    Returns a list with one dict per token holding its text, lemma, ``pos_``
    and ``tag_`` part-of-speech values, dependency label, shape, and the
    ``is_alpha`` / ``is_stop`` flags.
    """
    token_records = []
    for token in nlp(text):
        token_records.append({
            'text': token.text,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'tag': token.tag_,
            'dep': token.dep_,
            'shape': token.shape_,
            'is_alpha': token.is_alpha,
            'is_stop': token.is_stop,
        })
    return token_records

# Apply parsing to both punctuation-free text columns
for text_col, parsed_col in (
    ('no_punctuation_resume', 'parsed_resume'),
    ('no_punctuation_description', 'parsed_description'),
):
    df[parsed_col] = df[text_col].apply(parse_text)

# Display parsed columns
df[['parsed_resume', 'parsed_description']].head()


Unnamed: 0,parsed_resume,parsed_description
0,"[{'text': 'SENIOR', 'lemma': 'senior', 'pos': ...","[{'text': 'Junior', 'lemma': 'Junior', 'pos': ..."
1,"[{'text': 'SENIOR', 'lemma': 'senior', 'pos': ...","[{'text': 'PBNA', 'lemma': 'PBNA', 'pos': 'PRO..."
2,"[{'text': 'SENIOR', 'lemma': 'senior', 'pos': ...","[{'text': 'Hit', 'lemma': 'hit', 'pos': 'VERB'..."
3,"[{'text': 'SENIOR', 'lemma': 'senior', 'pos': ...","[{'text': 'Are', 'lemma': 'be', 'pos': 'AUX', ..."
4,"[{'text': 'SENIOR', 'lemma': 'senior', 'pos': ...","[{'text': 'As', 'lemma': 'as', 'pos': 'ADP', '..."


# Remove Stop Words

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# English stop words from NLTK, held in a set for membership tests
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens whose lowercase form is not in ``stop_words``.

    Surviving tokens keep their original casing and order.
    """
    kept = []
    for word in tokens:
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return kept

# Apply stop word removal to both token columns
for token_col, filtered_col in (
    ('tokenized_resume', 'no_stopwords_resume'),
    ('tokenized_description', 'no_stopwords_description'),
):
    df[filtered_col] = df[token_col].apply(remove_stopwords)

# Display no stopwords columns
df[['no_stopwords_resume', 'no_stopwords_description']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,no_stopwords_resume,no_stopwords_description
0,"[SENIOR, INFORMATION, TECHNOLOGY, MANAGER, Exe...","[Junior, Trademark, Associate, client, top, fi..."
1,"[SENIOR, INFORMATION, TECHNOLOGY, MANAGER, Exe...","[PBNA, 2575, hour, CLICK, view, Driver, Job, P..."
2,"[SENIOR, INFORMATION, TECHNOLOGY, MANAGER, Exe...","[Hit, glass, ceiling, earning, potential, inte..."
3,"[SENIOR, INFORMATION, TECHNOLOGY, MANAGER, Exe...","[ready, crucial, part, dynamic, team, seeking,..."
4,"[SENIOR, INFORMATION, TECHNOLOGY, MANAGER, Exe...","[physicianfounded, led, organization, ensuring..."


# Lemmatization

In [None]:
def lemmatize(tokens):
    """Lemmatize a list of tokens with spaCy.

    The tokens are joined with single spaces and passed through ``nlp``; the
    lemma of every token spaCy produces is returned. spaCy tokenizes the joined
    string itself, so the output is not guaranteed to line up one-to-one with
    ``tokens``.
    """
    joined = " ".join(tokens)
    lemmas = []
    for token in nlp(joined):
        lemmas.append(token.lemma_)
    return lemmas

# Apply lemmatization to both stop-word-free token columns
for token_col, lemma_col in (
    ('no_stopwords_resume', 'lemmatized_resume'),
    ('no_stopwords_description', 'lemmatized_description'),
):
    df[lemma_col] = df[token_col].apply(lemmatize)

# Display lemmatized columns
df[['lemmatized_resume', 'lemmatized_description']].head()


Unnamed: 0,lemmatized_resume,lemmatized_description
0,"[senior, information, TECHNOLOGY, MANAGER, Exe...","[Junior, Trademark, Associate, client, top, fi..."
1,"[senior, information, TECHNOLOGY, MANAGER, Exe...","[PBNA, 2575, hour, CLICK, view, Driver, Job, P..."
2,"[senior, information, TECHNOLOGY, MANAGER, Exe...","[hit, glass, ceiling, earn, potential, interna..."
3,"[senior, information, TECHNOLOGY, MANAGER, Exe...","[ready, crucial, part, dynamic, team, seek, ta..."
4,"[senior, information, TECHNOLOGY, MANAGER, Exe...","[physicianfounde, lead, organization, ensure, ..."


# Lowercase

In [None]:
def to_lowercase(tokens):
    """Return a new list containing the lowercase form of every token."""
    lowered = []
    for word in tokens:
        lowered.append(word.lower())
    return lowered

# Apply lowercase to both lemmatized token columns
for lemma_col, lowered_col in (
    ('lemmatized_resume', 'lowercase_resume'),
    ('lemmatized_description', 'lowercase_description'),
):
    df[lowered_col] = df[lemma_col].apply(to_lowercase)

# Display lowercase columns
df[['lowercase_resume', 'lowercase_description']].head()


Unnamed: 0,lowercase_resume,lowercase_description
0,"[senior, information, technology, manager, exe...","[junior, trademark, associate, client, top, fi..."
1,"[senior, information, technology, manager, exe...","[pbna, 2575, hour, click, view, driver, job, p..."
2,"[senior, information, technology, manager, exe...","[hit, glass, ceiling, earn, potential, interna..."
3,"[senior, information, technology, manager, exe...","[ready, crucial, part, dynamic, team, seek, ta..."
4,"[senior, information, technology, manager, exe...","[physicianfounde, lead, organization, ensure, ..."


# N-grams

In [None]:
from nltk import ngrams

def generate_ngrams(tokens, n):
    """Return the n-grams of ``tokens`` as a list, materialised from NLTK's ``ngrams``."""
    return [gram for gram in ngrams(tokens, n)]

# Apply bigrams and trigrams; columns are created in the same order as before
# (resume bigrams/trigrams first, then description bigrams/trigrams)
for target_col, source_col, size in (
    ('bigrams_resume', 'lowercase_resume', 2),
    ('trigrams_resume', 'lowercase_resume', 3),
    ('bigrams_description', 'lowercase_description', 2),
    ('trigrams_description', 'lowercase_description', 3),
):
    df[target_col] = df[source_col].apply(lambda x, size=size: generate_ngrams(x, size))

# Display n-grams columns
df[['bigrams_resume', 'trigrams_resume', 'bigrams_description', 'trigrams_description']].head()


Unnamed: 0,bigrams_resume,trigrams_resume,bigrams_description,trigrams_description
0,"[(senior, information), (information, technolo...","[(senior, information, technology), (informati...","[(junior, trademark), (trademark, associate), ...","[(junior, trademark, associate), (trademark, a..."
1,"[(senior, information), (information, technolo...","[(senior, information, technology), (informati...","[(pbna, 2575), (2575, hour), (hour, click), (c...","[(pbna, 2575, hour), (2575, hour, click), (hou..."
2,"[(senior, information), (information, technolo...","[(senior, information, technology), (informati...","[(hit, glass), (glass, ceiling), (ceiling, ear...","[(hit, glass, ceiling), (glass, ceiling, earn)..."
3,"[(senior, information), (information, technolo...","[(senior, information, technology), (informati...","[(ready, crucial), (crucial, part), (part, dyn...","[(ready, crucial, part), (crucial, part, dynam..."
4,"[(senior, information), (information, technolo...","[(senior, information, technology), (informati...","[(physicianfounde, lead), (lead, organization)...","[(physicianfounde, lead, organization), (lead,..."


In [None]:
# Save the processed DataFrame to a new CSV file (row index is not written).
# NOTE(review): list-valued columns (tokens, lemmas, n-grams, parsed tokens) are
# written as their string form — presumably they must be parsed again after
# loading; confirm.
output_file_path = '/content/drive/MyDrive/Term 3 NLP/NLP ATS Project/processed_db1_v2.csv'
df.to_csv(output_file_path, index=False)
