In [1]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import string
import pandas as pd

# Download required NLTK data.
# Each call fetches one resource into the local nltk_data directory; a resource
# that is already present is reported as up-to-date (see the log below the cell).
# NOTE(review): newer NLTK releases reportedly look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the names used here — confirm
# against the installed NLTK version if tokenizing or tagging raises LookupError.
nltk.download('stopwords')  # stop-word lists, read later via stopwords.words('english')
nltk.download('punkt')  # tokenizer models; presumably what word_tokenize relies on
nltk.download('wordnet')  # WordNet database; presumably what WordNetLemmatizer relies on
nltk.download('averaged_perceptron_tagger')  # tagger model; presumably what nltk.pos_tag relies on
nltk.download('omw-1.4')  # Open Multilingual Wordnet data; presumably needed alongside 'wordnet' — confirm


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lintaromiyashin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lintaromiyashin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lintaromiyashin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lintaromiyashin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/lintaromiyashin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Map a single word to the WordNet part-of-speech constant used for lemmatization
def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``.

    The word is tagged on its own (as a one-element sentence) with
    ``nltk.pos_tag``; only the first letter of the resulting tag is used.

    Args:
        word: The token to classify.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``
        when the tag starts with J, N, V or R respectively; ``wordnet.NOUN``
        for every other tag.
    """
    # pos_tag returns a list of (token, tag) pairs; there is exactly one here.
    _, tagger_tag = nltk.pos_tag([word])[0]
    initial = tagger_tag[0].upper()

    pos_by_initial = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    # Anything that is not an adjective, noun, verb or adverb is treated as a noun.
    return wordnet.NOUN


In [3]:
# Function to find the top keywords (10 by default) from a job description text
def find_top_keywords(text, top_n=10):
    """Return the most frequent meaningful words of a job description.

    The text is tokenized, lower-cased, reduced to purely alphabetic tokens,
    lemmatized with a POS-aware WordNet lemmatizer, and stripped of English
    stop words plus a small set of generic job-posting words.

    Args:
        text: The job description. Any non-string value (for example the NaN
            pandas produces for an empty CSV cell) or a blank string yields
            an empty result instead of an error.
        top_n: How many keywords to return; defaults to 10.

    Returns:
        A single string with up to ``top_n`` lemmas, most frequent first,
        joined by ", ". An empty string when there is nothing to report.
    """
    # Guard first: the tokenizer only accepts strings, and missing values in a
    # DataFrame column arrive here as float NaN.
    if not isinstance(text, str) or not text.strip():
        return ""

    stop_words = set(stopwords.words('english'))
    # Words that often appear in any job description but carry no impactful meaning.
    custom_words = {"get", "team", "work", "help", "responsibility", "role", "experience"}
    ignored_words = stop_words | custom_words

    # Tokenize and lower-case; isalpha() drops punctuation and numeric tokens.
    tokens = [token.lower() for token in word_tokenize(text)]
    # Filter on the raw token BEFORE lemmatizing: some stop words lemmatize to
    # a form that is no longer in the stop-word list (e.g. "us" can become "u"),
    # and skipping them here also avoids POS-tagging words that get discarded.
    tokens = [token for token in tokens if token.isalpha() and token not in ignored_words]

    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    # Filter again AFTER lemmatizing so inflected forms ("teams", "working")
    # that collapse onto an ignored word are removed as well.
    lemmas = [lemma for lemma in lemmas if lemma not in ignored_words]

    # Count lemma frequencies and keep the most common ones.
    top_keywords = Counter(lemmas).most_common(top_n)

    # Format the keywords as a comma-separated string.
    return ', '.join(word for word, _ in top_keywords)

In [4]:
# Function to process the CSV file
def process_csv(input_file_path, output_file_path,
                text_column='Job Description', keywords_column='Top 10 Keywords'):
    """Add a keywords column to a CSV of job postings and save the result.

    Args:
        input_file_path: Path of the CSV file to read.
        output_file_path: Path the augmented CSV is written to (without the
            DataFrame index).
        text_column: Name of the column holding the description text;
            defaults to 'Job Description'.
        keywords_column: Name of the column that receives the keyword
            string; defaults to 'Top 10 Keywords'.

    Raises:
        KeyError: If ``text_column`` is not present in the input file.
    """
    # Read the CSV file
    df = pd.read_csv(input_file_path)

    # Empty cells are read as NaN (a float). Replace them with '' and coerce
    # everything to str so each value handed to find_top_keywords is text;
    # the original description column itself is left untouched.
    descriptions = df[text_column].fillna('').astype(str)

    # Process each job description to find its top keywords
    df[keywords_column] = descriptions.apply(find_top_keywords)

    # Write the updated DataFrame to the new CSV file
    df.to_csv(output_file_path, index=False)


In [5]:
# Example usage: read 'joblist.csv' and write a copy with the keywords column added
input_file_path, output_file_path = (
    'joblist.csv',               # input CSV file path
    'joblist_withkeywords.csv',  # output CSV file path
)
process_csv(input_file_path=input_file_path, output_file_path=output_file_path)