# Data Loading and Cleaning

In [37]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Ensure you have the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
df = pd.read_csv('job_descriptions.csv')  # Replace 'job_descriptions.csv' with the actual file path
df = df.head(20000)

# Preprocess the data
df['skills'] = df['skills'].apply(lambda x: x.split(','))

# Function to remove punctuation and stopwords
def preprocess_text(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    
    # Join the tokens back into a string
    return ' '.join(filtered_tokens)

# Apply text preprocessing to the Job Description column
df['Job Description'] = df['Job Description'].apply(preprocess_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Job Description'], df['skills'], test_size=0.2, random_state=42)

# Initialize the CountVectorizer for text feature extraction / tokenization
vectorizer = CountVectorizer()

# Initialize the MultiLabelBinarizer for encoding the skills
mlb = MultiLabelBinarizer()

# Transform the skills sets using MultiLabelBinarizer
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

# Define the LogisticRegression model wrapped in MultiOutputClassifier for multi-label classification
model = MultiOutputClassifier(LogisticRegression(solver='lbfgs'))

# Create a pipeline that first vectorizes the text and then applies the classifier
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', model)
])

# Train the model
pipeline.fit(X_train, X_test)

# Predict the skills for the test set
y_pred = pipeline.predict(y_test)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

[nltk_data] Downloading package punkt to /Users/seby/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/seby/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: y must have at least two dimensions for multi-output regression but has only one.

In [36]:
# Example of predicting skills for a new job description
new_job_description = preprocess_text(df["Job Description"].iloc[0])
new_job_description = "We are looking for a front-end software engineer with experience in Python, Java, and C++, leadership, and marketing."

#  expected_skills = df["skills"].iloc[0]
print(f"Job Description: {new_job_description}")
# print(f"Expected skills: {expected_skills}")
predicted_skills = pipeline.predict([new_job_description])
print(f'Predicted Skills: {mlb.inverse_transform(predicted_skills)}')

Job Description: We are looking for a front-end software engineer with experience in Python, Java, and C++, leadership, and marketing.
Predicted Skills: [()]


In [27]:
import pandas as pd
import spacy
from spacy.training import Example
import random

# Load your dataset
df = pd.read_csv('job_descriptions.csv').head(20000)

# Preprocess the dataset
# Convert the Skills column into a list
df['skills'] = df['skills'].apply(lambda x: x.split(','))

# Function to preprocess the data into Spacy's required format
def preprocess_data(df):
    data = []
    for index, row in df.iterrows():
        entities = []
        for skill in row['skills']:
            if skill in row['Job Description']:
                start = row['Job Description'].index(skill)
                end = start + len(skill)
                entities.append((start, end, 'SKILL'))
        if entities:
            data.append((row['Job Description'], {'entities': entities}))
    return data

# Convert the dataset
TRAIN_DATA = preprocess_data(df)

# Load a blank Spacy model
nlp = spacy.blank('en')

# Create a new NER pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

# Add the label 'SKILL' to the NER
ner.add_label('SKILL')

# Disable other pipelines during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# Training the NER model
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
        print(losses)

# Save the trained model
nlp.to_disk('ner_skill_model')



{'ner': 90.60213024474773}
{'ner': 2.1948040794394566e-06}
{'ner': 2.3259453398928506e-08}
{'ner': 6.63459987229098e-07}
{'ner': 7.02590685384092e-08}
{'ner': 7.20473847071517e-09}
{'ner': 6.127857370880504e-10}
{'ner': 1.51065689790898e-06}
{'ner': 6.353318021424173e-08}
{'ner': 1.4642113375719021e-09}


In [29]:
# Load the trained model
nlp = spacy.load('ner_skill_model')

# New job description
example = df["Job Description"].iloc[0]
expected_skills = df["skills"].iloc[0]
doc = nlp(example)
print(f"Job Description: {example}")
print(f"Expected skills: {expected_skills}")

# Print detected skills
print("Detected skills:")
for ent in doc.ents:
    if ent.label_ == 'SKILL':
        print(ent.text)

Job Description: Social Media Managers oversee an organizations social media presence. They create and schedule content, engage with followers, and analyze social media metrics to drive brand awareness and engagement.
Expected skills: ['Social media platforms (e.g.', ' Facebook', ' Twitter', ' Instagram) Content creation and scheduling Social media analytics and insights Community engagement Paid social advertising']
Detected skills:
