Combine and Clean Text

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus and keep the English list as a set so that
# membership checks during cleaning are fast.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the bootcamp catalogue; the path is relative to the notebook's
# working directory.
df = pd.read_csv("bootcamps.csv")

# Text columns that are merged into a single document per row.
combined_fields = ['Title', 'Description', 'Category', 'Scope', 'Goals', 'Features', 'Requirements']

# Blank out missing cells *before* casting to str: a bare astype(str) turns
# NaN into the literal word "nan", which would survive cleaning and end up
# embedded as if it were real content.
df['combined_text'] = (
    df[combined_fields]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)

# Clean the text
def clean_text(text, stop_word_set=None):
    """Lowercase ``text``, drop stopwords and strip punctuation.

    Stopwords are matched *before* inner punctuation is removed. Stripping
    punctuation first turns contractions such as "don't" into "dont", which
    no longer matches the stopword entry "don't" and therefore leaks into the
    output.

    Args:
        text: Raw text. ``None`` and float NaN are treated as empty text; any
            other non-string value is converted with ``str()``.
        stop_word_set: Optional collection of lowercase stopwords. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The surviving tokens, punctuation-free, joined by single spaces.
        May be an empty string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    active_stop_words = stop_words if stop_word_set is None else stop_word_set

    # Normalise the typographic apostrophe so "don’t" matches "don't".
    text = str(text).lower().replace('\u2019', "'")

    kept = []
    for token in text.split():
        # Trim punctuation at the token edges only ("the," -> "the") so inner
        # apostrophes are still present for the stopword lookup.
        trimmed = re.sub(r'^[^\w\s]+|[^\w\s]+$', '', token)
        # Full punctuation strip: this is the form that is emitted.
        bare = re.sub(r'[^\w\s]', '', trimmed)
        if not bare:
            continue  # token was punctuation only
        if trimmed in active_stop_words or bare in active_stop_words:
            continue
        kept.append(bare)
    return ' '.join(kept)

# Run each combined document through clean_text and store the result next to
# the raw combined text.
df['cleaned_text'] = df['combined_text'].map(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ghada./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# BERT Embeddings

Bootcamp & Programs

In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight & fast

# Pass encode() a plain list of strings rather than the Series itself.
# Integer indexing on a Series is label-based, so a Series whose index is not
# the default 0..n-1 (e.g. after filtering or sorting rows) could fail or be
# matched to the wrong rows if encode() indexes its input by position.
bert_embeddings = model.encode(
    df['cleaned_text'].astype(str).tolist(),
    show_progress_bar=True,
)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.26it/s]


In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')

# Guarantee every entry is a real string. Nulls are replaced with '' first,
# because astype(str) alone would turn them into the literal text "nan".
df['cleaned_text'] = df['cleaned_text'].fillna('').astype(str)

# Encode all bootcamps in a single call.
bootcamp_embeddings = model.encode(df['cleaned_text'].tolist(), show_progress_bar=True)

# Sanity check: there must be exactly one embedding per bootcamp row.
assert len(bootcamp_embeddings) == len(df), "expected one embedding per bootcamp row"
print("Bootcamp Embeddings shape:", np.asarray(bootcamp_embeddings).shape)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.17it/s]

Bootcamp Embeddings shape: (15, 384)





CV

In [None]:
cv_text = "I studied computer science and worked on AI projects using Python"

# Run the CV through the same clean_text preprocessing that produced the
# bootcamp texts, so both sides of any later similarity comparison are
# embedded from identically prepared input.
# Wrapping the text in a list yields a 2-D result with one row per text
# (expected shape: (1, 384)).
cv_embedding = model.encode([clean_text(cv_text)])

print("CV Embedding shape:", np.asarray(cv_embedding).shape)