In [3]:
# Step 1: Import required libraries
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [4]:
# Step 2: Read the pre-cleaned dataset from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="cleaned_data.csv")



In [8]:
# ✅ Step 3: Take a smaller sample for memory safety (at most 10,000 rows)
# The requested size is capped at the number of rows available, because
# DataFrame.sample raises ValueError when n exceeds the population and
# replace=False (the default). random_state keeps the sample reproducible.
df_sample = df.sample(n=min(10000, len(df)), random_state=42).reset_index(drop=True)

In [9]:
# Step 4: Combine User_Skills and Job_Requirements into a single 'tags' column
# Missing values are replaced with "" before concatenating: string + NaN yields
# NaN, and a NaN entry in 'tags' makes the vectorizer fail in the next step.
# astype(str) guards against a column that was not parsed as text.
df_sample["tags"] = (
    df_sample["User_Skills"].fillna("").astype(str)
    + " "
    + df_sample["Job_Requirements"].fillna("").astype(str)
)


In [10]:
# Step 5: Vectorize the tags using CountVectorizer
# Vocabulary is limited to the 5,000 most frequent terms, with English stop
# words removed.
cv = CountVectorizer(max_features=5000, stop_words='english')
# Keep the document-term matrix sparse: fit_transform already returns a SciPy
# sparse matrix, and cosine_similarity accepts sparse input directly, so
# calling .toarray() would only materialise a large, mostly-zero dense array.
vectors = cv.fit_transform(df_sample["tags"])




In [11]:
# Step 6: Build the pairwise cosine-similarity matrix over the rows of `vectors`
# (with no second argument, every row is compared against every other row)
similarity = cosine_similarity(X=vectors)


In [12]:
# Step 7: Save model and vectorizer
# Each artifact is written inside a `with` block so its file handle is flushed
# and closed as soon as the dump finishes, instead of being left open until
# garbage collection.
with open("df.pkl", "wb") as f:
    pickle.dump(df_sample, f)

with open("similarity.pkl", "wb") as f:
    pickle.dump(similarity, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(cv, f)

print("✅ Sampled model saved as df.pkl, similarity.pkl, vectorizer.pkl")

✅ Sampled model saved as df.pkl, similarity.pkl, vectorizer.pkl
