In [None]:
# Empty frame; later cells add one column per resume–job feature.
features_df = pd.DataFrame()

In [None]:
# Collapse each token sequence into one space-separated string; these strings
# are what the text-based similarity functions receive.
resume_texts = list(map(" ".join, l3))
job_texts = list(map(" ".join, l4))


### TF-IDF Cosine Similarity
This code measures how closely each resume matches its paired job description using TF-IDF vectorization. A single TF-IDF vectorizer is fit on all resumes and job descriptions together so both share one vocabulary, and the cosine similarity is then computed row-wise between each resume vector and the job-description vector at the same position.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast

def compute_tfidf_cosine(resume_texts, job_texts):
    """Return the TF-IDF cosine similarity of each resume with its paired job text.

    A single ``TfidfVectorizer`` is fit on every resume and job text together,
    so both sides share one vocabulary and one set of IDF weights. The i-th
    resume is then compared with the i-th job description.

    Args:
        resume_texts: Iterable of resume strings.
        job_texts: Iterable of job-description strings, aligned by position
            with ``resume_texts``.

    Returns:
        list[float]: One similarity per resume–job pair. If the inputs differ
        in length, only the first ``min(len(resume_texts), len(job_texts))``
        pairs are scored (the same truncation ``zip`` / ``min`` give the other
        similarity functions in this notebook). Returns ``[]`` when either
        input is empty.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer.fit_transform`` when the
            texts yield an empty vocabulary (e.g. only blank strings).
    """
    # Materialize first: `+` on pandas Series / NumPy arrays would combine
    # element-wise instead of concatenating the two corpora.
    resume_docs = list(resume_texts)
    job_docs = list(job_texts)

    pair_count = min(len(resume_docs), len(job_docs))
    if pair_count == 0:
        # Nothing to compare, so skip fitting the vectorizer altogether.
        return []

    vectorizer = TfidfVectorizer()
    # Fit on all documents (not only the paired ones) so IDF weights are the
    # same as before for equally sized inputs.
    tfidf_matrix = vectorizer.fit_transform(resume_docs + job_docs)

    # Rows [0, n_resumes) hold the resumes; the remaining rows hold the jobs.
    n_resumes = len(resume_docs)
    resume_tfidf = tfidf_matrix[:n_resumes]
    job_tfidf = tfidf_matrix[n_resumes:]

    return [
        float(cosine_similarity(resume_tfidf[i], job_tfidf[i])[0][0])
        for i in range(pair_count)
    ]


### Jaccard Similarity
This code computes the Jaccard similarity between tokenized resumes (`l3`) and job descriptions (`l4`). Each text is converted to a set of words, and the similarity is calculated as the size of the intersection divided by the size of the union for each resume–job pair.


In [None]:
# Unique-token view of every resume and job description for set-based overlap.
resume_sets = list(map(set, l3))
job_sets = list(map(set, l4))

def compute_jaccard(resume_sets, job_sets):
    """Return the Jaccard similarity of each resume–job pair.

    Args:
        resume_sets: Iterable of token sets, one per resume.
        job_sets: Iterable of token sets, one per job description, aligned by
            position with ``resume_sets``.

    Returns:
        list[float]: ``len(a & b) / len(a | b)`` for every pair produced by
        ``zip`` (extra items in the longer input are ignored). A pair whose
        union is empty scores ``0.0``.
    """
    def jaccard_sim(a, b):
        union_size = len(a | b)  # computed once and reused as the denominator
        # Return a float in the degenerate case too, so every score has the
        # same type even when all pairs are empty.
        return len(a & b) / union_size if union_size else 0.0

    return [jaccard_sim(a, b) for a, b in zip(resume_sets, job_sets)]


### Sentence-BERT Cosine Similarity
This code uses the `all-MiniLM-L6-v2` SentenceTransformer model to compute semantic similarity between resumes and job descriptions. It encodes each text into embeddings and calculates cosine similarity for corresponding resume–job pairs to capture meaning-based matches.


In [None]:
from sentence_transformers import SentenceTransformer, util

def compute_sentencebert_cosine(resume_texts, job_texts, model_name='all-MiniLM-L6-v2', model=None):
    """Return the Sentence-BERT cosine similarity of each resume–job pair.

    Args:
        resume_texts: Iterable of resume strings.
        job_texts: Iterable of job-description strings, aligned by position
            with ``resume_texts``.
        model_name: Name passed to ``SentenceTransformer`` when ``model`` is
            not supplied. Defaults to ``'all-MiniLM-L6-v2'``.
        model: Optional already-loaded model exposing ``encode``; pass one to
            avoid reloading the weights on every call.

    Returns:
        list[float]: One cosine similarity per pair, for the first
        ``min(len(resume_texts), len(job_texts))`` pairs. Returns ``[]`` when
        either input is empty.
    """
    resume_docs = list(resume_texts)
    job_docs = list(job_texts)

    pair_count = min(len(resume_docs), len(job_docs))
    if pair_count == 0:
        # No pairs: skip model loading and encoding entirely.
        return []

    if model is None:
        model = SentenceTransformer(model_name)

    # Encode only the texts that will actually be paired.
    emb_r = model.encode(resume_docs[:pair_count], convert_to_tensor=True)
    emb_l = model.encode(job_docs[:pair_count], convert_to_tensor=True)

    return [
        util.cos_sim(emb_r[i], emb_l[i]).item()
        for i in range(pair_count)
    ]


In [None]:
# Score every resume–job pair with the three similarity measures.
tfidf_cosine = compute_tfidf_cosine(resume_texts, job_texts)
jaccard = compute_jaccard(list(map(set, l3)), list(map(set, l4)))
pairwise_similarities = compute_sentencebert_cosine(resume_texts, job_texts)

# Start from a fresh frame so re-running this cell never keeps stale columns.
features_df = pd.DataFrame()
for column_name, column_values in (
    ("tfidf_cosine", tfidf_cosine),
    ("jaccard", jaccard),
    ("bert_sim", pairwise_similarities),
):
    features_df[column_name] = column_values



### Skill Overlap Features
The `compute_skill_features` function computes row-wise skill-based features between tokenized resumes (`l3`) and job descriptions (`l4`). For each resume–job pair, its inner `skill_features` helper returns the number of overlapping skills, the fraction of the job's required skills that appear in the resume, and the count of required skills that are missing.


In [None]:
def compute_skill_features(l3, l4):
    """Compute skill-overlap statistics for each resume–job pair.

    Args:
        l3: Iterable of resume token collections; a falsy entry (such as
            ``None``) is treated as having no tokens.
        l4: Iterable of job token collections, aligned by position with
            ``l3``; falsy entries are handled the same way.

    Returns:
        list[tuple]: For every pair produced by ``zip``, a tuple
        ``(overlap_count, percent_required_present, missing_skill_count)``.
        The second value is the share of unique job tokens found in the
        resume, or ``0.0`` when the job has no tokens.
    """
    rows = []
    for resume_tokens, job_tokens in zip(l3, l4):
        resume_skills = set(resume_tokens) if resume_tokens else set()
        required_skills = set(job_tokens) if job_tokens else set()

        matched = len(resume_skills.intersection(required_skills))
        required = len(required_skills)

        if required > 0:
            coverage = matched / required
        else:
            coverage = 0.0

        # Unique job tokens that the resume does not contain.
        rows.append((matched, coverage, required - matched))
    return rows


# Transpose the per-pair tuples into one sequence per feature. With no pairs,
# `zip(*[])` yields nothing and the three-way unpacking would raise ValueError,
# so fall back to three empty tuples in that case.
_skill_rows = compute_skill_features(l3, l4)
overlap_count, percent_required_present, missing_skill_count = (
    zip(*_skill_rows) if _skill_rows else ((), (), ())
)




In [None]:

# Attach the three skill-overlap features, one column per feature.
for column_name, column_values in (
    ("overlap_count", overlap_count),
    ("percent_required_present", percent_required_present),
    ("missing_skill_count", missing_skill_count),
):
    features_df[column_name] = column_values

### Structural Text Features
This code extracts structural features from tokenized resumes (`l3`) and job descriptions (`l4`). Features include the token counts of each resume and job description, the overlap ratio of their unique tokens (intersection over union), and the absolute difference in token counts; all of these are then added to `features_df` for further analysis.


In [None]:
def compute_structural_features(l3, l4):
    """Compute length- and overlap-based features for resume–job pairs.

    Args:
        l3: Iterable of resume token sequences; a ``None`` entry is treated as
            an empty token list.
        l4: Iterable of job token sequences, aligned by position with ``l3``;
            ``None`` entries are handled the same way.

    Returns:
        tuple: ``(resume_length, job_length, overlap_ratio, token_count_diff)``
        where

        * ``resume_length`` / ``job_length`` are token counts per resume / job,
        * ``overlap_ratio`` is unique-token intersection over union per pair
          (``0.0`` when both sides are empty),
        * ``token_count_diff`` is the absolute difference in token counts per
          pair.

        The two per-pair lists cover only the pairs produced by ``zip``.
    """
    # Treat a missing token list as empty, in line with compute_skill_features,
    # instead of failing inside len().
    resume_tokens = [r if r is not None else [] for r in l3]
    job_tokens = [j if j is not None else [] for j in l4]

    resume_length = [len(r) for r in resume_tokens]
    job_length = [len(j) for j in job_tokens]

    overlap_ratio = []
    token_count_diff = []
    for r, j in zip(resume_tokens, job_tokens):
        rset, jset = set(r), set(j)  # build each set once per pair
        union_size = len(rset | jset)
        # Always a float, even for the empty-union case.
        overlap_ratio.append(len(rset & jset) / union_size if union_size else 0.0)
        token_count_diff.append(abs(len(r) - len(j)))

    return resume_length, job_length, overlap_ratio, token_count_diff

# Unpack the four structural feature lists computed over all resume–job pairs.
(
    resume_length,
    job_length,
    overlap_ratio,
    token_count_diff,
) = compute_structural_features(l3, l4)


In [None]:


# Attach the four structural features, one column per feature.
for column_name, column_values in (
    ("resume_length", resume_length),
    ("job_length", job_length),
    ("overlap_ratio", overlap_ratio),
    ("token_count_diff", token_count_diff),
):
    features_df[column_name] = column_values

# The ML model is implemented in ml_implementation.ipynb.