In [1]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Week 2: AI Matching with TF-IDF\n",
    "\n",
    "In this notebook, we build our first AI-powered job-candidate matching system using TF-IDF vectors and cosine similarity. "
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Step 0: Install missing packages automatically\n",
    "import sys\n",
    "import subprocess\n",
    "import importlib\n",
    "\n",
    "def install_if_missing(package):\n",
    "    try:\n",
    "        importlib.import_module(package)\n",
    "    except ImportError:\n",
    "        print(f\"Installing {package}...\")\n",
    "        subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", package])\n",
    "\n",
    "packages = [\"pandas\", \"scikit-learn\", \"matplotlib\", \"seaborn\"]\n",
    "for pkg in packages:\n",
    "    install_if_missing(pkg)"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Step 1: Imports\n",
    "import pandas as pd\n",
    "import json\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2: Load JSON data\n",
    "We load job descriptions and candidate profiles from the `data/` folder."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Load jobs safely\n",
    "jobs_path = \"../data/jobs/sample_jobs.json\"\n",
    "candidates_path = \"../data/candidates/sample_candidates.json\"\n",
    "\n",
    "def load_json(path):\n",
    "    with open(path, 'r') as f:\n",
    "        return json.load(f) if f.readable() else []\n",
    "\n",
    "jobs = load_json(jobs_path)\n",
    "candidates = load_json(candidates_path)\n",
    "\n",
    "# Convert to DataFrames\n",
    "jobs_df = pd.DataFrame(jobs)\n",
    "candidates_df = pd.DataFrame(candidates)\n",
    "\n",
    "jobs_df, candidates_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 3: Combine text fields\n",
    "We combine titles, descriptions, and skills for jobs, and summary + skills for candidates into a single text field."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "def combine_text(row):\n",
    "    return \" \".join([\n",
    "        row.get(\"title\", \"\"),\n",
    "        row.get(\"description\", \"\"),\n",
    "        \" \".join(row.get(\"skills\", []))\n",
    "    ])\n",
    "\n",
    "jobs_df[\"text\"] = jobs_df.apply(combine_text, axis=1)\n",
    "candidates_df[\"text\"] = candidates_df.apply(lambda r: r.get(\"summary\", \"\") + \" \" + \" \".join(r.get(\"skills\", [])), axis=1)\n",
    "\n",
    "jobs_df[[\"job_id\",\"text\"]], candidates_df[[\"candidate_id\",\"text\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 4: Compute TF-IDF vectors\n",
    "We convert text into numerical vectors that can be compared."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "all_text = list(jobs_df[\"text\"]) + list(candidates_df[\"text\"])\n",
    "vectorizer = TfidfVectorizer()\n",
    "vectorizer.fit(all_text)\n",
    "\n",
    "job_vectors = vectorizer.transform(jobs_df[\"text\"])\n",
    "candidate_vectors = vectorizer.transform(candidates_df[\"text\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 5: Compute Cosine Similarity\n",
    "This gives a score for each job-candidate pair."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "similarity = cosine_similarity(job_vectors, candidate_vectors)\n",
    "similarity"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 6: Print Ranked Matches"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "for i, job in jobs_df.iterrows():\n",
    "    print(f\"Job: {job['title']}\")\n",
    "    scores = similarity[i]\n",
    "    for j, cand in candidates_df.iterrows():\n",
    "        print(f\"  Candidate: {cand['candidate_id']}, score: {scores[j]:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 7: Visualize Matches"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "sns.heatmap(similarity, annot=True, xticklabels=candidates_df[\"candidate_id\"], yticklabels=jobs_df[\"job_id\"])\n",
    "plt.title(\"Job-Candidate Similarity\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ✅ Week 2 Summary\n",
    "- Loaded jobs and candidates from JSON\n",
    "- Combined text fields into a single representation\n",
    "- Computed TF-IDF vectors\n",
    "- Calculated cosine similarity\n",
    "- Visualized matches\n",
    "- Notebook installs required packages automatically\n",
    "\n",
    "This notebook is **ready to commit to GitHub** for Week 2."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


{'cells': [{'cell_type': 'markdown',
   'metadata': {},
   'source': ['# Week 2: AI Matching with TF-IDF\n',
    '\n',
    'In this notebook, we build our first AI-powered job-candidate matching system using TF-IDF vectors and cosine similarity. ']},
  {'cell_type': 'code',
   'metadata': {},
   'source': ['# Step 0: Install missing packages automatically\n',
    'import sys\n',
    'import subprocess\n',
    'import importlib\n',
    '\n',
    'def install_if_missing(package):\n',
    '    try:\n',
    '        importlib.import_module(package)\n',
    '    except ImportError:\n',
    '        print(f"Installing {package}...")\n',
    '        subprocess.check_call([sys.executable, "-m", "pip", "install", package])\n',
    '\n',
    'packages = ["pandas", "scikit-learn", "matplotlib", "seaborn"]\n',
    'for pkg in packages:\n',
    '    install_if_missing(pkg)']},
  {'cell_type': 'code',
   'metadata': {},
   'source': ['# Step 1: Imports\n',
    'import pandas as pd\n',
    'import j