In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Read the applicant workbook into the DataFrame every later cell works on.
RESUME_WORKBOOK = 'Resumes (For Applicants).xlsx'
df = pd.read_excel(RESUME_WORKBOOK)

Some rows have the **Reason_for_decision** and **Job_Description** columns swapped.
Looking at the Excel file, the swapped rows are those where **Reason_for_decision** starts with "expected_experience".
We therefore find the rows whose **Reason_for_decision** value starts with that prefix and exchange it with the value in the **Job_Description** column.

In [None]:
# Collect the index labels of rows whose Reason_for_decision text begins with
# "expected_experience" (compared after trimming whitespace and lowercasing).
swapped_indices = [
    idx
    for idx, reason in df['Reason_for_decision'].items()
    if str(reason).strip().lower().startswith('expected_experience')
]

# Exchange the two cells on each flagged row. Both values are read before
# either is written, so nothing is overwritten prematurely. An empty list
# simply skips the loop.
for idx in swapped_indices:
    misplaced_description = df.loc[idx, 'Reason_for_decision']
    misplaced_reason = df.loc[idx, 'Job_Description']
    df.loc[idx, 'Reason_for_decision'] = misplaced_reason
    df.loc[idx, 'Job_Description'] = misplaced_description

Convert all text columns to lowercase as part of data preprocessing.

In [None]:
# Lowercase every text column so later keyword matching is case-insensitive.
text_columns = df.select_dtypes(include=['object', 'string']).columns

for col in text_columns:
    column = df[col]
    # Object columns may hold non-string cells (numbers, dates, NaN). Calling
    # `.str.lower()` on those yields NaN, which would silently discard data,
    # so only cells that really are strings are replaced.
    is_text = column.map(lambda value: isinstance(value, str)).astype(bool)
    if not is_text.any():
        # Nothing to lowercase (and `.str` would raise on a string-free column).
        continue
    df[col] = column.str.lower().where(is_text, column)

In [None]:
def get_email(text):
    """Return the first email address found in ``text``, or "Not Found".

    A strict address pattern is tried first so that glued prefixes and
    trailing punctuation ("email:jane@site.org,", "(jane@site.org)") are not
    returned as part of the address. If nothing matches it, the original
    loose ``\\S+@\\S+`` pattern is used as a fallback so inputs that matched
    before (for example an address without a dotted domain) still do.

    Args:
        text: Resume text; any value is converted with ``str()``.

    Returns:
        The matched address as a string, or "Not Found" when there is none.
    """
    raw = str(text)

    strict = re.search(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+', raw)
    if strict:
        return strict.group(0)

    # Fallback: anything non-blank around an "@", exactly as before.
    loose = re.search(r'\S+@\S+', raw)
    return loose.group(0) if loose else "Not Found"

def get_phone(text):
    """Return the first US-style phone number in ``text``, or "Not Found".

    Recognised shapes are ten digits optionally separated by a dash, dot or
    whitespace (123-456-7890) and a parenthesised area code ((123) 456-7890).
    """
    phone_pattern = re.compile(
        r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4})'
    )
    first_hit = phone_pattern.search(str(text))
    if first_hit is None:
        return "Not Found"
    return first_hit.group(1)

def get_name(text):
    """Gets name from resume column.

    Each non-empty line is checked, top to bottom, against these patterns and
    the first hit is returned:
        1. **candidate profile: Name** or **data scientist candidate profile: Name**
        2. Here's a sample/professional resume for Name
        3. **Name** (just name in bold)
        4. Plain name: a line shorter than 50 characters not ending in "."

    For pattern 2 the captured text is trimmed of surrounding punctuation and
    markdown asterisks, so "here's a sample resume for john doe:" yields
    "john doe" rather than "john doe:".

    Args:
        text: Resume text; any value is converted with ``str()``.

    Returns:
        The extracted name, or "Not Found" if no line qualifies.
    """
    for line in str(text).split('\n'):
        line_stripped = line.strip()
        if not line_stripped:
            continue

        # Pattern 1: **candidate profile: Name** or **data scientist candidate profile: Name**
        match = re.search(r'\*\*.*?candidate profile:\s*(.+?)\*\*', line_stripped, re.IGNORECASE)
        if match:
            return match.group(1).strip()

        # Pattern 2: Here's a sample/professional resume for Name
        match = re.search(r"here'?s?\s+a\s+(sample|professional)\s+resume\s+for\s+(.+)", line_stripped, re.IGNORECASE)
        if match:
            # The capture runs to the end of the line, so drop the colon,
            # period or bold markers that commonly follow or wrap the name.
            name = match.group(2).strip(' \t:;,.*')
            if name:
                return name

        # Pattern 3: **Name** (just name in bold, no other text)
        match = re.search(r'^\*\*(.+?)\*\*$', line_stripped)
        if match:
            name = match.group(1).strip()
            # Make sure it's not a profile pattern
            if 'candidate profile' not in name.lower():
                return name

        # Pattern 4: Plain name (first non-empty line that doesn't match above)
        # Limit line to less than 50 characters because names are not long.
        if len(line_stripped) < 50 and not line_stripped.endswith('.'):
            return line_stripped

    # If no text found, return "Not Found"
    return "Not Found"

def get_education(text):
    """
    Extract the degree from the first entry of a resume's education section.

    - Looks for a line that is exactly "education:" or "**education:**"
      (matched case-insensitively).
    - Takes the next non-empty line, which is expected to hold the degree.
    - Format: * bachelor of science in computer science, xyz university (2010-2014)
    - Returns only the part before the first comma (the degree, without the
      university name); if there is no comma the whole cleaned line is returned.

    Markdown bold markers are removed before the leading bullet is stripped,
    so a line that starts directly with "**" no longer leaves a dangling
    "**" at the end of the degree.

    Args:
        text: Resume text; any value is converted with ``str()``.

    Returns:
        The degree text, or "Not Found" if no education header (or no line
        after it) exists.
    """
    found_education_header = False

    for line in str(text).split('\n'):
        line_stripped = line.strip()

        if not found_education_header:
            # Still searching for the header line (education: or **education:**).
            if re.match(r'^\*{0,2}education:\*{0,2}$', line_stripped, re.IGNORECASE):
                found_education_header = True
            continue

        # Header already seen: skip blank lines until the first entry.
        if not line_stripped:
            continue

        # Drop bold markers first, then any leading "*" bullet. Doing it in
        # the opposite order eats the opening "**" and leaves the closing one.
        cleaned = line_stripped.replace('**', '')
        cleaned = re.sub(r'^\*+\s*', '', cleaned).strip()

        # Everything before the first comma is the degree; the rest is the
        # university and dates. Without a comma the whole line is the degree.
        if ',' in cleaned:
            return cleaned.split(',')[0].strip()
        return cleaned

    # If no education found, return "Not Found"
    return "Not Found"

# Pulls a years-of-experience figure out of free text.
def get_experience(text):
    """Return the first "N years" figure mentioned in ``text`` as an int.

    Handles "5 years", "3+ years" and ranges such as "2-4 years" (the lower
    bound is used). Returns 0 when no such phrase is present.
    """
    first_mention = re.search(r'(\d+)\+?\s*(?:-\s*\d+)?\s*years?', str(text).lower())
    # Only the earliest mention counts; later ones are ignored.
    return int(first_mention.group(1)) if first_mention else 0

# This function finds skills mentioned in the resume
def get_skills(text):
    """Return the known technical skills mentioned in ``text``.

    Each keyword must appear as a standalone token: it may not be directly
    preceded or followed by a word character, so 'r' does not match inside
    "ready" and 'java' does not match inside "javascript".

    Lookarounds are used instead of ``\\b`` because ``\\b`` after a keyword
    that ends in punctuation ('c++', 'c#') only matches when a word character
    follows, which meant those skills were never found in ordinary text.

    Args:
        text: Resume text; any value is converted with ``str()`` and lowercased.

    Returns:
        The matched keywords joined by ", " in keyword-list order, or "None"
        when no keyword is present.
    """
    # Common technical skills to look for.
    keywords = [
        'python', 'java', 'javascript', 'sql', 'r', 'c++', 'c#',
        'machine learning', 'deep learning', 'data analysis', 'statistics',
        'tableau', 'power bi', 'excel', 'aws', 'azure', 'cloud',
        'git', 'docker', 'kubernetes', 'tensorflow', 'pytorch',
        'html', 'css', 'react', 'angular', 'node.js', 'flask', 'django'
    ]
    text_lower = str(text).lower()
    found = [
        word for word in keywords
        if re.search(r'(?<!\w)' + re.escape(word) + r'(?!\w)', text_lower)
    ]
    # Join all found skills with commas, or return "None" if no skills found
    return ", ".join(found) if found else "None"

# %% [4] Apply Extraction to All Resumes
# Every extractor is run once per resume and its result is stored in a new
# column named after the dictionary key. Dictionaries keep insertion order,
# so the columns are created in the order listed here.
resume_extractors = {
    'Email': get_email,
    'Phone': get_phone,
    'Name': get_name,
    'Education': get_education,
    'Experience_Years': get_experience,
    'Skills': get_skills,
}

for column_name, extractor in resume_extractors.items():
    df[column_name] = df['Resume'].apply(extractor)

## TASK 2

In [None]:
# Which jobs are most frequently applied for?
# One bar per role; value_counts() orders roles from most to least applicants.
fig, ax = plt.subplots(figsize=(12, 6))

role_counts = df['Role'].value_counts()
role_counts.plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')

# Title and axis labels so the figure can be read on its own.
ax.set_title("Number of Applicants per Role", fontsize=16, fontweight='bold')
ax.set_xlabel("Job Role", fontsize=12)
ax.set_ylabel("Number of Applicants", fontsize=12)

# Slant the role names by 45 degrees so neighbouring labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Faint horizontal grid lines make bar heights easier to compare.
ax.grid(axis='y', alpha=0.3)

# Keep labels inside the figure area, then render.
fig.tight_layout()
plt.show()

In [None]:
def clean_education(text):
    """
    Map raw education text to a standardized field-of-study category.

    Rules are checked in the order listed and the first match wins. Field
    keywords are matched as whole words: with plain substring checks, short
    keywords such as 'cs', 'it' and 'ece' also matched inside unrelated words
    ("physics", "economics", "humanities"), sending those degrees to the
    wrong category. The generic 'engineering' rule and the degree-level
    fallback still use substring matching.

    Args:
        text: Raw education string; NaN, empty values and "Not Found" are
            treated as missing.

    Returns:
        A category name, "Other Degree" when only a degree level is
        recognised, "Other" when nothing matches, or "Not Found" for
        missing input.
    """
    if pd.isna(text) or text == "Not Found" or not text:
        return "Not Found"

    text_lower = str(text).lower()

    # (category, keywords, whole_word) - order matters, first hit is returned.
    rules = [
        ("Computer Science", ['computer science', 'cs', 'computer engineering', 'computing'], True),
        ("Data Science", ['data science', 'data analytics', 'data engineering'], True),
        ("Information Technology", ['information technology', 'it', 'information systems'], True),
        ("Software Engineering", ['software engineering', 'software development'], True),
        ("Electrical Engineering", ['electrical engineering', 'electronics', 'ece'], True),
        ("Mechanical Engineering", ['mechanical engineering', 'manufacturing'], True),
        ("Civil Engineering", ['civil engineering'], True),
        # Catch-all for any remaining engineering discipline.
        ("Engineering (Other)", ['engineering'], False),
        ("Business Administration", ['business administration', 'mba', 'business management'], True),
        ("Finance/Economics", ['finance', 'accounting', 'economics'], True),
        ("Marketing", ['marketing', 'sales'], True),
        ("Mathematics/Statistics", ['mathematics', 'statistics', 'math'], True),
        ("Natural Sciences", ['physics', 'chemistry', 'biology'], True),
        ("Arts/Humanities", ['arts', 'humanities', 'literature', 'history'], True),
        # Degree level only; substring so "masters" / "bachelors" still count.
        ("Other Degree", ['bachelor', 'master', 'phd', 'degree', 'diploma'], False),
    ]

    for category, keywords, whole_word in rules:
        for keyword in keywords:
            if whole_word:
                matched = re.search(r'\b' + re.escape(keyword) + r'\b', text_lower) is not None
            else:
                matched = keyword in text_lower
            if matched:
                return category

    return "Other"

# Status messages; the check mark had been saved as mis-decoded bytes ("âœ“").
print("✓ Education cleaning function defined")
print("This function will categorize education backgrounds into standard fields")

# Categorize every extracted education entry into a standard field of study.
df['Education_Clean'] = df['Education'].apply(clean_education)

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Keep only rows with a recognised education category, then take the
# 15 most frequent categories to avoid a cluttered chart.
known_education = df.loc[df['Education_Clean'] != 'Not Found', 'Education_Clean']
education_counts = known_education.value_counts().head(15)

# Horizontal bars leave room for the long category names.
education_counts.plot(kind='barh', ax=ax, color='lightgreen', edgecolor='black')

ax.set_title("Top 15 Most Common Education Backgrounds (By Degree Field)", fontsize=16, fontweight='bold')
ax.set_xlabel("Number of Applicants", fontsize=12)
ax.set_ylabel("Education Background", fontsize=12)

# Vertical grid lines help read bar lengths.
ax.grid(axis='x', alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Distribution of Experience Levels
# Shows how candidates are spread across years of experience.
fig, ax = plt.subplots(figsize=(10, 6))

# Histogram with 20 equal-width bins: each bar counts the applicants whose
# extracted years of experience fall in that range.
ax.hist(df['Experience_Years'], bins=20, color='coral', edgecolor='black', alpha=0.7)

ax.set_title("Distribution of Candidate Experience Levels", fontsize=16, fontweight='bold')
ax.set_xlabel("Years of Experience", fontsize=12)
ax.set_ylabel("Number of Applicants", fontsize=12)

# Light horizontal grid for reading counts.
ax.grid(axis='y', alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Skills Distribution Analysis
def _count_listed_skills(skills_text):
    """Return how many comma-separated skills a Skills cell lists (0 for 'None')."""
    if skills_text == 'None':
        return 0
    return len(str(skills_text).split(','))

# Number of detected skills per candidate; reused by later cells.
df['Skill_Count'] = df['Skills'].apply(_count_listed_skills)

# Histogram of how many skills candidates list.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['Skill_Count'], bins=15, color='skyblue', edgecolor='black', alpha=0.7)

ax.set_title("Distribution of Technical Skill Diversity Among Candidates", fontsize=16, fontweight='bold')
ax.set_xlabel("Number of Skills", fontsize=12)
ax.set_ylabel("Number of Applicants", fontsize=12)

# Light horizontal grid for reading counts.
ax.grid(axis='y', alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Decision Patterns by Experience Level
# Bucket candidates by years of experience.
# include_lowest=True puts 0-year candidates in the "Entry (0-2 yrs)" bucket;
# by default pd.cut intervals are open on the left, so 0 fell outside every
# bin and those rows were dropped from the chart. The open-ended top bin uses
# infinity so that any value above 10 lands in "Expert (10+ yrs)".
df['Experience_Level'] = pd.cut(df['Experience_Years'],
                                bins=[0, 2, 5, 10, np.inf],
                                labels=['Entry (0-2 yrs)', 'Mid (3-5 yrs)',
                                        'Senior (6-10 yrs)', 'Expert (10+ yrs)'],
                                include_lowest=True)

# Bucket candidates by number of detected skills. Candidates with 0 skills
# are left out on purpose: the first bucket is labelled "Few (1-3)".
df['Skill_Level'] = pd.cut(df['Skill_Count'],
                           bins=[0, 3, 6, 9, 30],
                           labels=['Few (1-3)', 'Moderate (4-6)', 'Many (7-9)', 'Expert (10+)'])

# Row-normalised crosstabs: within each bucket, the percentage per decision.
exp_decision = pd.crosstab(df['Experience_Level'], df['Decision'], normalize='index') * 100
skill_decision = pd.crosstab(df['Skill_Level'], df['Decision'], normalize='index') * 100

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: experience level. Right panel: skill level.
panels = [
    (axes[0], exp_decision, "Selection Rate by Experience Level", "Experience Level"),
    (axes[1], skill_decision, "Selection Rate by Skill Count", "Skill Level"),
]

for ax, rates, title, xlabel in panels:
    rates.plot(kind='bar', ax=ax, color=['lightcoral', 'lightgreen'], edgecolor='black')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel("Percentage (%)", fontsize=12)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    # Take legend labels from the crosstab columns so they always match the
    # plotted series ('reject' -> 'Reject', 'select' -> 'Select').
    ax.legend(title="Decision", labels=[str(label).title() for label in rates.columns])
    ax.grid(axis='y', alpha=0.3)

    # Print the percentage on top of every bar.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.1f%%', fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# Patterns in Hired vs Rejected Candidates

# One row, two columns: overall decision split on the left, per-role on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Colour is tied to the decision label, not to its position. value_counts()
# orders by frequency, so a positional colour list would paint the more
# common decision green regardless of which decision it is.
decision_palette = {'select': 'lightgreen', 'reject': 'lightcoral'}

# Left chart: share of each decision.
decision_counts = df['Decision'].value_counts()
pie_colors = [decision_palette.get(label, 'lightgray') for label in decision_counts.index]
# autopct prints percentages with one decimal; startangle=90 starts at the top.
decision_counts.plot(kind='pie', ax=axes[0], autopct='%1.1f%%',
                     colors=pie_colors,
                     startangle=90)

axes[0].set_title("Overall Hiring Decision Distribution", fontsize=14, fontweight='bold')
# Pie charts do not need a y-axis label.
axes[0].set_ylabel('')

# Right chart: number of applicants per decision for each role.
decision_by_role = pd.crosstab(df['Role'], df['Decision'])
bar_colors = [decision_palette.get(label, 'lightgray') for label in decision_by_role.columns]
# Grouped (side-by-side) bars, one bar per decision within each role.
decision_by_role.plot(kind='bar', ax=axes[1], color=bar_colors)

axes[1].set_title("Hiring Decisions by Role", fontsize=14, fontweight='bold')
axes[1].set_xlabel("Job Role", fontsize=12)
axes[1].set_ylabel("Number of Applicants", fontsize=12)
axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=45, ha='right')
axes[1].legend(title="Decision")
axes[1].grid(axis='y', alpha=0.3)

# Adjust layout so nothing overlaps
plt.tight_layout()
plt.show()

In [None]:
# Skill Diversity vs. Experience (Scatter Plot with Regression Line)
# Question: do candidates with more experience list more skills?
# An upward line suggests skills grow with experience; a flat line suggests
# the number of listed skills levels off.
# Skill_Count is expected to exist already (computed in an earlier cell).

fig, ax = plt.subplots(figsize=(12, 7))

experience = df['Experience_Years']
skill_totals = df['Skill_Count']

# One semi-transparent dot per candidate so overlapping points stay visible.
ax.scatter(experience, skill_totals, alpha=0.5,
           color='purple', edgecolors='black', s=60)

# Least-squares straight line (degree-1 polynomial) through the points.
z = np.polyfit(experience, skill_totals, 1)
p = np.poly1d(z)
slope = z[0]

# Draw the fitted line as a dashed red line.
ax.plot(experience, p(experience),
        "r--", linewidth=3, label=f'Regression Line (slope={slope:.3f})')

ax.set_title("Skill Diversity vs. Experience: Are Candidates Upskilling?",
             fontsize=16, fontweight='bold')
ax.set_xlabel("Years of Experience", fontsize=13)
ax.set_ylabel("Number of Unique Skills Identified", fontsize=13)
ax.legend(fontsize=11)
ax.grid(alpha=0.3)

# Pick the annotation from the slope: beyond +/-0.1 counts as a trend.
if slope > 0.1:
    trend_message = 'Positive Trend: More experience -> More skills'
    trend_color = 'lightgreen'
elif slope < -0.1:
    trend_message = 'Negative Trend: More experience -> Fewer unique skills listed'
    trend_color = 'lightcoral'
else:
    trend_message = 'Flat Trend: Experience and skills not strongly correlated'
    trend_color = 'lightyellow'

ax.text(0.5, 0.95, trend_message,
        transform=ax.transAxes, fontsize=11,
        bbox=dict(boxstyle='round', facecolor=trend_color, alpha=0.8),
        verticalalignment='top')

fig.tight_layout()
plt.show()

In [None]:
# Top Skills in Demand
# Which skills are mentioned most often across all resumes?

# Flatten every resume's comma-separated Skills cell into one list of
# individual skill names, skipping resumes with no detected skills ("None").
all_skills = [
    skill.strip()
    for skills in df['Skills']
    if skills != 'None'
    for skill in str(skills).split(',')
]

# Frequency of each skill, keeping the 15 most common.
skill_counts = pd.Series(all_skills).value_counts().head(15)

# Horizontal bar chart of the top skills.
fig, ax = plt.subplots(figsize=(12, 6))
skill_counts.plot(kind='barh', ax=ax, color='gold', edgecolor='black')

ax.set_title("Top 15 Most In-Demand Technical Skills Across All Resumes", fontsize=16, fontweight='bold')
ax.set_xlabel("Number of Resumes Mentioning This Skill", fontsize=12)
ax.set_ylabel("Skill", fontsize=12)
ax.grid(axis='x', alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Decision Heatmap: Skills vs. Selection
# For each of the most common skills, what share of the candidates listing it
# were selected and what share were rejected?

# The 15 most frequently listed skills.
top_15_skills = pd.Series(all_skills).value_counts().head(15).index.tolist()


def _listed_skills(skills_text):
    """Return the lowercase skill names in one Skills cell ([] when there are none)."""
    if skills_text == 'None' or pd.isna(skills_text):
        return []
    return [item.strip().lower() for item in str(skills_text).split(',')]


# Parse each resume's skills once instead of once per skill.
skills_per_resume = df['Skills'].apply(_listed_skills)
is_selected = df['Decision'] == 'select'
is_rejected = df['Decision'] == 'reject'

# One record per skill with its selection and rejection percentages.
skill_selection_data = []

for skill in top_15_skills:
    wanted = skill.lower()
    # Exact (case-insensitive) membership test against each resume's skills.
    has_skill = skills_per_resume.apply(lambda items: wanted in items)

    total_with_skill = has_skill.sum()
    if total_with_skill <= 0:
        continue

    selected_with_skill = (has_skill & is_selected).sum()
    rejected_with_skill = (has_skill & is_rejected).sum()

    skill_selection_data.append({
        'Skill': skill.title(),
        'Select': (selected_with_skill / total_with_skill) * 100,
        'Reject': (rejected_with_skill / total_with_skill) * 100,
        'Total': total_with_skill,
        'Selected_Count': selected_with_skill,
        'Rejected_Count': rejected_with_skill
    })

skill_df = pd.DataFrame(skill_selection_data)

# Transpose so skills run along the x-axis and the two decisions are rows.
heatmap_data = skill_df[['Skill', 'Select', 'Reject']].set_index('Skill').T

plt.figure(figsize=(16, 6))

# RdYlGn colormap on a fixed 0-100 scale: green cells are high percentages.
sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='RdYlGn',
            cbar_kws={'label': 'Percentage (%)'},
            linewidths=1, linecolor='white',
            vmin=0, vmax=100)

plt.title('Decision Heatmap: Skills vs. Selection Rate\n(Green = High Selection | Red = Low Selection)',
          fontsize=16, fontweight='bold')
plt.xlabel('Top 15 Skills', fontsize=13)
plt.ylabel('Hiring Decision', fontsize=13)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Analyze reasons by decision type
selected = df[df['Decision'] == 'select']['Reason_for_decision']
rejected = df[df['Decision'] == 'reject']['Reason_for_decision']

# Define categories and their keywords to extract patterns
reason_categories = {
    'Experience Issues': [
        'lack', 'lacks', 'insufficient', 'limited', 'no experience', 
        'expected_experience', 'not enough', 'minimal', 'inadequate experience'
    ],
    'Technical Skills': [
        'technical skills', 'strong technical', 'proficient', 'expertise',
        'proficiency', 'skilled in', 'advanced knowledge'
    ],
    'Cloud/DevOps': [
        'cloud', 'aws', 'azure', 'kubernetes', 'docker', 'devops',
        'cloud platforms', 'infrastructure'
    ],
    'Cultural Fit': [
        'cultural fit', 'culture', 'team fit', 'alignment', 'values'
    ],
    'Communication/Soft Skills': [
        'communication', 'soft skills', 'interpersonal', 'collaboration',
        'teamwork', 'presentation'
    ],
    'Leadership/Management': [
        'leadership', 'management', 'lead', 'mentor', 'senior', 'strategic'
    ],
    'System Design/Architecture': [
        'system design', 'architecture', 'design patterns', 'scalability',
        'architectural'
    ],
    'Domain Knowledge': [
        'domain', 'industry', 'business knowledge', 'sector'
    ],
    'Full-Stack/Development': [
        'full-stack', 'full stack', 'front-end', 'back-end', 
        'frontend', 'backend', 'development experience'
    ],
    'Data/Analytics': [
        'data analysis', 'analytics', 'data science', 'machine learning',
        'ai', 'ml', 'statistical'
    ]
}


def _keyword_regex(keywords):
    """Build one regex matching any keyword at the start of a word.

    Keywords may be followed by more letters ('lead' still matches
    "leading"), except tokens of one or two characters such as 'ai' and
    'ml', which must be whole words so they do not match inside "maintain"
    or "html". The leading boundary stops 'lack' matching "black" and
    'limited' matching "unlimited".
    """
    parts = []
    for keyword in keywords:
        escaped = re.escape(keyword)
        parts.append(escaped + r'(?!\w)' if len(keyword) <= 2 else escaped)
    return r'(?<!\w)(?:' + '|'.join(parts) + r')'


def count_reason_categories(reasons):
    """Count, per category, the reasons that mention at least one of its keywords.

    Each reason counts at most once per category. Summing one count per
    keyword counted a single reason several times (for example both 'lack'
    and 'lacks'), which inflated the percentages plotted below.

    Args:
        reasons: Series of reason texts (missing values are ignored).

    Returns:
        Dict mapping category name to number of matching reasons; categories
        with no match are omitted.
    """
    lowered = reasons.str.lower()
    counts = {}
    for category, keywords in reason_categories.items():
        mentions = lowered.str.contains(_keyword_regex(keywords), regex=True, na=False)
        count = int(mentions.sum())
        if count > 0:
            counts[category] = count
    return counts


# Count occurrences for rejected and selected candidates
rejection_pattern_counts = count_reason_categories(rejected)
selection_pattern_counts = count_reason_categories(selected)

# Visualize patterns in hiring/rejection reasons
fig, ax = plt.subplots(figsize=(14, 8))

# Every category that appears for either group, in alphabetical order
all_categories = set(list(rejection_pattern_counts.keys()) + list(selection_pattern_counts.keys()))
categories_list = sorted(all_categories)

# Share of each group whose reason mentions the category.
# An empty group yields 0% instead of a division-by-zero error.
n_rejected = len(rejected)
n_selected = len(selected)
reject_pcts = []
select_pcts = []
for cat in categories_list:
    reject_count = rejection_pattern_counts.get(cat, 0)
    select_count = selection_pattern_counts.get(cat, 0)
    reject_pcts.append((reject_count / n_rejected) * 100 if n_rejected else 0.0)
    select_pcts.append((select_count / n_selected) * 100 if n_selected else 0.0)

# Create grouped bar chart: two horizontal bars per category
x = np.arange(len(categories_list))
width = 0.35

bars1 = ax.barh(x - width/2, reject_pcts, width, label='Rejected', color='#E74C3C', alpha=0.8)
bars2 = ax.barh(x + width/2, select_pcts, width, label='Selected', color='#2ECC71', alpha=0.8)

ax.set_xlabel('Percentage of Candidates', fontsize=12, fontweight='bold')
ax.set_ylabel('Reason Category', fontsize=12, fontweight='bold')
ax.set_title('Hiring Decision Patterns: What Matters for Selection vs Rejection\n(Based on "Reason_for_decision" text analysis)', 
             fontsize=14, fontweight='bold', pad=20)
ax.set_yticks(x)
ax.set_yticklabels(categories_list)
ax.legend(loc='lower right', fontsize=11)
ax.grid(axis='x', alpha=0.3)

# Add percentage labels
for bars in [bars1, bars2]:
    for bar in bars:
        width_val = bar.get_width()
        if width_val > 1:  # Only show label if > 1%
            ax.text(width_val + 0.5, bar.get_y() + bar.get_height()/2, 
                   f'{width_val:.1f}%', va='center', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()

## TASK 3

In [None]:
# - First, we convert extracted data (Education, Experience, Skills) into numbers
# - Then, we build: Model 1 (Role Classifier) + Model 2 (Role-Specific Models)

# 1. (Convert Structured Data to Numbers)

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Experience Years
# Already numeric, but standardized so it sits on a scale comparable to the
# other features: StandardScaler rescales to mean 0 and standard deviation 1.
# The scaler expects a 2-D input, hence the reshape to a single column.
experience_scaler = StandardScaler()
X_experience = experience_scaler.fit_transform(
    df['Experience_Years'].to_numpy().reshape(-1, 1)
)

# Education Level
# Education is free text (e.g. "bachelor of science"). LabelEncoder assigns
# every distinct string an integer code; the codes are reshaped into a single
# feature column.
education_encoder = LabelEncoder()
education_codes = education_encoder.fit_transform(df['Education'])
X_education = education_codes.reshape(-1, 1)

# Individual Skills (One feature per skill)
# Skills is like "python, sql, react" - we want to create separate features
# for each skill: has_python=1/0, has_sql=1/0, has_react=1/0
# CountVectorizer can do this if we treat skills like a document
# Create a custom tokenizer that splits on commas
# This treats "python, sql, react" as three separate tokens
def skill_tokenizer(text):
    """Split a comma-separated skills string into normalised skill tokens.

    Args:
        text: Raw skills value such as "python, sql, react". Missing values
            (NaN / None) and the placeholder string "None" (in any casing,
            with or without surrounding whitespace) mean "no skills".

    Returns:
        list[str]: Lower-cased, whitespace-stripped skill names in their
        original order. Blank entries caused by stray or trailing commas are
        dropped so they never turn into an empty-string feature.
    """
    # The string comparison is done first (as before) so that plain strings
    # never reach pd.isna(); pd.isna() then catches NaN / None.
    if text == 'None' or pd.isna(text):
        return []

    normalised = str(text).strip().lower()
    # Text columns are lowercased earlier in the notebook, so the placeholder
    # can also arrive as "none"; treat every casing the same way.
    if normalised == 'none':
        return []

    pieces = (piece.strip() for piece in normalised.split(','))
    # Skip empty pieces ("python,, sql" or "python, ") instead of emitting ''.
    return [piece for piece in pieces if piece]

# CountVectorizer with binary=True creates one 0/1 feature per skill:
# 1 if the skill appears in the row, 0 otherwise (occurrence counts are ignored).
skills_vectorizer = CountVectorizer(
    tokenizer=skill_tokenizer,
    lowercase=False,  # Already lowercased in tokenizer
    token_pattern=None,  # Explicitly set to None since we're using a custom tokenizer
    binary=True       # Each skill is just 0 (absent) or 1 (present)
)

# Learn the skill vocabulary and build the (n_rows x n_skills) sparse matrix
X_skills = skills_vectorizer.fit_transform(df['Skills'])
skill_names = skills_vectorizer.get_feature_names_out()

# COMBINE ALL FEATURES
# Stack all features together horizontally (side by side)
# Final result: [1 experience | 1 education | N skills] features
# Convert numpy arrays to sparse matrices so they match X_skills format
X_experience_sparse = csr_matrix(X_experience)
X_education_sparse = csr_matrix(X_education)

# Stack everything together: [Experience | Education | Skills]
# format='csr' is requested explicitly: scipy only promises "an appropriate
# sparse format" by default, and X is sliced by row further down
# (X[role_mask.values]), which requires a row-indexable format such as CSR.
X = hstack([X_experience_sparse, X_education_sparse, X_skills], format='csr')

# 2. TRAIN ROLE CLASSIFIER (Which role fits best?)
# Target: the role label of every row.
y_role = df['Role']

# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# - stratify=y_role keeps the role proportions the same in both parts
# - random_state=42 makes the split repeatable between runs
X_train_role, X_test_role, y_train_role, y_test_role = train_test_split(
    X,
    y_role,
    test_size=0.2,
    stratify=y_role,
    random_state=42,
)

# Random forest = an ensemble of decision trees that vote on the prediction.
# The estimator is configured and fitted on the training part in one
# expression (fit() returns the estimator itself).
role_classifier = RandomForestClassifier(
    n_estimators=200,          # number of trees in the forest
    max_depth=30,              # upper bound on the depth of each tree
    min_samples_split=5,       # a node needs >= 5 samples before it may split
    min_samples_leaf=2,        # every leaf must keep >= 2 samples
    class_weight='balanced',   # re-weight classes so rare roles are not ignored
    random_state=42,           # repeatable results
).fit(X_train_role, y_train_role)

# Predict the role of the held-out resumes ...
role_predictions = role_classifier.predict(X_test_role)

# ... and measure the fraction of them that was predicted correctly.
role_accuracy = accuracy_score(y_test_role, role_predictions)

# 3: TRAIN ROLE-SPECIFIC SUITABILITY MODELS
# For each role, train a select/reject classifier using only that role's rows.
# Roles that cannot be trained are reported (not silently dropped) so it is
# clear why they have no entry in role_specific_models.

# role -> fitted RandomForestClassifier for that role
role_specific_models = {}
# role -> accuracy of that model on its own held-out 20%
role_model_accuracies = {}
# All distinct roles, in a stable (sorted) order
all_roles = sorted(df['Role'].unique())

# Train one model for each role
for role in all_roles:
    role_mask = df['Role'] == role
    role_df = df[role_mask]

    # Class balance for this role (used in the skip messages below)
    select_count = (role_df['Decision'] == 'select').sum()
    reject_count = (role_df['Decision'] == 'reject').sum()
    total_count = len(role_df)

    # Skip if too few examples (need at least 10 to train properly)
    if total_count < 10:
        print(f"Skipping role '{role}': only {total_count} resumes "
              f"({select_count} select / {reject_count} reject), need at least 10")
        continue

    # Features (X) and labels (y) for this role only.
    # .values turns the boolean pandas Series into a numpy mask, which is what
    # sparse-matrix row indexing expects.
    X_role = X[role_mask.values]
    y_role_decision = df.loc[role_mask, 'Decision']

    # Split into 80% training / 20% testing, keeping the select/reject ratio
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X_role,
            y_role_decision,
            test_size=0.2,          # 20% for testing
            random_state=42,
            stratify=y_role_decision  # Keep select/reject proportions balanced
        )
    except ValueError as split_error:
        # A stratified split is impossible for this role (for example a class
        # with a single member). Report it and move on to the next role.
        # NOTE(review): a role whose rows all share one Decision value may
        # still split successfully and yield a trivial model -- confirm
        # whether such roles should be skipped as well.
        print(f"Skipping role '{role}': could not make a stratified split "
              f"({select_count} select / {reject_count} reject): {split_error}")
        continue

    # Train the model for this specific role
    model = RandomForestClassifier(
        n_estimators=100,          # fewer trees than the role classifier (less data per role)
        max_depth=20,              # shallower trees than the role classifier
        min_samples_split=5,       # a node needs >= 5 samples before it may split
        min_samples_leaf=2,        # every leaf must keep >= 2 samples
        random_state=42,
        class_weight='balanced'    # compensate for uneven select/reject counts
    )
    model.fit(X_train, y_train)

    # Evaluate on the held-out part of this role's data
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    # Store the trained model and its accuracy
    role_specific_models[role] = model
    role_model_accuracies[role] = accuracy

In [None]:
# Model Comparison - Testing Different Algorithms for Role Classification
# Compare Random Forest with other ML algorithms for ROLE prediction
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Candidate algorithms; each one is fitted and scored on the same
# role train/test split that was created for the role classifier.
role_models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=5000, random_state=42, solver='saga'),
    'Support Vector Machine': SVC(kernel='linear', random_state=42, probability=True)
}

# algorithm name -> accuracy on the held-out role test set
role_results = {}
for name, clf in role_models.items():
    pred = clf.fit(X_train_role, y_train_role).predict(X_test_role)
    acc = accuracy_score(y_test_role, pred)
    role_results[name] = acc

# Mean hold-out accuracy over all role-specific select/reject models
avg_role_specific_accuracy = np.mean([score for score in role_model_accuracies.values()])

# --- Bar chart: accuracy per algorithm ---
fig, ax = plt.subplots(figsize=(10, 6))

model_names = list(role_results)
accuracies = [role_results[model_name] for model_name in model_names]

ax.bar(
    model_names,
    accuracies,
    color=['skyblue', 'lightcoral', 'lightgreen'],
    edgecolor='black',
    alpha=0.8,
)

# Titles and axis: accuracies are fractions in [0, 1], shown as percentages
ax.set_title('Role Classifier - Algorithm Accuracy Comparison', fontsize=14, fontweight='bold')
ax.set_ylabel('Accuracy (%)', fontsize=12)
ax.set_ylim(0, 1.0)
ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_yticklabels(['0%', '20%', '40%', '60%', '80%', '100%'])

# Light horizontal grid plus a dashed reference line at 80% accuracy
ax.grid(axis='y', alpha=0.3)
ax.axhline(y=0.8, color='red', linestyle='--', linewidth=2, alpha=0.5)

# Write the exact percentage on top of every bar (bar i sits at x = i)
for i, acc in enumerate(accuracies):
    ax.text(i, acc, f'{acc*100:.1f}%', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Save models

In [None]:
# Save Models for Production Deployment
# We need to save: 1 role classifier + multiple role-specific models

import joblib
import pickle
import os

# Create the output folder if it is not there yet
os.makedirs('artifacts', exist_ok=True)

# Destination path -> object to persist, written in this order:
#   * the role classifier (predicts which job role best fits a resume)
#   * the dict of role-specific models (one select/reject model per role)
#   * the fitted transformers that turn structured data into numbers
# NOTE: the skills vectorizer holds a reference to skill_tokenizer; pickle
# stores functions by reference, so whatever process loads that file must
# define/import skill_tokenizer under the same name.
artifacts_to_save = {
    'artifacts/role_classifier.pkl': role_classifier,
    'artifacts/role_specific_models.pkl': role_specific_models,
    'artifacts/experience_scaler.pkl': experience_scaler,
    'artifacts/education_encoder.pkl': education_encoder,
    'artifacts/skills_vectorizer.pkl': skills_vectorizer,
}

for artifact_path, artifact in artifacts_to_save.items():
    joblib.dump(artifact, artifact_path)