# Resume Screening System - Exploratory Data Analysis

This notebook explores a synthetically generated dataset of resume–job description pairs, analyzing text lengths, extracted skills, vocabulary, and how these differ between matched and unmatched pairs.

## 1. Import Required Libraries

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

from resume_screening.preprocessor import TextPreprocessor
from resume_screening.data_loader import DataLoader, SyntheticDataGenerator, JobScraper

# Shared look for every figure in this notebook: seaborn grid theme first,
# then the default figure size (set afterwards so it is the value in effect).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## 2. Generate Synthetic Dataset for Analysis

In [None]:
# Ask the project's generator for 50 pairs; it hands back three parallel
# sequences (resume texts, job texts, labels) that are unpacked here.
resumes, jobs, labels = SyntheticDataGenerator.generate_matched_pairs(n_pairs=50)

# One row per pair. Keyword order fixes the column order:
# resume, job_description, label.
df = pd.DataFrame(dict(resume=resumes, job_description=jobs, label=labels))

print(f"Dataset shape: {df.shape}")
print("\nLabel distribution:")
label_counts = df['label'].value_counts()
print(label_counts)
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders it as a table.
df.head()

## 3. Analyze Text Characteristics

In [None]:
# Per-document size features, added to df because later cells read them.
# Lengths are created before word counts so the column order stays
# resume_length, job_length, resume_words, job_words.
text_columns = {'resume': 'resume', 'job': 'job_description'}
for prefix, column in text_columns.items():
    # Character count of the raw text.
    df[f'{prefix}_length'] = df[column].str.len()
for prefix, column in text_columns.items():
    # Whitespace-delimited word count of the raw text.
    df[f'{prefix}_words'] = df[column].str.split().str.len()

print("Resume Statistics:")
print(df['resume_length'].describe())
print("\nJob Description Statistics:")
print(df['job_length'].describe())

# Side-by-side histograms of word counts; each panel is described by
# (column, fill color, title) and drawn by the same code.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
histogram_specs = [
    ('resume_words', 'skyblue', 'Resume Length Distribution'),
    ('job_words', 'lightcoral', 'Job Description Length Distribution'),
]
for panel, (column, fill, title) in zip(axes, histogram_specs):
    panel.hist(df[column], bins=30, color=fill, edgecolor='black')
    panel.set_xlabel('Number of Words')
    panel.set_ylabel('Frequency')
    panel.set_title(title)

plt.tight_layout()
plt.show()

mean_resume_words = df['resume_words'].mean()
mean_job_words = df['job_words'].mean()
print(f"\nAverage resume words: {mean_resume_words:.2f}")
print(f"Average job description words: {mean_job_words:.2f}")

## 4. Preprocess and Tokenize Samples

In [None]:
# Preprocessor configured with stop-word removal and lemmatization switched on;
# later cells reuse this same instance.
preprocessor = TextPreprocessor(remove_stopwords=True, use_lemmatization=True)

# Use the first pair in the dataset as the worked example.
sample_resume = df['resume'].iat[0]
sample_job = df['job_description'].iat[0]

# Show the first 200 characters of the raw text, then the first 15 tokens
# returned by the preprocessor.
print("Sample Resume (Original):")
print(f"{sample_resume[:200]}...")

resume_tokens = preprocessor.process(sample_resume)
resume_preview = resume_tokens[:15]
print(f"\nTokens ({len(resume_tokens)}): {resume_preview}...")

separator = "=" * 50
print(f"\n{separator}")
print("\nSample Job Description (Original):")
print(f"{sample_job[:200]}...")

job_tokens = preprocessor.process(sample_job)
job_preview = job_tokens[:15]
print(f"\nTokens ({len(job_tokens)}): {job_preview}...")

## 5. Extract Skills and Key Information

In [None]:
# Run the project's skill extractor over every resume and job description.
# The extractor is passed directly; no wrapper lambda is needed.
df['skills_resume'] = df['resume'].apply(TextPreprocessor.extract_skills)
df['skills_job'] = df['job_description'].apply(TextPreprocessor.extract_skills)

print("Skills Extracted from Sample Resume:")
print(df['skills_resume'].iloc[0])

print("\nSkills Required in Sample Job:")
print(df['skills_job'].iloc[0])

# Flatten the per-document skill collections into one long list per corpus.
all_resume_skills = [skill for found in df['skills_resume'] for skill in found]
all_job_skills = [skill for found in df['skills_job'] for skill in found]

# Tally how often each skill occurs across the whole corpus.
resume_skill_counts = Counter(all_resume_skills)
job_skill_counts = Counter(all_job_skills)

# Same report for both corpora: heading, then the ten most common skills.
skill_reports = (
    ("\nTop 10 Skills in Resumes:", resume_skill_counts),
    ("\nTop 10 Skills in Job Descriptions:", job_skill_counts),
)
for heading, tally in skill_reports:
    print(heading)
    for skill, count in tally.most_common(10):
        print(f"  {skill}: {count}")

## 6. Vocabulary Analysis

In [None]:
# Tokenize every document with the shared preprocessor: one token list per document.
resume_corpus = [preprocessor.process(text) for text in df['resume']]
job_corpus = [preprocessor.process(text) for text in df['job_description']]

# Flatten to corpus-level token streams (later cells count word frequencies from these).
all_resume_tokens = [tok for doc in resume_corpus for tok in doc]
all_job_tokens = [tok for doc in job_corpus for tok in doc]

# Distinct tokens per corpus.
resume_vocab = set(all_resume_tokens)
job_vocab = set(all_job_tokens)

# "Vocabulary richness" below is unique tokens divided by total tokens.
resume_richness = len(resume_vocab) / len(all_resume_tokens)
job_richness = len(job_vocab) / len(all_job_tokens)

print("Resume Corpus:")
print(f"  Total tokens: {len(all_resume_tokens)}")
print(f"  Unique tokens: {len(resume_vocab)}")
print(f"  Vocabulary richness: {resume_richness:.4f}")

print("\nJob Description Corpus:")
print(f"  Total tokens: {len(all_job_tokens)}")
print(f"  Unique tokens: {len(job_vocab)}")
print(f"  Vocabulary richness: {job_richness:.4f}")

# "Overlap" is shared tokens divided by the union of both vocabularies.
shared_vocab = resume_vocab & job_vocab
combined_vocab = resume_vocab | job_vocab
print(f"\nCommon tokens: {len(shared_vocab)}")
print(f"Overlap: {len(shared_vocab) / len(combined_vocab):.4f}")

## 7. Most Frequent Words

In [None]:
# Token frequency tables for each corpus.
resume_word_freq = Counter(all_resume_tokens)
job_word_freq = Counter(all_job_tokens)

# The 15 most common (token, count) pairs per corpus.
top_resume_words = resume_word_freq.most_common(15)
top_job_words = job_word_freq.most_common(15)

# One horizontal bar chart per corpus, drawn by the same code from a
# (axes, ranked pairs, bar color, title) description.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
bar_panels = (
    (axes[0], top_resume_words, 'skyblue', 'Top 15 Words in Resumes'),
    (axes[1], top_job_words, 'lightcoral', 'Top 15 Words in Job Descriptions'),
)
for panel, ranked, bar_color, heading in bar_panels:
    # Split [(word, count), ...] into parallel tuples for plotting.
    words, freqs = zip(*ranked)
    panel.barh(words, freqs, color=bar_color)
    panel.set_xlabel('Frequency')
    panel.set_title(heading)
    # Flip the y-axis so the most frequent word is at the top.
    panel.invert_yaxis()

plt.tight_layout()
plt.show()

## 8. Label-based Analysis

In [None]:
# Compare matched vs non-matched pairs.
# .copy() gives each subset its own data, so adding the 'skill_overlap' column
# below is a plain column assignment on an independent frame rather than a
# write onto a filtered slice of `df` (the chained-assignment pattern pandas
# warns about; this notebook silences warnings globally, so it would go unseen).
matched = df[df['label'] == 1].copy()
unmatched = df[df['label'] == 0].copy()

print(f"Matched Pairs (label=1): {len(matched)}")
print(f"  Average resume length: {matched['resume_words'].mean():.2f} words")
print(f"  Average job length: {matched['job_words'].mean():.2f} words")

print(f"\nUnmatched Pairs (label=0): {len(unmatched)}")
print(f"  Average resume length: {unmatched['resume_words'].mean():.2f} words")
print(f"  Average job length: {unmatched['job_words'].mean():.2f} words")


def count_shared_skills(pairs):
    """Count, for each row, the distinct skills common to resume and job.

    Args:
        pairs: DataFrame with 'skills_resume' and 'skills_job' columns, each
            cell holding an iterable of skills.

    Returns:
        A list with one integer per row of `pairs`, in row order. An empty
        frame yields an empty list, whereas a row-wise ``apply(axis=1)`` on an
        empty frame returns a DataFrame instead of a Series.
    """
    return [
        len(set(resume_skills) & set(job_skills))
        for resume_skills, job_skills in zip(pairs['skills_resume'], pairs['skills_job'])
    ]


# Skills overlap: one shared helper instead of two copy-pasted lambdas.
matched['skill_overlap'] = count_shared_skills(matched)
unmatched['skill_overlap'] = count_shared_skills(unmatched)

print(f"\nMatched - Average skill overlap: {matched['skill_overlap'].mean():.2f}")
print(f"Unmatched - Average skill overlap: {unmatched['skill_overlap'].mean():.2f}")

# Visualize
fig, ax = plt.subplots(figsize=(10, 6))
# Plain arrays are passed so the subsets' non-contiguous row labels play no part in plotting.
data_to_plot = [matched['skill_overlap'].to_numpy(), unmatched['skill_overlap'].to_numpy()]
ax.boxplot(data_to_plot)
# Tick labels are set on the axes rather than through boxplot's `labels`
# keyword, which Matplotlib 3.9 renamed to `tick_labels`; this form works
# with either spelling of the keyword.
ax.set_xticklabels(['Matched', 'Unmatched'])
ax.set_ylabel('Skill Overlap Count')
ax.set_title('Skill Overlap: Matched vs Unmatched Pairs')
plt.show()

## 9. Data Quality Assessment

In [None]:
# Null count per column.
print("Missing Values:")
print(df.isnull().sum())

# Exact-duplicate texts (every repeat after the first occurrence is counted).
duplicate_resumes = df['resume'].duplicated().sum()
duplicate_jobs = df['job_description'].duplicated().sum()
print(f"\nDuplicate resumes: {duplicate_resumes}")
print(f"Duplicate job descriptions: {duplicate_jobs}")

# Flag suspiciously short (<10 words) and long (>500 words) texts.
# Each check is a heading plus the boolean masks for resumes and jobs.
length_checks = (
    ("\nTexts with very short length (<10 words):",
     df['resume_words'] < 10, df['job_words'] < 10),
    ("\nTexts with very long length (>500 words):",
     df['resume_words'] > 500, df['job_words'] > 500),
)
for heading, resume_mask, job_mask in length_checks:
    print(heading)
    print(f"  Resumes: {resume_mask.sum()}")
    print(f"  Jobs: {job_mask.sum()}")

## 10. Summary and Next Steps

In [None]:
# Report text with named placeholders; the figures are filled in below.
summary_template = """
EXPLORATORY DATA ANALYSIS SUMMARY
==================================

Dataset Overview:
- Total samples: {total} pairs
- Matched: {matched} ({matched_pct:.1f}%)
- Unmatched: {unmatched} ({unmatched_pct:.1f}%)

Text Characteristics:
- Average resume length: {resume_words:.0f} words
- Average job length: {job_words:.0f} words
- Common vocabulary: {overlap_pct:.1f}% overlap

Key Findings:
- Skills are strong indicators of resume-job matching
- Longer texts tend to have more diverse vocabulary
- Technical terms appear frequently in both corpus

Next Steps:
1. Move to Notebook 02: Preprocessing & Feature Engineering
2. Implement TF-IDF, Word2Vec, BERT embeddings
3. Create similarity scoring baseline
4. Train classification models for ranking
"""

# Headline numbers, all derived from df and the vocabularies built earlier.
total_pairs = len(df)
matched_count = (df['label'] == 1).sum()
unmatched_count = (df['label'] == 0).sum()
shared_vocab = resume_vocab & job_vocab
combined_vocab = resume_vocab | job_vocab

print(summary_template.format(
    total=total_pairs,
    matched=matched_count,
    matched_pct=matched_count / total_pairs * 100,
    unmatched=unmatched_count,
    unmatched_pct=unmatched_count / total_pairs * 100,
    resume_words=df['resume_words'].mean(),
    job_words=df['job_words'].mean(),
    # Shared tokens as a share of the union of both vocabularies.
    overlap_pct=len(shared_vocab) / len(combined_vocab) * 100,
))