In [6]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

# 1. Load the downloaded dataset
# The location is named once so it is easy to spot and change.
RESUME_CSV = "Resume/Resume.csv"
df = pd.read_csv(RESUME_CSV)


In [7]:
# Set random seeds for reproducibility
# Both the stdlib and the NumPy global generators receive the same seed.
SEED = 42
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)


In [8]:
# Print the frame summary: column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           2484 non-null   int64 
 1   Resume_str   2484 non-null   object
 2   Resume_html  2484 non-null   object
 3   Category     2484 non-null   object
dtypes: int64(1), object(3)
memory usage: 77.8+ KB


In [9]:
# Sanity check: report the number of rows and of distinct Category values.
print(f"Dataset loaded with {len(df)} entries and {df['Category'].nunique()} categories.")


Dataset loaded with 2484 entries and 24 categories.


In [10]:
# Rows per category, most frequent first (bracket access, as in the other cells).
df['Category'].value_counts()

Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
FINANCE                   118
ADVOCATE                  118
ACCOUNTANT                118
ENGINEERING               118
CHEF                      118
AVIATION                  117
FITNESS                   117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64

In [11]:
# 2. Define a comprehensive JD for each category
# Maps every value of the `Category` column to one hand-written job description
# (JD) template. The training-data cell looks up the JD of a resume's own
# category for the positive pair and the JDs of other categories for the
# negative pairs, so every category present in `df` needs an entry here.
# These strings are model input: editing the wording changes the features.
category_to_jd = {
    "INFORMATION-TECHNOLOGY": "Seeking an IT professional with expertise in software development, system administration, or network engineering. Proficiency in programming languages (Python, Java, C++), cloud platforms (AWS, Azure), and DevOps tools is required. Strong problem-solving skills and a Bachelor's degree in Computer Science or related field are essential.",
    "BUSINESS-DEVELOPMENT": "Looking for a Business Development manager to identify new growth opportunities and build strategic partnerships. Must have experience in market analysis, sales strategies, negotiation, and client relationship management. Excellent communication and analytical skills. MBA is a plus.",
    "FINANCE": "We need a Finance analyst with experience in financial modeling, investment analysis, and risk management. Proficiency in Excel, Bloomberg Terminal, and statistical software. Strong understanding of financial markets and regulations. CFA candidate or charterholder preferred.",
    "ADVOCATE": "Seeking a qualified Advocate with expertise in legal research, litigation, and client advisory. Must have strong knowledge of Indian law, excellent drafting skills, and courtroom experience. LL.B. degree and bar council membership are required.",
    "ACCOUNTANT": "Looking for a detail-oriented Accountant proficient in financial reporting, GST filing, TDS, and auditing. Experience with Tally or QuickBooks is essential. Strong analytical skills and knowledge of accounting standards. CA/CPA preferred.",
    "ENGINEERING": "We need an Engineer with expertise in design, project management, and technical problem-solving. Specialization in mechanical, civil, electrical, or electronics engineering. Proficiency in AutoCAD, SolidWorks, or other engineering software. Bachelor's degree in Engineering required.",
    "CHEF": "Seeking a creative and skilled Chef with experience in menu planning, food preparation, and kitchen management. Knowledge of various cuisines, food safety standards, and cost control. Culinary degree or equivalent experience required. Strong leadership and organizational skills.",
    "AVIATION": "Looking for an Aviation professional with background in flight operations, aircraft maintenance, or air traffic control. Relevant certifications (DGCA, FAA) are required. Strong technical knowledge and safety awareness. Degree in Aviation or related field.",
    "FITNESS": "We need a certified Fitness Trainer with expertise in personal training, nutrition guidance, and workout programming. Knowledge of anatomy and physiology. Excellent motivational and communication skills. Certification from ACE, NASM, or equivalent.",
    "SALES": "Seeking a results-driven Sales professional with proven track record in achieving targets. Experience in client acquisition, relationship management, and sales strategies. Excellent negotiation and communication skills. Industry-specific knowledge is a plus.",
    "BANKING": "Looking for a Banking professional with experience in retail banking, operations, or wealth management. Knowledge of banking products, KYC/AML norms, and financial services. Strong customer service and numerical skills. MBA Finance or related degree.",
    "HEALTHCARE": "We need a Healthcare professional (Doctor, Nurse, Technician) with clinical experience and patient care skills. Knowledge of medical terminology, procedures, and healthcare protocols. Relevant degree and certifications (MBBS, BSc Nursing) are required.",
    "CONSULTANT": "Seeking a Management Consultant with strong analytical and problem-solving abilities. Experience in business strategy, process improvement, and data analysis. Excellent presentation and client management skills. MBA from a top-tier institution is preferred.",
    "CONSTRUCTION": "Looking for a Construction Manager or Civil Engineer with project management experience. Knowledge of building codes, safety regulations, and construction methodologies. Proficiency in AutoCAD and MS Project. Degree in Civil Engineering or Construction Management.",
    "PUBLIC-RELATIONS": "We need a PR Executive with experience in media relations, crisis management, and brand communication. Strong writing skills and media network. Ability to develop and execute PR strategies. Degree in Communications, Journalism, or Marketing.",
    "HR": "Seeking an HR Professional with expertise in recruitment, employee relations, and performance management. Knowledge of labor laws and HR best practices. Strong interpersonal and communication skills. MBA in HR or related degree.",
    "DESIGNER": "Looking for a creative Designer with expertise in graphic design, UI/UX, or product design. Proficiency in Adobe Creative Suite and design tools. Strong portfolio showcasing design skills. Degree in Design or related field.",
    "ARTS": "We need an Arts professional with background in visual arts, performing arts, or art education. Creative skills and artistic portfolio. Experience in teaching, curation, or art production. Degree in Fine Arts or related field.",
    "TEACHER": "Seeking a qualified Teacher with experience in curriculum development and student instruction. Subject matter expertise and teaching methodology. Strong communication and classroom management skills. B.Ed. or equivalent teaching degree required.",
    "APPAREL": "Looking for a Fashion Designer or Merchandiser with knowledge of textile design, trend analysis, and production. Creative skills and portfolio. Experience in fashion industry. Degree in Fashion Design or Technology.",
    "DIGITAL-MEDIA": "We need a Digital Marketing Specialist with expertise in SEO, SEM, social media, and content marketing. Analytics skills and knowledge of digital tools. Experience with campaign management. Degree in Marketing or Digital Media.",
    "AGRICULTURE": "Seeking an Agriculture Specialist with knowledge of modern farming techniques, crop management, and agribusiness. Experience with sustainable practices and agricultural technology. Degree in Agriculture or related field.",
    "AUTOMOBILE": "Looking for an Automotive Engineer or Technician with expertise in vehicle design, diagnostics, or repair. Knowledge of automotive systems and technology. Relevant certifications. Degree in Automotive Engineering.",
    "BPO": "We need a BPO Professional with experience in customer service, technical support, or back-office operations. Excellent communication skills and process knowledge. Ability to work in shifts. Graduate with good English proficiency."
}


In [12]:

# 3. Create the training data with match scores
# Every resume yields one positive pair (JD of its own category, score 85-98)
# and two or three negative pairs (JDs of other categories, score 5-45).
print("Creating training examples with ATS scores...")

# A dedicated generator seeded in this cell keeps the cell idempotent:
# re-running it rebuilds exactly the same examples instead of continuing from
# wherever the module-level `random` state happened to be. Seeded with 42 it
# yields the same draws as a freshly seeded global `random.seed(42)`.
pair_rng = random.Random(42)

# Fail up front, with the offending names, if the data holds a category that
# has no JD template (otherwise this surfaces as a bare KeyError mid-loop).
missing_jds = sorted(set(df['Category']) - set(category_to_jd), key=str)
if missing_jds:
    raise KeyError(f"No job description defined for categories: {missing_jds}")

# Hoisted out of the loop; keeps the dict's insertion order so sampling is stable.
all_categories = list(category_to_jd)
training_data = []

for resume_text, true_category in zip(df['Resume_str'], df['Category']):
    # POSITIVE example (correct category match - high score)
    training_data.append({
        "resume_text": resume_text,
        "job_description_text": category_to_jd[true_category],
        "match_score": pair_rng.randint(85, 98),
    })

    # NEGATIVE examples (wrong category - low score): 2-3 other categories,
    # drawn without replacement.
    other_categories = [cat for cat in all_categories if cat != true_category]
    wrong_categories = pair_rng.sample(other_categories, k=pair_rng.randint(2, 3))

    for wrong_cat in wrong_categories:
        training_data.append({
            "resume_text": resume_text,
            "job_description_text": category_to_jd[wrong_cat],
            # Uniform noise in the low band: the draw does NOT depend on how
            # similar the wrong category is to the true one.
            "match_score": pair_rng.randint(5, 45),
        })









Creating training examples with ATS scores...


In [13]:
# Create DataFrame
# One row per (resume, JD, score) record collected in `training_data`.
training_df = pd.DataFrame(training_data)
print(f"Created training dataset with {len(training_df)} examples")
print(f"Score range: {training_df['match_score'].min()} to {training_df['match_score'].max()}")

# 4. Prepare data for model training
# The model sees a single text field: the JD followed by the resume.
jd_column = training_df['job_description_text']
resume_column = training_df['resume_text']
training_df['combined_text'] = jd_column + " " + resume_column
X, y = training_df['combined_text'], training_df['match_score']

Created training dataset with 8668 examples
Score range: 5 to 98


In [14]:
# 5. Train-test split
# Hold out 20% of the rows. Scores are cut into 5 equal-width bins and the
# split is stratified on those bins so both sides share the score distribution.
# NOTE(review): each resume contributes several rows, so the same resume text
# can appear in both train and test - consider a group-aware split if the
# metrics should reflect unseen resumes.
score_bins = pd.cut(y, bins=5)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=score_bins,
)

In [15]:
# 6. Vectorize text
print("Vectorizing text features...")

# TF-IDF configuration, gathered in one place.
tfidf_settings = dict(
    ngram_range=(1, 2),     # unigrams and bigrams
    stop_words='english',
    min_df=2,               # drop terms found in fewer than 2 documents
    max_df=0.85,            # drop terms found in more than 85% of documents
    max_features=8000,      # cap on vocabulary size
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vocabulary is learned from the training split only; the test split is
# merely projected onto it.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"Vectorized features: {X_train_tfidf.shape[1]}")

Vectorizing text features...
Vectorized features: 8000


In [17]:
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor

# 7. Train the Random Forest with a progress bar.
# warm_start=True lets repeated fit() calls ADD trees to the existing forest,
# which is what makes a progress bar possible. The forest is grown in batches
# rather than one tree per fit(): scikit-learn parallelises over the trees
# added in a single fit() call, so adding one tree at a time leaves
# n_jobs=-1 with nothing to parallelise.
print("Training Random Forest model with progress bar...")

n_estimators = 100      # final forest size
trees_per_fit = 10      # trees added per fit() call; lower = finer progress, less parallelism

model = RandomForestRegressor(
    n_estimators=1,           # placeholder; raised before every fit() below
    warm_start=True,          # Allow incremental training
    max_depth=25,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

# Grow the forest batch by batch until it reaches n_estimators.
# NOTE(review): with an integer random_state the resulting trees are expected
# to match a one-tree-at-a-time run - confirm on the installed scikit-learn.
trees_built = 0
with tqdm(total=n_estimators, desc="Training Trees") as progress:
    while trees_built < n_estimators:
        target = min(trees_built + trees_per_fit, n_estimators)
        model.set_params(n_estimators=target)
        model.fit(X_train_tfidf, y_train)
        progress.update(target - trees_built)
        trees_built = target


Training Random Forest model with progress bar...


Training Trees: 100%|██████████| 100/100 [11:38<00:00,  6.99s/it]


In [18]:
# 8. Evaluate model
def _rmse(actual, predicted):
    """Return the root mean squared error between actual and predicted scores."""
    return np.sqrt(mean_squared_error(actual, predicted))


# Predictions on the data the model was fitted on and on the held-out split.
train_pred, test_pred = (
    model.predict(features) for features in (X_train_tfidf, X_test_tfidf)
)

train_mae, test_mae = (
    mean_absolute_error(y_train, train_pred),
    mean_absolute_error(y_test, test_pred),
)
train_rmse, test_rmse = _rmse(y_train, train_pred), _rmse(y_test, test_pred)

print("\n=== Model Evaluation ===")
print(f"Train MAE: {train_mae:.2f}")
print(f"Test MAE:  {test_mae:.2f}")
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Test RMSE:  {test_rmse:.2f}")



=== Model Evaluation ===
Train MAE: 12.99
Test MAE:  21.74
Train RMSE: 15.93
Test RMSE:  26.56


In [19]:
# 9. Save the model and vectorizer
# Each object is written to its own file in the current working directory.
artifacts = (
    (model, 'resume_ats_scorer_model.pkl'),
    (vectorizer, 'tfidf_vectorizer.pkl'),
    (category_to_jd, 'category_jd_mapping.pkl'),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)

print(
    "\nModel and artifacts saved successfully!",
    "Files created:",
    "- resume_ats_scorer_model.pkl (Trained model)",
    "- tfidf_vectorizer.pkl (Fitted vectorizer)",
    "- category_jd_mapping.pkl (JD templates)",
    sep="\n",
)


Model and artifacts saved successfully!
Files created:
- resume_ats_scorer_model.pkl (Trained model)
- tfidf_vectorizer.pkl (Fitted vectorizer)
- category_jd_mapping.pkl (JD templates)


In [20]:
# 10. Test with a sample prediction
def predict_ats_score(resume_text, job_description, *, scorer=None, text_vectorizer=None):
    """Predict the ATS match score for a resume against a job description.

    The two texts are joined as "<job_description> <resume_text>" (the same
    layout used to build the training features), vectorized, scored by the
    regressor, rounded and clamped to the 0-100 range.

    Args:
        resume_text: Plain-text resume.
        job_description: Plain-text job description.
        scorer: Optional object with ``predict(features)``; defaults to the
            notebook-level ``model``. Lets callers score with a reloaded
            artifact without rebinding globals.
        text_vectorizer: Optional object with ``transform(list_of_str)``;
            defaults to the notebook-level ``vectorizer``.

    Returns:
        int: Score between 0 and 100 inclusive.

    Raises:
        TypeError: If either text argument is not a ``str``.
    """
    # Reject None/NaN/bytes up front instead of failing inside the string
    # concatenation with a less helpful message.
    for label, value in (("resume_text", resume_text), ("job_description", job_description)):
        if not isinstance(value, str):
            raise TypeError(f"{label} must be a str, got {type(value).__name__}")

    # Fall back to the notebook globals only when no override was supplied.
    active_vectorizer = vectorizer if text_vectorizer is None else text_vectorizer
    active_scorer = model if scorer is None else scorer

    combined_text = job_description + " " + resume_text
    features = active_vectorizer.transform([combined_text])
    raw_score = float(active_scorer.predict(features)[0])
    return int(max(0, min(100, round(raw_score))))



In [23]:
# Test with a sample
# Take the IT resume at position 50 and score it against the IT job description.
it_rows = df['Category'] == 'INFORMATION-TECHNOLOGY'
sample_resume = df.loc[it_rows, 'Resume_str'].iloc[50]
sample_jd = category_to_jd['INFORMATION-TECHNOLOGY']

print(f"\nSample prediction: {predict_ats_score(sample_resume, sample_jd)}/100")


Sample prediction: 78/100
