In [14]:
import pandas as pd
import numpy as np
import re

In [16]:
df = pd.read_parquet("train-00000-of-00001.parquet")
df = df.dropna(subset=['job_title', 'job_skill_set', 'job_description'])
print(df.head())

       job_id category                                 job_title  \
0  3902668440       HR              Sr Human Resource Generalist   
1  3905823748       HR                   Human Resources Manager   
2  3905854799       HR               Director of Human Resources   
3  3905834061       HR             Chief Human Resources Officer   
4  3906250451       HR  Human Resources Generalist (Hybrid Role)   

                                     job_description  \
0  SUMMARY\nTHE SR. HR GENERALIST PROVIDES HR EXP...   
1  BE PART OF A STELLAR TEAM AT YSB AS THE MANAGE...   
2  OUR CLIENT IS A THRIVING ORGANIZATION OFFERING...   
3  JOB TITLE: CHIEF HUMAN RESOURCES OFFICER (CHRO...   
4  DESCRIPTION\n\n WHO WE ARE \n\nAVI-SPL IS A DI...   

                                       job_skill_set  
0  ['employee relations', 'talent acquisition', '...  
1  ['Talent Acquisition', 'Employee Performance M...  
2  ['Human Resources Management', 'Recruitment', ...  
3  ['talent management', 'organiza

In [18]:
job_counts = df['job_title'].value_counts()
threshold = 10
titles_to_keep = job_counts[job_counts >= threshold].index

df_filtered = df[df['job_title'].isin(titles_to_keep)].copy()

print(df_filtered.head())

        job_id category                   job_title  \
1   3905823748       HR     Human Resources Manager   
5   3901389277       HR     Human Resources Manager   
6   3902348043       HR  Human Resources Generalist   
8   3891070825       HR  Human Resources Generalist   
12  3894573937       HR  Human Resources Generalist   

                                      job_description  \
1   BE PART OF A STELLAR TEAM AT YSB AS THE MANAGE...   
5   JOB DESCRIPTION: Â· THE HR MANAGER WILL SUPPORT...   
6   DRIVE YOUR FUTURE WITH TURN 14 DISTRIBUTION! N...   
8   DIRECT-HIRE, $65,000 SALARY\nTHE IDEAL CANDIDA...   
12  COMPANY INFORMATION\n\nFOR MORE THAN 20 YEARS,...   

                                        job_skill_set  
1   ['Talent Acquisition', 'Employee Performance M...  
5   ['HR management', 'talent acquisition', 'labor...  
6   ['Microsoft Office', 'communication', 'attenti...  
8   ['recruitment', 'payroll administration', 'com...  
12  ['HRIS systems', 'Microsoft Office', 'dat

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [28]:
def clean(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = " ".join(text.split())
    return text

df_filtered['job_title'] = df_filtered['job_title'].str.lower()
df_filtered['clean_skills'] = df_filtered['job_skill_set'].apply(clean)
df_filtered['clean_desc'] = df_filtered['job_description'].apply(clean)

df_filtered['combined_features'] = df_filtered['clean_skills'] + " " + df_filtered['clean_desc']
df_final = df_filtered[['combined_features', 'job_title']].copy()
df_final.columns = ['skills', 'job_title']

tfidf = TfidfVectorizer(max_features=2000, stop_words='english', ngram_range=(1, 2))

print(df_final.head())

                                               skills  \
1   talent acquisition employee performance manage...   
5   hr management talent acquisition labor complia...   
6   microsoft office communication attention to de...   
8   recruitment payroll administration compliance ...   
12  hris systems microsoft office data analysis pr...   

                     job_title  
1      human resources manager  
5      human resources manager  
6   human resources generalist  
8   human resources generalist  
12  human resources generalist  


In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X = tfidf.fit_transform(df_final['skills'])
y = df_final['job_title']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)
nb_acc = accuracy_score(y_test, nb_pred)

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)

rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

print("--- Model Accuracy Results ---")
print(f"Naive Bayes Accuracy: {nb_acc * 100:.2f}%")
print(f"Logistic Regression Accuracy: {lr_acc * 100:.2f}%")
print(f"Random Forest Accuracy: {rf_acc * 100:.2f}%")

--- Model Accuracy Results ---
Naive Bayes Accuracy: 42.31%
Logistic Regression Accuracy: 57.69%
Random Forest Accuracy: 65.38%


In [30]:
def recommend_job(input_skills):
    cleaned_input = clean(input_skills)
    input_vector = tfidf.transform([cleaned_input])
    
    prediction = rf_model.predict(input_vector)
    probabilities = rf_model.predict_proba(input_vector)
    confidence = np.max(probabilities) * 100
    
    return prediction[0], confidence

In [32]:
print("Welcome to the AI Career Assistant!")
print("Enter 'quit' to exit.")

while True:
    user_input = input("\nPlease enter your skills (separated by commas): ")
    
    if user_input == 'quit':
        print("Closing application. Good luck with your job search!")
        break
    
    job, score = recommend_job(user_input)
    
    print("-" * 30)
    print(f"AI ANALYSIS:")
    print(f"Based on your skills, you are a great fit for: {job.upper()}")
    print(f"Match Confidence: {score:.2f}%")
    print("-" * 30)

Welcome to the AI Career Assistant!
Enter 'quit' to exit.



Please enter your skills (separated by commas):  quit


Closing application. Good luck with your job search!
