<a href="https://colab.research.google.com/github/lavanyasaxena01/gen-ai-career-guidance/blob/main/gen_ai_career_guidance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [None]:
# Read the job postings and normalise the free-text skills column to lower
# case in the same step, so later text matching is case-insensitive.
df = pd.read_csv("/content/ai_job_dataset.csv").assign(
    required_skills=lambda frame: frame['required_skills'].str.lower()
)

# Discard every row that has a missing value in any column.
df.dropna(inplace=True)

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
0,AI00001,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,"tableau, pytorch, kubernetes, linux, nlp",Bachelor,9,Automotive,2024-10-18,2024-11-07,1076,5.9,Smart Analytics
1,AI00002,AI Software Engineer,61895,USD,EN,CT,Canada,M,Ireland,100,"deep learning, aws, mathematics, python, docker",Master,1,Media,2024-11-20,2025-01-11,1268,5.2,TechCorp Inc
2,AI00003,AI Specialist,152626,USD,MI,FL,Switzerland,L,South Korea,0,"kubernetes, deep learning, java, hadoop, nlp",Associate,2,Education,2025-03-18,2025-04-07,1974,9.4,Autonomous Tech
3,AI00004,NLP Engineer,80215,USD,SE,FL,India,M,India,50,"scala, sql, linux, python",PhD,7,Consulting,2024-12-23,2025-02-24,1345,8.6,Future Systems
4,AI00005,AI Consultant,54624,EUR,EN,PT,France,S,Singapore,100,"mlops, java, tableau, python",Master,0,Media,2025-04-15,2025-06-23,1989,6.6,Advanced Robotics


In [None]:
# Use one encoder per column. Refitting a single LabelEncoder on the second
# column overwrites its `classes_`, so the experience-level mapping could no
# longer be inspected or inverted afterwards.
# NOTE(review): LabelEncoder assigns codes by sorted label order, which is not
# necessarily the seniority/education order — confirm this is acceptable.
experience_encoder = LabelEncoder()
education_encoder = LabelEncoder()

df['experience_level_encoded'] = experience_encoder.fit_transform(df['experience_level'])
df['education_encoded'] = education_encoder.fit_transform(df['education_required'])

# Backward-compatible alias: `le` previously ended up fitted on
# `education_required`, so keep it pointing at that encoder.
le = education_encoder

In [None]:
# Learn a TF-IDF vocabulary capped at 50 terms from the skills text, then
# encode every posting with that fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=50).fit(df['required_skills'])
skill_matrix = vectorizer.transform(df['required_skills'])

In [None]:
# Numeric predictors that accompany the sparse TF-IDF skill features.
structured_features = df.loc[
    :, ['experience_level_encoded', 'education_encoded', 'years_experience']
].to_numpy()

# Regression target.
y = df['salary_usd']

# Column-wise stack: TF-IDF columns first, then the three structured columns.
X_combined = hstack([skill_matrix, structured_features])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    random_state=42,
    test_size=0.2,
)

# Random forest with capped depth and a minimum split size; `fit` returns the
# estimator itself, so construction and training are chained.
model = RandomForestRegressor(
    random_state=42,
    min_samples_split=5,
    max_depth=10,
    n_estimators=150,
).fit(X_train, y_train)

# `score` is R^2 for scikit-learn regressors; report it on both partitions.
print("Train Score:", model.score(X_train, y_train))
print("Test Score:", model.score(X_test, y_test))

Train Score: 0.6563834850184962
Test Score: 0.6045928803932713


In [None]:
def recommend_careers(student_skills, top_n=5):
    """Rank job postings for a student by a hybrid career score.

    The score blends three signals computed over the module-level ``df``:
    0.6 * TF-IDF cosine similarity between ``student_skills`` and each
    posting's skills, 0.3 * min-max normalised ``salary_usd``, and
    0.1 * the relative frequency of the posting's ``job_title``.

    Args:
        student_skills: Skills text (e.g. "python, machine learning").
            ``None`` or an empty string is treated as "no skills", in which
            case the ranking is driven by salary and demand only.
        top_n: Number of top-scoring rows to return (default 5).

    Returns:
        A new DataFrame holding the ``top_n`` highest-scoring rows of ``df``
        with the added columns ``similarity_score``, ``salary_normalized``,
        ``demand_score`` and ``career_score``, sorted by ``career_score``
        in descending order. The global ``df`` is left unmodified.
    """
    student_vector = vectorizer.transform([(student_skills or "").lower()])
    similarity = cosine_similarity(student_vector, skill_matrix)[0]

    # Min-max normalise salary; if every salary is identical the range is zero
    # and the division would yield NaN for all rows, so fall back to 0.0.
    salary = df['salary_usd']
    salary_offset = salary - salary.min()
    salary_range = salary.max() - salary.min()
    if salary_range > 0:
        salary_normalized = salary_offset / salary_range
    else:
        salary_normalized = salary_offset * 0.0

    # Demand score: share of postings that carry the same job title.
    demand = df['job_title'].value_counts(normalize=True)

    # `assign` returns a copy, so repeated calls never pollute the shared frame.
    scored = df.assign(
        similarity_score=similarity,
        salary_normalized=salary_normalized,
        demand_score=df['job_title'].map(demand),
    )

    # Hybrid score
    scored['career_score'] = (
        0.6 * scored['similarity_score'] +
        0.3 * scored['salary_normalized'] +
        0.1 * scored['demand_score']
    )

    return scored.sort_values(by='career_score', ascending=False).head(top_n)

In [None]:
def _split_skills(skills):
    """Parse a comma-separated skills string into a set of lower-case names."""
    # Split on bare commas and strip each piece so "a,b", "a, b" and "a ,  b"
    # all parse the same way; empty pieces (e.g. from "" or "a,,b") are dropped.
    return {part.strip() for part in (skills or "").lower().split(",") if part.strip()}


def skill_gap(student_skills, career_skills):
    """Return the skills a career requires that the student does not list.

    Args:
        student_skills: Comma-separated skills the student has.
        career_skills: Comma-separated skills the career requires.

    Returns:
        Alphabetically sorted list of lower-case skill names that appear in
        ``career_skills`` but not in ``student_skills``. Matching is
        case-insensitive and ignores whitespace around commas.
    """
    student_set = _split_skills(student_skills)
    career_set = _split_skills(career_skills)
    # Sorted so the result is deterministic across runs (set order is not).
    return sorted(career_set - student_set)

In [None]:
# Example query: a student who lists two skills.
student_input = "python, machine learning"
top_careers = recommend_careers(student_input)

# Show only the columns relevant to the ranking.
top_careers.loc[:, ['job_title', 'salary_usd', 'career_score']]

Unnamed: 0,job_title,salary_usd,career_score
1144,AI Specialist,370038,0.636015
11938,Robotics Engineer,346789,0.625197
2464,ML Ops Engineer,295104,0.624085
9637,AI Software Engineer,379418,0.618777
8959,Principal Data Scientist,291570,0.617608


In [None]:
%%writefile app.py
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv("ai_job_dataset.csv")
df['required_skills'] = df['required_skills'].str.lower()
df.dropna(inplace=True)

vectorizer = TfidfVectorizer(max_features=50)
skill_matrix = vectorizer.fit_transform(df['required_skills'])

st.title("ðŸš€ CareerSense AI")

student_skills = st.text_input("Enter your skills (comma separated)")

if st.button("Find Career Recommendations"):

    student_vector = vectorizer.transform([student_skills.lower()])
    similarity = cosine_similarity(student_vector, skill_matrix)

    df['similarity_score'] = similarity[0]

    results = df.sort_values(by='similarity_score', ascending=False).head(5)

    st.dataframe(results[['job_title', 'salary_usd']])

Overwriting app.py


In [None]:
!pip install streamlit




In [None]:
# Install Node.js and npm, which the localtunnel CLI install needs.
# -qq plus the stdout redirect keeps the very long package log out of the
# notebook; errors are still shown because stderr is not redirected.
!apt-get install -y -qq nodejs npm > /dev/null

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  gyp javascript-common libc-ares2 libjs-events libjs-highlight.js
  libjs-inherits libjs-is-typedarray libjs-psl libjs-source-map
  libjs-sprintf-js libjs-typedarray-to-buffer libnode-dev libnode72
  libnotify-bin libnotify4 libuv1-dev node-abab node-abbrev node-agent-base
  node-ansi-regex node-ansi-styles node-ansistyles node-aproba node-archy
  node-are-we-there-yet node-argparse node-arrify node-asap node-asynckit
  node-balanced-match node-brace-expansion node-builtins node-cacache
  node-chalk node-chownr node-clean-yaml-object node-cli-table node-clone
  node-color-convert node-color-name node-colors node-columnify
  node-combined-stream node-commander node-console-control-strings
  node-copy-concurrently node-core-util-is node-coveralls node-cssom
  node-cssstyle node-debug node-decompress-response node-defaults
  node-delayed-st

In [None]:
# Install the localtunnel CLI globally; the launch cell invokes it as `lt`
# to expose the local Streamlit port through a public URL.
!npm install -g localtunnel

[K[?25h
added 22 packages, and audited 23 packages in 3s

3 packages are looking for funding
  run `npm fund` for details

1 [31m[1mhigh[22m[39m severity vulnerability

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.


In [None]:
# Start Streamlit in the background and open a localtunnel to it.
# The Streamlit port is pinned explicitly so it always matches the port the
# tunnel forwards, rather than relying on Streamlit's default.
!streamlit run app.py --server.port 8501 & lt --port 8501

your url is: https://clever-moose-punch.loca.lt

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.185.49.97:8501[0m
[0m
[34m  Stopping...[0m
^C


In [None]:
# Print this machine's public IPv4 address.
# NOTE(review): presumably this is the value the localtunnel landing page asks
# for before showing the app — confirm, and avoid committing the output.
!curl ipv4.icanhazip.com

35.185.49.97
