In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

# Survey columns to load; this list is handed to `load_and_preprocess_data`,
# which forwards it to `pd.read_csv(..., usecols=...)`.
columns_finalized = [
    'ResponseId', 'Age', 'Employment', 'DevType', 'AISelect', 'EdLevel',
    'LearnCode', 'YearsCode', 'LanguageHaveWorkedWith', 'Country',
    'OpSysPersonal use', 'NEWCollabToolsHaveWorkedWith', 'LanguageWantToWorkWith',
    'OfficeStackSyncHaveWorkedWith', 'RemoteWork', 'ToolsTechHaveWorkedWith',
    'DatabaseHaveWorkedWith', 'LearnCodeOnline', 'OrgSize', 'ToolsTechWantToWorkWith',
    'BuyNewTool', 'WebframeHaveWorkedWith', 'AIThreat', 'ToolsTechAdmired',
    'BuildvsBuy', 'DatabaseWantToWorkWith', 'PlatformHaveWorkedWith', 'AIEthics',
    'TechDoc', 'YearsCodePro', 'MiscTechHaveWorkedWith', 'DatabaseAdmired',
    'WebframeWantToWorkWith', 'WebframeAdmired', 'ConvertedCompYearly',
    'MiscTechWantToWorkWith', 'WorkExp', 'MiscTechAdmired', 'JobSat',
    'TimeSearching', 'Industry', 'TimeAnswering', 'ProfessionalQuestion',
    'ProfessionalCloud', 'Frustration', 'ProfessionalTech',
]

def load_and_preprocess_data(file_path, columns):
    """Read the selected survey columns and keep rows with a usable DevType.

    Missing values in every loaded column are replaced by the string
    "empty". Rows whose DevType is that placeholder, or the literal
    'Other (please specify):', are then discarded.

    Args:
        file_path: Anything `pd.read_csv` accepts as its first argument.
        columns: Column names to load (forwarded as `usecols`).

    Returns:
        The filtered DataFrame with a fresh 0..n-1 index.
    """
    devtype_filter = "DevType != 'empty' and DevType != 'Other (please specify):'"

    survey = pd.read_csv(file_path, usecols=columns)
    survey = survey.fillna("empty")
    survey = survey.query(devtype_filter)
    return survey.reset_index(drop=True)

def combine_skills(df, columns, new_column_name):
    """Merge several ';'-separated skill columns into one list-valued column.

    For every row the values of `columns` are joined with ';' (missing values
    count as empty strings) and split back into individual tokens. Empty
    tokens are dropped, so a row with no answers yields `[]` rather than a
    list of empty strings.

    Note: `df` is modified in place (the new column is added to it) and the
    same object is returned.

    Args:
        df: DataFrame containing `columns`; the values are expected to be
            strings or missing.
        columns: Names of the columns to merge, in the order their tokens
            should appear in the combined list.
        new_column_name: Name of the list-valued column to create.

    Returns:
        `df`, with `new_column_name` added.
    """
    joined = df[columns].fillna('').agg(';'.join, axis=1)
    # Joining several empty cells gives ';' / ';;' ..., which is truthy, so a
    # plain `if joined` check is not enough: filter empty tokens explicitly.
    df[new_column_name] = joined.apply(
        lambda cell: [token for token in cell.split(';') if token]
    )
    return df

def create_job_profiles(df, top_n=10):
    """Summarise each DevType by its most frequently reported skills.

    For every DevType the ';'-separated answers of all its respondents are
    pooled per skill category, counted, and the `top_n` most frequent skills
    are kept (ties keep first-seen order). The placeholder 'empty', missing
    values and empty tokens are ignored, so a category nobody answered yields
    an empty list.

    Args:
        df: DataFrame with a 'DevType' column and the six
            '*HaveWorkedWith' skill columns named in `column_mapping` below.
        top_n: Number of skills to keep per category (default 10).

    Returns:
        DataFrame with one row per DevType and the columns, in this order:
        DevType, TopLanguages, TopTools, TopWebFrameworks, TopDatabases,
        TopPlatforms, TopMiscTechs. Each Top* cell is a list of skill names.
    """
    # Output label -> source survey column. The insertion order defines the
    # order of the Top* columns, which callers unpack positionally.
    column_mapping = {
        'Languages': 'LanguageHaveWorkedWith',
        'Tools': 'ToolsTechHaveWorkedWith',
        'WebFrameworks': 'WebframeHaveWorkedWith',
        'Databases': 'DatabaseHaveWorkedWith',
        'Platforms': 'PlatformHaveWorkedWith',
        'MiscTechs': 'MiscTechHaveWorkedWith'
    }

    def join_responses(responses):
        # Pool all real answers of one DevType into a single ';'-separated string.
        return ';'.join(str(r) for r in responses if pd.notna(r) and r != 'empty')

    def top_skills(skill_str):
        # ''.split(';') is [''], so empty tokens must be filtered before counting.
        counts = Counter(skill for skill in skill_str.split(';') if skill)
        return [skill for skill, _ in counts.most_common(top_n)]

    job_profiles = (df.groupby('DevType')
                      .agg({source: join_responses for source in column_mapping.values()})
                      .reset_index())

    for label, source in column_mapping.items():
        job_profiles[f'Top{label}'] = job_profiles[source].apply(top_skills)

    return job_profiles[['DevType'] + [f'Top{label}' for label in column_mapping]]

def jaccard_similarity(user_skills, job_skills):
    """Jaccard index of two skill collections, ignoring the 'empty' placeholder.

    Args:
        user_skills: Iterable of hashable skill names.
        job_skills: Iterable of hashable skill names.

    Returns:
        |A ∩ B| / |A ∪ B| as a float, or 0 when both collections are empty
        once the placeholder has been removed.
    """
    placeholder = {'empty'}
    mine = set(user_skills).difference(placeholder)
    theirs = set(job_skills).difference(placeholder)

    combined = mine | theirs
    if not combined:
        return 0
    return len(mine & theirs) / len(combined)

def create_interaction_matrix(df_have, job_profiles):
    """Score every respondent against every job profile with Jaccard similarity.

    Args:
        df_have: DataFrame with a 'ResponseId' column and an
            'AllSkillsHaveWorkedWith' column holding each respondent's
            skills as a list.
        job_profiles: DataFrame whose columns are, in this order, DevType,
            TopLanguages, TopTools, TopWebFrameworks, TopDatabases,
            TopPlatforms, TopMiscTechs (the Top* cells being lists).

    Returns:
        Float DataFrame with one row per row of `df_have` (indexed by
        ResponseId) and one column per DevType; each cell is the similarity
        between that respondent's skills and the job's pooled top skills.
    """
    # A job's pooled skill list does not depend on the user: build each once
    # instead of re-concatenating it inside the per-user loop.
    job_skill_sets = [
        TopLanguages + TopTools + TopWebFrameworks + TopDatabases + TopPlatforms + TopMiscTechs
        for DevType, TopLanguages, TopTools, TopWebFrameworks, TopDatabases, TopPlatforms, TopMiscTechs
        in job_profiles.itertuples(index=False)
    ]

    scores = [
        [jaccard_similarity(user_skills, job_skill_set) for job_skill_set in job_skill_sets]
        for user_skills in df_have['AllSkillsHaveWorkedWith']
    ]

    # Build the frame in one step with an explicit float dtype rather than
    # filling an object-dtype frame cell by cell and relying on fillna() to
    # make it numeric.
    return pd.DataFrame(
        scores,
        index=pd.Index(df_have['ResponseId'], name='ResponseId'),
        columns=pd.Index(job_profiles['DevType'], name='DevType'),
        dtype=float,
    )

def train_svd_model(interaction_matrix, n_components=10, random_state=42):
    """Fit a truncated SVD on the user x job interaction matrix.

    Args:
        interaction_matrix: Numeric 2-D matrix (users in rows, jobs in
            columns) accepted by `TruncatedSVD.fit_transform`.
        n_components: Number of latent factors to keep.
        random_state: Seed forwarded to `TruncatedSVD` so that repeated runs
            give the same factors. Pass None for an unseeded fit.

    Returns:
        Tuple `(svd, user_factors, job_factors)`: the fitted estimator, the
        transformed input (one row per user) and the transpose of
        `svd.components_` (one row per job).
    """
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    user_factors = svd.fit_transform(interaction_matrix)
    job_factors = svd.components_.T
    return svd, user_factors, job_factors

def recommend_jobs_for_user(user_id, reconstructed_matrix, n=5):
    """Return the labels of the `n` highest-scoring jobs for one user.

    Args:
        user_id: Row label of the user in `reconstructed_matrix`.
        reconstructed_matrix: DataFrame of scores, users in rows and jobs in
            columns.
        n: How many jobs to return.

    Returns:
        List of column labels, best score first.
    """
    best = reconstructed_matrix.loc[user_id].nlargest(n)
    return best.index.tolist()

def calculate_mrr(true_jobs, predicted_jobs):
    """Mean reciprocal rank of the true job within each ranked prediction.

    `true_jobs` and `predicted_jobs` are paired by position. For each pair
    the reciprocal of the 1-based rank of the first prediction equal to the
    true job is added; a true job that is not among the predictions (or has
    no prediction entry at all) contributes 0.

    Args:
        true_jobs: Sized iterable of true labels, one per user.
        predicted_jobs: Iterable of ranked predictions, one per user; each
            entry may be any iterable of labels (list, tuple, pandas Index,
            NumPy array).

    Returns:
        The sum of reciprocal ranks divided by `len(true_jobs)`, or 0.0 when
        `true_jobs` is empty.
    """
    total_users = len(true_jobs)
    if total_users == 0:
        return 0.0

    reciprocal_rank_sum = 0.0
    for true_job, ranked in zip(true_jobs, predicted_jobs):
        # Element-wise scan instead of `ranked == true_job`: comparing a plain
        # list with a scalar is a single False, which would hide every hit.
        for rank, job in enumerate(ranked, start=1):
            if job == true_job:
                reciprocal_rank_sum += 1 / rank
                break
    return reciprocal_rank_sum / total_users

In [2]:
# Load the selected survey columns and drop rows without a usable DevType
df = load_and_preprocess_data("survey_results_public.csv", columns_finalized)

# Merge the per-category "have worked with" answers into one skill list per respondent
have_worked_with_columns = [
    'LanguageHaveWorkedWith', 'ToolsTechHaveWorkedWith', 'PlatformHaveWorkedWith',
    'MiscTechHaveWorkedWith', 'WebframeHaveWorkedWith', 'DatabaseHaveWorkedWith',
]
df = combine_skills(df, have_worked_with_columns, 'AllSkillsHaveWorkedWith')

# Same for the "want to work with" answers
want_to_work_with_columns = [
    'LanguageWantToWorkWith', 'ToolsTechWantToWorkWith', 'WebframeWantToWorkWith',
    'MiscTechWantToWorkWith', 'DatabaseWantToWorkWith',
]
df = combine_skills(df, want_to_work_with_columns, 'AllSkillsWantToWorkWith')

# Respondents whose DevType is 'Student' are excluded from modelling
df_non_students = df.query("DevType != 'Student'").reset_index(drop=True)

In [3]:
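# Disabled alternative to the positional split in the next cell:
# a single stratified 80/20 split on DevType with a fixed seed.
# from sklearn.model_selection import StratifiedShuffleSplit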

# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_idx, test_idx in split.split(df_non_students, df_non_students['DevType']):
#     train_df = df_non_students.iloc[train_idx].reset_index(drop=True)
#     test_df = df_non_students.iloc[test_idx].reset_index(drop=True)

In [4]:
# Hold out the last TEST_SIZE rows (in file order) as the test set
TEST_SIZE = 5000
split_at = len(df_non_students) - TEST_SIZE

train_df = df_non_students.iloc[:split_at].copy()
test_df = df_non_students.iloc[split_at:].copy()

In [6]:
# Preview the training frame; the display options are changed only inside
# this block: at most 5 rows, no limit on the number of columns.
with pd.option_context('display.max_rows', 5, 'display.max_columns', None):
    display(train_df)

Unnamed: 0,ResponseId,Age,Employment,RemoteWork,EdLevel,LearnCode,LearnCodeOnline,TechDoc,YearsCode,YearsCodePro,DevType,OrgSize,BuyNewTool,BuildvsBuy,Country,LanguageHaveWorkedWith,LanguageWantToWorkWith,DatabaseHaveWorkedWith,DatabaseWantToWorkWith,DatabaseAdmired,PlatformHaveWorkedWith,WebframeHaveWorkedWith,WebframeWantToWorkWith,WebframeAdmired,MiscTechHaveWorkedWith,MiscTechWantToWorkWith,MiscTechAdmired,ToolsTechHaveWorkedWith,ToolsTechWantToWorkWith,ToolsTechAdmired,NEWCollabToolsHaveWorkedWith,OpSysPersonal use,OfficeStackSyncHaveWorkedWith,AISelect,AIThreat,AIEthics,WorkExp,TimeSearching,TimeAnswering,Frustration,ProfessionalTech,ProfessionalCloud,ProfessionalQuestion,Industry,ConvertedCompYearly,JobSat,AllSkillsHaveWorkedWith,AllSkillsWantToWorkWith
0,2,35-44 years old,"Employed, full-time",Remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,API document(s) and/or SDK document(s);User gu...,20,17,"Developer, full-stack",empty,empty,empty,United Kingdom of Great Britain and Northern I...,Bash/Shell (all shells);Go;HTML/CSS;Java;JavaS...,Bash/Shell (all shells);Go;HTML/CSS;Java;JavaS...,Dynamodb;MongoDB;PostgreSQL,PostgreSQL,PostgreSQL,Amazon Web Services (AWS);Heroku;Netlify,Express;Next.js;Node.js;React,Express;Htmx;Node.js;React;Remix,Express;Node.js;React,empty,empty,empty,Docker;Homebrew;Kubernetes;npm;Vite;Webpack,Docker;Homebrew;Kubernetes;npm;Vite;Webpack,Docker;Homebrew;Kubernetes;npm;Vite;Webpack,PyCharm;Visual Studio Code;WebStorm,MacOS;Windows,Microsoft Teams;Slack,"No, and I don't plan to",empty,empty,17.0,empty,empty,empty,empty,empty,empty,empty,empty,empty,"[Bash/Shell (all shells), Go, HTML/CSS, Java, ...","[Bash/Shell (all shells), Go, HTML/CSS, Java, ..."
1,3,45-54 years old,"Employed, full-time",Remote,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,API document(s) and/or SDK document(s);User gu...,37,27,Developer Experience,empty,empty,empty,United Kingdom of Great Britain and Northern I...,C#,C#,Firebase Realtime Database,Firebase Realtime Database,Firebase Realtime Database,Google Cloud,ASP.NET CORE,ASP.NET CORE,ASP.NET CORE,.NET (5+) ;.NET Framework (1.0 - 4.8);.NET MAUI,.NET (5+) ;.NET Framework (1.0 - 4.8);.NET MAUI,.NET (5+) ;.NET Framework (1.0 - 4.8);.NET MAUI,MSBuild,MSBuild,MSBuild,Visual Studio,Windows,Google Chat;Google Meet;Microsoft Teams;Zoom,"No, and I don't plan to",empty,empty,empty,empty,empty,empty,empty,empty,empty,empty,empty,empty,"[C#, MSBuild, Google Cloud, .NET (5+) , .NET F...","[C#, MSBuild, ASP.NET CORE, .NET (5+) , .NET F..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46883,55784,35-44 years old,"Employed, full-time",Remote,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;On the job training;Oth...,Technical documentation;Blogs;Written Tutorial...,API document(s) and/or SDK document(s);User gu...,22,17,Cloud infrastructure engineer,"10,000 or more employees",Start a free trial;Ask developers I know/work ...,Is ready-to-go but also customizable for growt...,Greece,C#;Go;Python,Go;Python,Cosmos DB;Databricks SQL;Dynamodb,BigQuery;Dynamodb,Dynamodb,Amazon Web Services (AWS);Databricks;Google Cl...,ASP.NET CORE,ASP.NET CORE,ASP.NET CORE,NumPy,NumPy,NumPy,Ansible;Docker;Homebrew;Kubernetes;npm;Pip;Ter...,Ansible;Docker;Homebrew;Kubernetes;npm;Pip;Ter...,Ansible;Docker;Homebrew;Kubernetes;npm;Pip;Ter...,Visual Studio Code,Ubuntu;Windows,Microsoft Teams,Yes,I'm not sure,Circulating misinformation or disinformation;M...,22.0,30-60 minutes a day,60-120 minutes a day,Tracking my work;Showing my contributions;Patc...,DevOps function;Observability tools;AI-assiste...,Hybrid (on-prem and cloud),Do search of internal share drives/storage loc...,Retail and Consumer Services,empty,9.0,"[C#, Go, Python, Ansible, Docker, Homebrew, Ku...","[Go, Python, Ansible, Docker, Homebrew, Kubern..."
46884,55785,55-64 years old,"Employed, full-time",In-person,"Professional degree (JD, MD, Ph.D, Ed.D, etc.)",Books / Physical media;Other online resources ...,Technical documentation;Books;Written Tutorial...,API document(s) and/or SDK document(s);First-p...,44,22,Academic researcher,500 to 999 employees,Start a free trial;Ask developers I know/work ...,Is set up to be customized and needs to be eng...,Canada,Bash/Shell (all shells);HTML/CSS;Java;JavaScri...,Bash/Shell (all shells);HTML/CSS;Java;JavaScri...,MariaDB;MongoDB;MySQL;SQLite,MariaDB;MySQL;PostgreSQL;SQLite,MariaDB;MySQL;SQLite,Heroku;PythonAnywhere,Django;jQuery;Node.js;React,Django;Flask;Node.js;React;Spring Boot;Vue.js,Django;Node.js;React,CUDA;NumPy;Pandas;Scikit-Learn;Spring Framewor...,CUDA;Electron;Hugging Face Transformers;NumPy;...,CUDA;NumPy;Pandas;Scikit-Learn;Spring Framewor...,Chocolatey;Gradle;Maven (build tool);npm;Pip;V...,Docker;Kubernetes;Maven (build tool);npm;Pip,Maven (build tool);npm;Pip,Eclipse;IntelliJ IDEA;Jupyter Notebook/Jupyter...,Arch;Windows;Windows Subsystem for Linux (WSL),Cisco Webex Teams;Microsoft Teams;Whatsapp;Zoom,Yes,Yes,Circulating misinformation or disinformation;B...,25.0,Over 120 minutes a day,30-60 minutes a day,Amount of technical debt;Showing my contributi...,Automated testing;Developer portal or other ce...,Hybrid (on-prem and cloud),Traditional public search engine,Higher Education,empty,7.0,"[Bash/Shell (all shells), HTML/CSS, Java, Java...","[Bash/Shell (all shells), HTML/CSS, Java, Java..."


In [7]:
# Create job profiles for training set: one row per DevType listing its most
# frequent skills per category, computed from training respondents only.
job_profiles_train = create_job_profiles(train_df)

In [8]:
# Prepare data for interaction matrix (training set): one row per ResponseId
# holding that respondent's skills as a list. The skills are first unpacked
# to one row each, then regrouped; `groupby` returns its keys sorted, so the
# result is ordered by ResponseId.
skills_long_train = (
    train_df[['ResponseId', 'AllSkillsHaveWorkedWith']]
    .explode('AllSkillsHaveWorkedWith')
    .reset_index(drop=True)
)
df_have_train = (
    skills_long_train
    .groupby('ResponseId')['AllSkillsHaveWorkedWith']
    .apply(list)
    .reset_index()
)

In [9]:
# Inspect the per-respondent skill lists that feed the interaction matrix
df_have_train

Unnamed: 0,ResponseId,AllSkillsHaveWorkedWith
0,2,"[Bash/Shell (all shells), Go, HTML/CSS, Java, ..."
1,3,"[C#, MSBuild, Google Cloud, .NET (5+) , .NET F..."
2,4,"[C, C++, HTML/CSS, Java, JavaScript, PHP, Powe..."
3,5,"[C++, HTML/CSS, JavaScript, Lua, Python, Rust,..."
4,7,"[R, empty, empty, empty, empty, empty]"
...,...,...
46880,55780,"[Bash/Shell (all shells), Java, PHP, SQL, Dock..."
46881,55781,"[C#, GDScript, HTML/CSS, JavaScript, PowerShel..."
46882,55783,"[JavaScript, TypeScript, npm, Webpack, Microso..."
46883,55784,"[C#, Go, Python, Ansible, Docker, Homebrew, Ku..."


In [11]:
# Inspect the per-DevType top-skill lists built from the training set
job_profiles_train

Unnamed: 0,DevType,TopLanguages,TopTools,TopWebFrameworks,TopDatabases,TopPlatforms,TopMiscTechs
0,Academic researcher,"[Python, Bash/Shell (all shells), C++, C, Java...","[Pip, Docker, Make, npm, Homebrew, APT, Pacman...","[Node.js, React, Flask, Django, jQuery, WordPr...","[SQLite, PostgreSQL, MySQL, MongoDB, MariaDB, ...","[Amazon Web Services (AWS), Google Cloud, Micr...","[NumPy, Pandas, Scikit-Learn, Torch/PyTorch, T..."
1,Blockchain,"[JavaScript, TypeScript, Python, Rust, HTML/CS...","[Docker, npm, Yarn, Make, Pip, Homebrew, Kuber...","[Node.js, React, Next.js, Express, Angular, Ne...","[PostgreSQL, SQLite, MySQL, MongoDB, Redis, Ma...","[Amazon Web Services (AWS), Google Cloud, Verc...","[Pandas, React Native, .NET (5+) , Apache Kafk..."
2,Cloud infrastructure engineer,"[Python, Bash/Shell (all shells), JavaScript, ...","[Docker, Kubernetes, Terraform, Pip, npm, Home...","[Node.js, React, Flask, FastAPI, Django, ASP.N...","[PostgreSQL, MySQL, Redis, SQLite, Dynamodb, E...","[Amazon Web Services (AWS), Microsoft Azure, G...","[Pandas, NumPy, Apache Kafka, RabbitMQ, .NET (..."
3,Data engineer,"[Python, SQL, Bash/Shell (all shells), JavaScr...","[Docker, Pip, Kubernetes, Homebrew, Terraform,...","[FastAPI, Flask, Node.js, React, Django, jQuer...","[PostgreSQL, Microsoft SQL Server, MySQL, SQLi...","[Amazon Web Services (AWS), Microsoft Azure, G...","[Pandas, NumPy, Apache Spark, Apache Kafka, Sc..."
4,Data or business analyst,"[SQL, Python, JavaScript, HTML/CSS, Bash/Shell...","[Pip, Docker, npm, Visual Studio Solution, Mak...","[Node.js, React, WordPress, Flask, Django, jQu...","[Microsoft SQL Server, PostgreSQL, MySQL, SQLi...","[Amazon Web Services (AWS), Microsoft Azure, G...","[Pandas, NumPy, Scikit-Learn, Tidyverse, .NET ..."
5,Data scientist or machine learning specialist,"[Python, SQL, Bash/Shell (all shells), JavaScr...","[Pip, Docker, Make, npm, Homebrew, Kubernetes,...","[Flask, FastAPI, Django, Node.js, React, jQuer...","[PostgreSQL, SQLite, MySQL, Microsoft SQL Serv...","[Amazon Web Services (AWS), Microsoft Azure, G...","[Pandas, NumPy, Scikit-Learn, Torch/PyTorch, T..."
6,Database administrator,"[SQL, Bash/Shell (all shells), Python, HTML/CS...","[Docker, Ansible, Visual Studio Solution, Terr...","[Node.js, jQuery, ASP.NET, WordPress, ASP.NET ...","[Microsoft SQL Server, PostgreSQL, Oracle, MyS...","[Amazon Web Services (AWS), Microsoft Azure, O...","[Pandas, .NET (5+) , .NET Framework (1.0 - 4.8..."
7,Designer,"[HTML/CSS, JavaScript, Python, SQL, TypeScript...","[npm, Docker, Vite, Homebrew, Pip, Yarn, Visua...","[React, Node.js, WordPress, jQuery, Next.js, S...","[MySQL, PostgreSQL, SQLite, MongoDB, Microsoft...","[Amazon Web Services (AWS), Cloudflare, Google...","[.NET (5+) , OpenGL, Flutter, NumPy, Electron,..."
8,DevOps specialist,"[Python, Bash/Shell (all shells), SQL, JavaScr...","[Docker, Kubernetes, Terraform, Pip, Ansible, ...","[Node.js, Flask, React, Django, jQuery, Spring...","[PostgreSQL, MySQL, Redis, SQLite, Elasticsear...","[Amazon Web Services (AWS), Microsoft Azure, G...","[Pandas, .NET (5+) , RabbitMQ, NumPy, Apache K..."
9,Developer Advocate,"[JavaScript, HTML/CSS, Python, SQL, TypeScript...","[npm, Docker, Pip, Homebrew, Kubernetes, Gradl...","[Node.js, React, jQuery, WordPress, ASP.NET CO...","[PostgreSQL, SQLite, MySQL, MongoDB, Microsoft...","[Amazon Web Services (AWS), Google Cloud, Micr...","[.NET (5+) , .NET Framework (1.0 - 4.8), Panda..."


In [12]:
# Create interaction matrix for training set: rows are ResponseIds, columns
# are DevTypes, and each cell is the Jaccard similarity between the
# respondent's skills and that DevType's pooled top skills.
interaction_matrix_train = create_interaction_matrix(df_have_train, job_profiles_train)
interaction_matrix_train

DevType,Academic researcher,Blockchain,Cloud infrastructure engineer,Data engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,Designer,DevOps specialist,Developer Advocate,...,Engineering manager,Hardware Engineer,Marketing or sales professional,Product manager,Project manager,Research & Development role,Scientist,Security professional,"Senior Executive (C-Suite, VP, etc.)",System administrator
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.257576,0.317460,0.296875,0.276923,0.220588,0.257576,0.220588,0.301587,0.276923,0.360656,...,0.383333,0.202899,0.281250,0.276923,0.257576,0.257576,0.185714,0.257576,0.383333,0.238806
3,0.014925,0.030303,0.079365,0.046154,0.079365,0.014925,0.079365,0.080645,0.062500,0.096774,...,0.079365,0.062500,0.080645,0.096774,0.096774,0.096774,0.014925,0.062500,0.079365,0.062500
4,0.369231,0.348485,0.328358,0.369231,0.308824,0.390625,0.328358,0.353846,0.328358,0.390625,...,0.348485,0.328358,0.375000,0.390625,0.390625,0.348485,0.328358,0.348485,0.369231,0.369231
5,0.145161,0.145161,0.126984,0.126984,0.126984,0.145161,0.109375,0.129032,0.126984,0.126984,...,0.126984,0.163934,0.111111,0.109375,0.126984,0.126984,0.163934,0.145161,0.109375,0.126984
7,0.016667,0.000000,0.000000,0.000000,0.016667,0.016667,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.016667,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55780,0.215385,0.196970,0.215385,0.253968,0.196970,0.196970,0.196970,0.181818,0.253968,0.234375,...,0.234375,0.179104,0.164179,0.215385,0.253968,0.215385,0.161765,0.196970,0.215385,0.196970
55781,0.159420,0.159420,0.212121,0.194030,0.250000,0.176471,0.212121,0.215385,0.212121,0.212121,...,0.194030,0.194030,0.179104,0.250000,0.230769,0.230769,0.159420,0.194030,0.212121,0.212121
55783,0.063492,0.116667,0.080645,0.080645,0.080645,0.098361,0.063492,0.118644,0.098361,0.116667,...,0.116667,0.063492,0.100000,0.098361,0.098361,0.098361,0.063492,0.063492,0.116667,0.063492
55784,0.128571,0.179104,0.253968,0.215385,0.179104,0.196970,0.215385,0.147059,0.234375,0.196970,...,0.234375,0.112676,0.218750,0.179104,0.161765,0.196970,0.128571,0.234375,0.234375,0.179104


In [13]:
# Fit the truncated SVD on the training interaction matrix
svd, user_factors_train, job_factors = train_svd_model(interaction_matrix_train)

# Low-rank reconstruction of the training matrix (user factors x job factors),
# labelled like the original interaction matrix
reconstructed_train = user_factors_train @ job_factors.T
reconstructed_train_df = pd.DataFrame(
    data=reconstructed_train,
    index=interaction_matrix_train.index,
    columns=interaction_matrix_train.columns,
)

In [14]:
# Evaluate on train set.
# The rows of the reconstructed matrix are ordered by ResponseId (they come
# from a groupby), so look each true DevType up by ResponseId instead of
# assuming train_df happens to be in the same order.
true_jobs_train = (train_df.set_index('ResponseId')['DevType']
                   .loc[reconstructed_train_df.index]
                   .apply(lambda x: x[0] if isinstance(x, list) else x))
# Ten highest-scoring DevTypes per respondent, best first
predicted_jobs_train = [reconstructed_train_df.loc[user].sort_values(ascending=False).index[:10] for user in reconstructed_train_df.index]

# Stored under a train-specific name so it is not mistaken for the test score
mrr_score_train = calculate_mrr(true_jobs_train, predicted_jobs_train)
print(f'MRR Score on Train Set: {mrr_score_train}')

MRR Score on Train Set: 0.32358198293361484


In [24]:
# Prepare data for interaction matrix (test set): one skill list per ResponseId,
# ordered by ResponseId because groupby sorts its keys
df_have_test = test_df.explode('AllSkillsHaveWorkedWith')[['ResponseId', 'AllSkillsHaveWorkedWith']].reset_index(drop=True)
df_have_test = df_have_test.groupby('ResponseId')['AllSkillsHaveWorkedWith'].apply(list).reset_index()

# Create interaction matrix for test set (using job profiles from training set)
interaction_matrix_test = create_interaction_matrix(df_have_test, job_profiles_train)

# Transform test set with the SVD fitted on the training set, then reconstruct
user_factors_test = svd.transform(interaction_matrix_test)
reconstructed_test = np.dot(user_factors_test, job_factors.T)
reconstructed_test_df = pd.DataFrame(reconstructed_test, index=interaction_matrix_test.index, columns=interaction_matrix_test.columns)

# Evaluate on test set.
# Look each true DevType up by ResponseId, in the row order of the
# reconstructed matrix, so labels and predictions are paired by respondent
# rather than by position.
true_jobs_test = (test_df.set_index('ResponseId')['DevType']
                  .loc[reconstructed_test_df.index]
                  .apply(lambda x: x[0] if isinstance(x, list) else x))
# Ten highest-scoring DevTypes per respondent, best first
predicted_jobs_test = [reconstructed_test_df.loc[user].sort_values(ascending=False).index[:10] for user in reconstructed_test_df.index]

mrr_score_test = calculate_mrr(true_jobs_test, predicted_jobs_test)
print(f'MRR Score on Test Set: {mrr_score_test}')

# Example: Get top 5 job recommendations for a specific user in the test set
user_id = interaction_matrix_test.index[2]
recommended_jobs = recommend_jobs_for_user(user_id, reconstructed_test_df, n=5)
print(f"Top job recommendations for user {user_id}: {recommended_jobs}")

MRR Score on Test Set: 0.2609004761904757
Top job recommendations for user 55788: ['Developer, back-end', 'Developer, QA or test', 'Developer, desktop or enterprise applications', 'Data or business analyst', 'DevOps specialist']
