In [1]:
import openai
import pandas as pd
import numpy as np
import ast
import csv
import concurrent.futures
import os
import threading

# Mutex shared by the worker threads; classify_job acquires it around its
# update of the shared dict_clf cache.
lock = threading.Lock()

In [2]:
# Sanity check of the working directory: list what the parent folder contains.
os.listdir("..")

['.git',
 '.gitignore',
 '.gitmodules',
 'classification',
 'data',
 'data_processing',
 'failed_attemps',
 'geocoding',
 'LICENSE',
 'notebooks',
 'plots',
 'README.md',
 'topjobs']

In [3]:
# Target labels for the classification. classify_job accepts a model answer
# only when it is exactly one of these strings, and find_closest_job
# fuzzy-matches against them, so the spelling below must not be altered.
salaires = [
    "produits alimentaires",
    "vêtements, lingerie,chaussures,literies",
    "industrie textile",
    "industrie du papier",
    "arts graphiques",
    "industrie chimique",
    "bois,liège,meubles",
    "pierre et terre",
    "industrie des métaux",
    "horlogerie",
    "bijouterie,gravure,frappe",
    "construction,charpenterie",
    "gaz,eau,éléctricité",
    "commerce de gros",
    "commerce de détail",
    "banques,établissements financiers",
    "assurances privées",
    "agences,locations,consultations",
    "affaires immobilières,location",
    "bureaux de consultation",
    "hôtellerie,restauration",
    "transports",
    "administration publique",
    "réparations",
    "vachers célibataires",
    "employés pour tous,célibataires",
    "employées pour le ménage et la ferme",
    "journaliers,entretien",
    "journalières,entretien",
]

In [5]:
# Tables that were classified earlier: every file of ../data whose name ends in
# "_clf.csv", leaving out any whose name contains "1923".
clf_files = [
    fname
    for fname in os.listdir('../data')
    if fname.endswith('_clf.csv') and '1923' not in fname
]
df = pd.concat([pd.read_csv(os.path.join("..", "data", fname)) for fname in clf_files])

# Salary sheet: first column used as index, second spreadsheet row as header.
# It is later read as df_salaire[<class>].loc[<year>].
df_salaire = pd.read_excel('../data/SHS_TM.xlsx', index_col=0, header=1)

# Cache of job title -> class taken from the tables that were classified
# earlier, so a job seen again does not have to be classified a second time.
dict_clf = {}
def create_dict_job_clf(df):
    """Fill the module-level ``dict_clf`` cache from an already classified table.

    Args:
        df (pd.DataFrame): table with a ``job`` column (raw job title) and a
            ``job_clf`` column (the class assigned to that job).

    Returns:
        dict: the ``dict_clf`` object itself, updated in place. When a job
        appears on several rows, the last row wins.
    """
    # Rows with a missing job or class carry no usable classification. Caching
    # them would store NaN as the "class" of the job, and every later lookup
    # would then treat that job as already classified.
    known = df[['job', 'job_clf']].dropna()
    # zip over the two columns instead of iterrows(): no Series is built per row.
    dict_clf.update(zip(known['job'], known['job_clf']))

    return dict_clf
# Build the job -> class cache from the already classified tables.
dict_clf = create_dict_job_clf(df)

# 1923 directory: keep only complete rows, then attach the cached class of each JOB.
tdf = pd.read_csv("../data/table_1923.csv").dropna()
tdf['job_clf'] = tdf.JOB.apply(lambda x: dict_clf.get(x, None))
# Jobs absent from the cache received None above; drop those rows.
tdf.dropna(inplace=True)
# For each class, read the value of that df_salaire column on the row labelled 1923.
# A class that is not a column of df_salaire raises KeyError here.
tdf['salaire'] = tdf.job_clf.apply(lambda x: df_salaire[x].loc[1923])

from difflib import get_close_matches
def find_closest_job(job, choices=None, cutoff=0.5):
    """Return the entry of ``choices`` that is most similar to ``job``.

    Args:
        job: label to normalise. Non-string values (``None``, ``NaN``) are
            returned unchanged, because ``difflib`` can only compare sequences.
        choices (list[str] | None): candidate labels. Defaults to the
            module-level ``salaires`` list.
        cutoff (float): minimum ``difflib`` similarity ratio, in [0, 1], that a
            candidate must reach to be accepted. Defaults to 0.5.

    Returns:
        The best-matching candidate, or ``job`` itself when no candidate
        reaches ``cutoff``.
    """
    if not isinstance(job, str):
        return job
    if choices is None:
        choices = salaires
    # n=1: only the single best candidate is of interest.
    matches = get_close_matches(job, choices, n=1, cutoff=cutoff)
    return matches[0] if matches else job
# Snap every class label onto its nearest entry of `salaires`; labels without a close match are kept.
# NOTE(review): 'salaire' was looked up above with the labels as they were before this step — confirm that order is intended.
tdf['job_clf'] = tdf['job_clf'].apply(find_closest_job)

In [7]:
# Persist the 1923 table together with its job_clf and salaire columns.
# to_csv writes the DataFrame index as the first column by default.
tdf.to_csv("../data/table_1923_clf_v2.csv")

In [20]:
def load_df(path_annuaire):
    """Read a directory ("annuaire") CSV and keep only its complete rows.

    Args:
        path_annuaire (str): path to the annuaire CSV file.

    Returns:
        pd.DataFrame: the table with its ``JOB`` column renamed to ``job``,
        every row containing a missing value removed, and a fresh 0..n-1 index.
    """
    annuaire = pd.read_csv(path_annuaire)
    annuaire = annuaire.rename(columns={'JOB': 'job'})
    complete_rows = annuaire.dropna()
    return complete_rows.reset_index(drop=True)
# NOTE(review): this path is relative to the current directory ("data/..."), whereas
# the cells above read from "../data/..." — confirm which folder is meant.
df_table = load_df('data/table_1951.csv')

In [21]:
def classify_job_already_classified(df, mapping=None):
    """Pre-fill ``job_clf`` for the jobs whose class is already known.

    Querying the LLM is slow and costly, so jobs that were classified before
    are resolved from a lookup table; only the remaining ones need the model.

    Args:
        df (pd.DataFrame): table with a ``job`` column. It is not modified.
        mapping (dict | None): job title -> class. Defaults to the
            module-level ``dict_clf`` cache.

    Returns:
        pd.DataFrame: a deep copy of ``df`` with an added ``job_clf`` column
        holding the known class, or ``None`` where the job is not in
        ``mapping``.
    """
    if mapping is None:
        mapping = dict_clf
    # Copy first so the caller's frame is left untouched; previously the column
    # was written into the input and a copy was returned on top of that.
    out = df.copy(deep=True)
    # dict.get (rather than Series.map(dict)) so that unknown jobs yield None:
    # classify_job relies on None to spot the rows that still need the model.
    out['job_clf'] = out.job.apply(lambda x: mapping.get(x, None))
    return out


In [22]:
# Pre-fill job_clf from the cache; jobs that are not in the cache get None.
df_table = classify_job_already_classified(df_table)

In [23]:
# Load the API key from a file located two levels above the working directory,
# falling back to the OPENAI_API_KEY environment variable when it cannot be read.
path_key = "../../key_openai/key.txt"
try:
    with open(path_key, 'r') as file:
        api_key = file.read().strip()
except (OSError, UnicodeError) as e:
    print(f"error : {e}")
    # Always leave `api_key` defined: otherwise the next cell either fails with
    # a NameError or silently reuses a key left over from an earlier run.
    api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Client for the OpenAI-compatible API served at the base_url below.
client = openai.Client(api_key=api_key, base_url="https://fmapi.swissai.cscs.ch/")

# CSV to which classify_job appends one [job, answer] row per processed job.
output_file = "data/table_1951_clf.csv"

# Mode 'w' (re)creates the file, so every run of this cell starts again from a
# file that holds only the header row.
with open(output_file, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["job", "answer"])  

def _append_classification(job, label):
    """Append one ``[job, label]`` row to ``output_file``.

    The module-level ``lock`` is held for the whole write so that rows written
    by concurrent worker threads cannot interleave in the file.
    """
    with lock:
        with open(output_file, mode='a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow([job, label])


def classify_job(job, job_clf):
    """Return the class of ``job``, asking the LLM only when it is not known yet.

    Args:
        job (str): raw job title.
        job_clf (str | None): class already known for ``job``. ``None`` or
            ``NaN`` means "not classified yet".

    Returns:
        str | None: ``job_clf`` itself when it was provided; otherwise the
        entry of ``salaires`` chosen by the model, or ``None`` when the model
        never answered with a valid label.

    Side effects:
        Appends a ``[job, class]`` row to ``output_file`` and, for jobs sent to
        the model, stores the result in the shared ``dict_clf`` cache.
    """
    # Identity test instead of `!= None`, and NaN is treated as "unknown" too:
    # a class read back from a CSV with an empty cell arrives as NaN and must
    # not be mistaken for a real label.
    if job_clf is not None and not pd.isna(job_clf):
        _append_classification(job, job_clf)
        return job_clf

    system_message = {
        "role": "system",
        "content": (
            f"You'll be given a job and you'll have to classify it according to the nearest job in a list we give you: {salaires}. Answer only with the profession in which you classify the job you are given"
        ),
    }
    # The request is the same on every attempt, so it is built once.
    user_message = {
        "role": "user",
        "content": f"{job}"
    }

    # Same budget as before: the model is queried at most 9 times per job.
    max_attempts = 9
    answer = None
    for _ in range(max_attempts):
        res = client.chat.completions.create(
            model="meta-llama/Llama-3.3-70B-Instruct",
            messages=[system_message, user_message],
            max_tokens=40,
            stream=True,
        )

        # Reassemble the streamed reply from its chunks.
        reply = ""
        for chunk in res:
            if len(chunk.choices) > 0 and chunk.choices[0].delta.content:
                reply += chunk.choices[0].delta.content
        # Leading/trailing whitespace or a final newline would make an
        # otherwise exact label fail the membership test below.
        reply = reply.strip()

        if reply in salaires:
            answer = reply
            break

    with lock:
        dict_clf[job] = answer
    _append_classification(job, answer)

    return answer

In [25]:
def process_row(idx_row_tuple):
    """Classify one ``(index, row)`` pair as produced by ``DataFrame.iterrows()``.

    Returns:
        tuple: ``(index, class)`` so the caller can write the class back to
        the row it came from.
    """
    row_label, record = idx_row_tuple
    return row_label, classify_job(record.job, record.job_clf)

# Build the list of (index, row) tuples to hand to the workers.
rows_to_process = list(df_table.iterrows())

# Use a ThreadPoolExecutor rather than a ProcessPoolExecutor: process_row ends up
# in classify_job, which calls the remote API and shares dict_clf and `lock`.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map yields the results in the same order as rows_to_process.
    results = executor.map(process_row, rows_to_process)
    
    # Write each result back into the DataFrame at the row it came from.
    for idx, job_clf in results:
        df_table.loc[idx, 'job_clf'] = job_clf

In [26]:
# Save the whole table to output_file. This replaces the row-by-row [job, answer]
# log that classify_job had been appending to the same path.
df_table.to_csv(output_file)
# Last expression of the cell: display the classified table.
df_table

Unnamed: 0,LOC,job,job_clf
0,r.Ste-Beuve 7,professeur,"agences,locations,consultations"
1,Montbenon 1.T. 22,Arc (cercle),arts graphiques
2,Bereiéres 20,cuisinier,"hôtellerie,restauration"
3,",av. Tivoli 25","mont, en chauff",transports
4,ch.de Fontenay 15,télégraphiste,"agences,locations,consultations"
...,...,...,...
41818,Bugnon 34.At. ch. des Diablerets 7,tapissier,"bois,liège,meubles"
41819,av. Ouchy 33,dir. Clinique de Bois-Cerf,administration publique
41820,ch. de Mon-tolivet 14,vendeuse,commerce de détail
41821,av. Maria-Belgia 1,secrét.,bureaux de consultation
