In [1]:
import openai
import pandas as pd
import numpy as np
import ast
import csv
import concurrent.futures
import os
import threading

# Shared lock for state that classify_job mutates from several worker threads.
lock = threading.Lock()

In [2]:
# Closed set of target categories. classify_job sends this list to the model
# and only accepts an answer that is exactly one of these strings, so the
# spelling and spacing of every entry matters.
# NOTE(review): the name suggests these are the sectors of a wage table — confirm.
salaires = [
    "produits alimentaires",
    "vêtements, lingerie,chaussures,literies",
    "industrie textile",
    "industrie du papier",
    "arts graphiques",
    "industrie chimique",
    "bois,liège,meubles",
    "pierre et terre",
    "industrie des métaux",
    "horlogerie",
    "bijouterie,gravure,frappe",
    "construction,charpenterie",
    "gaz,eau,éléctricité",
    "commerce de gros",
    "commerce de détail",
    "banques,établissements financiers",
    "assurances privées",
    "agences,locations,consultations",
    "affaires immobilières,location",
    "bureaux de consultation",
    "hôtellerie,restauration",
    "transports",
    "administration publique",
    "réparations",
    "vachers célibataires",
    "employés pour tous,célibataires",
    "employées pour le ménage et la ferme",
    "journaliers,entretien",
    "journalières,entretien",
]

In [3]:
# Classifications produced by earlier runs; the `answer` column is renamed so
# that every frame exposes the same `job` / `job_clf` column names.
df_job_mapped = pd.read_csv('classification_results.csv').rename(columns={"answer": 'job_clf'})

# os.listdir returns names in arbitrary order. The files are sorted so that,
# when two of them disagree about a job, the same one wins on every machine
# and on every run (later frames override earlier ones in the lookup).
dfs_clf = [
    pd.read_csv(os.path.join('data', df_clf)).rename(columns={'JOB': 'job', 'answer': 'job_clf'})
    for df_clf in sorted(os.listdir('data'))
    if df_clf.endswith('_clf.csv')
]
dfs_clf += [df_job_mapped]

In [4]:
# Collect every job that already has a classification, so that a job we meet
# again does not have to be classified a second time.
def create_dict_job_clf(list_df):
    """Build a job -> category lookup from previously classified tables.

    Args:
        list_df (iterable of pd.DataFrame): frames exposing a `job` and a
            `job_clf` column.

    Returns:
        dict: maps each job to its category. When a job occurs several times,
        the last occurrence wins (later frames override earlier ones).
        Rows whose `job` or `job_clf` is missing are skipped: an empty CSV
        cell is read back as NaN, and caching it would both hide a valid
        earlier classification and make the job look "already classified".
    """
    dict_clf = {}
    for df in list_df:
        known = df.dropna(subset=['job', 'job_clf'])
        # dict.update keeps the "last occurrence wins" rule of a row-by-row loop
        dict_clf.update(zip(known['job'], known['job_clf']))
    return dict_clf
# Module-level job -> category cache: read when pre-filling the table and
# written to by classify_job from the worker threads.
dict_clf = create_dict_job_clf(dfs_clf)

In [6]:
def load_df(path_annuaire):
    """Read a directory table from CSV and return only its complete rows.

    Args:
        path_annuaire (str): path to the CSV file to load.

    Returns:
        pd.DataFrame: the table with the `JOB` column renamed to `job`,
        every row containing a missing value removed, and a fresh
        0..n-1 index.
    """
    raw = pd.read_csv(path_annuaire)
    renamed = raw.rename(columns={'JOB': 'job'})
    complete_rows = renamed.dropna()
    return complete_rows.reset_index(drop=True)
# Table whose jobs are classified in the cells below.
df_table = load_df('data/table_1901.csv')

In [7]:
def _known_category(lookup, job):
    """Return the cached category of `job`, or None when there is no usable one."""
    found = lookup.get(job)
    # A NaN entry comes from an empty CSV cell: treat it as "not classified".
    if found is None or (isinstance(found, float) and found != found):
        return None
    return found


def classify_job_already_classified(df, mapping=None):
    """Attach the already-known classification of every job as a `job_clf` column.

    Jobs classified in earlier runs are looked up here instead of being sent
    to the LLM again, which is slow and costly.

    Args:
        df (pd.DataFrame): table with a `job` column. It is left untouched.
        mapping (dict, optional): job -> category lookup. Defaults to the
            module-level `dict_clf`.

    Returns:
        pd.DataFrame: a deep copy of `df` with an object-dtype `job_clf`
        column holding the known category, or None when the job has not
        been classified yet.
    """
    lookup = dict_clf if mapping is None else mapping
    out = df.copy(deep=True)
    # dtype=object keeps the None markers as None (not NaN): classify_job
    # relies on them to decide which rows still need a request.
    out['job_clf'] = pd.Series(
        [_known_category(lookup, job) for job in out['job']],
        index=out.index,
        dtype=object,
    )
    return out


In [8]:
# Pre-fill job_clf from the cache; rows left without a value go to the LLM later.
df_table = classify_job_already_classified(df_table)

In [9]:
# Load the API key from a file instead of hard-coding it in the notebook.
path_key = "../../key_openai/key.txt"
try:
    with open(path_key, 'r') as file:
        api_key = file.read().strip()
except OSError as e:
    # Fail here: carrying on would only surface later as a NameError on
    # `api_key`, or silently reuse a stale key left over in the kernel.
    raise RuntimeError(f"could not read the API key from {path_key!r}: {e}") from e

if not api_key:
    raise RuntimeError(f"the API key file {path_key!r} is empty")

In [10]:
# CSV file that receives one (job, answer) row per classified job.
output_file = "data/table_1901_clf.csv"

# Client for the chat-completions endpoint used by classify_job.
client = openai.Client(api_key=api_key, base_url="https://fmapi.swissai.cscs.ch/")

# Start from a fresh file holding only the header; rows are appended afterwards.
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["job", "answer"])

# The original retry loop gave up on its 10th pass, i.e. after 9 requests;
# the same budget is kept here.
_MAX_LLM_REQUESTS = 9


def _is_missing(value):
    """Tell whether `value` stands for "no classification" (None or a float NaN)."""
    return value is None or (isinstance(value, float) and value != value)


def _append_result(job, answer):
    """Append one (job, answer) row to `output_file`; the caller must hold `lock`."""
    with open(output_file, mode='a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow([job, answer])


def classify_job(job, job_clf):
    """Return the category of `job`, asking the LLM only when it is unknown.

    The result is appended to `output_file` and, when it came from the model,
    stored in the shared `dict_clf` cache. The function is called from several
    threads, so both the cache and the file are only touched while holding
    `lock` (concurrent appends could otherwise interleave rows).

    Args:
        job (str): job title to classify.
        job_clf (str | None): category already known for this job, or
            None / NaN when it still has to be classified.

    Returns:
        str | None: one of the entries of `salaires`, or None when the model
        never produced a valid entry within the retry budget.
    """
    if _is_missing(job_clf):
        # The same job appears on many rows: reuse the answer another worker
        # obtained earlier in this run instead of paying for a new request.
        with lock:
            job_clf = dict_clf.get(job)

    if not _is_missing(job_clf):
        with lock:
            _append_result(job, job_clf)
        return job_clf

    system_message = {
        "role": "system",
        "content": (
            f"You'll be given a job and you'll have to classify it according to the nearest job in a list we give you: {salaires}. Answer only with the profession in which you classify the job you are given"
        ),
    }
    user_message = {
        "role": "user",
        "content": f"{job}"
    }

    answer = None
    for _ in range(_MAX_LLM_REQUESTS):
        res = client.chat.completions.create(
            model="meta-llama/Llama-3.3-70B-Instruct",
            messages=[system_message, user_message],
            max_tokens=40,
            stream=True,
        )
        pieces = [
            chunk.choices[0].delta.content
            for chunk in res
            if len(chunk.choices) > 0 and chunk.choices[0].delta.content
        ]
        # Models often pad the label with a space or a newline; without the
        # strip such an answer never matches and the whole budget is wasted.
        candidate = "".join(pieces).strip()
        if candidate in salaires:
            answer = candidate
            break

    with lock:
        dict_clf[job] = answer
        _append_result(job, answer)

    return answer

In [11]:
# Inspect the table after the cache lookup; rows with an empty job_clf still need the LLM.
df_table

Unnamed: 0,LOC,job,job_clf
0,"r. SIRI erre, 2",maréchal,réparations
1,"La Gaîté,av. Muntriund",commis de banque,"banques,établissements financiers"
2,"r. de la Tour, 16",charretier,transports
3,"villa Charles, Montagibert","manœuvre,","journaliers,entretien"
4,"maison Rcga-mey, La Sallaz.",maçon,"construction,charpenterie"
...,...,...,...
13495,"r. St-Laurent, 7",relieur,industrie du papier
13496,"av. Simplon, 33",concierge,employées pour le ménage et la ferme
13497,ruedes Glaciers,Rochers de Nave,
13498,"Industrie, 11",boulanger,produits alimentaires


In [None]:
def process_row(idx_row_tuple):
    """Classify the job of a single table row.

    Args:
        idx_row_tuple (tuple): an (index, row) pair as produced by
            `DataFrame.iterrows()`; the row must expose `job` and `job_clf`.

    Returns:
        tuple: (index, category), where category is what `classify_job`
        returned for that row.
    """
    row_label, record = idx_row_tuple
    return row_label, classify_job(record.job, record.job_clf)

# Build a list of (index, row) tuples to hand over to the worker pool.
rows_to_process = list(df_table.iterrows())

# Use a ThreadPoolExecutor instead of a ProcessPoolExecutor: process_row ends
# up in classify_job, which issues a network request for unknown jobs.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # map() yields the results in the order of rows_to_process; an exception
    # raised in a worker is re-raised here when its result is retrieved.
    results = executor.map(process_row, rows_to_process)
    
    # Update the DataFrame with the results as they become available.
    for idx, job_clf in results:
        df_table.loc[idx, 'job_clf'] = job_clf

In [None]:
# Persist the fully classified table. This replaces the incremental
# (job, answer) rows that classify_job appended to the same path.
# NOTE(review): to_csv writes the index by default, which comes back as an
# "Unnamed: 0" column when the file is read again — consider index=False.
df_table.to_csv(output_file)
df_table