In [29]:
import ast
import concurrent.futures
import csv
import os
import threading

import numpy as np
import openai
import pandas as pd

In [30]:
# Cache of jobs classified by previous runs (columns: job, answer).
# The file is only created further down in this notebook, so on a first run
# it may be missing or empty: start from an empty cache instead of failing.
try:
    df_job_mapped = pd.read_csv('classification_results.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    df_job_mapped = pd.DataFrame(columns=['job', 'answer'])

In [None]:
def _first_job(raw):
    """Return the first entry of a stringified list, or None if there is none."""
    # Empty spreadsheet cells are read as NaN, which ast.literal_eval rejects;
    # treat them like an empty job list so the row is dropped, not a crash.
    if not isinstance(raw, str) and pd.isna(raw):
        return None
    jobs = ast.literal_eval(raw)
    return jobs[0] if len(jobs) > 0 else None


def load_df(path_annuaire):
    """Load the directory ("annuaire") spreadsheet and keep one job per row.

    The ``JOB`` column is stored as the text of a Python list (e.g.
    ``"['maçon']"``). It is parsed and reduced to its first element; rows
    whose list is empty, or whose cell is missing, are dropped.

    Args:
        path_annuaire (str): path to the Excel file of the directory.

    Returns:
        pd.DataFrame: the table with a scalar ``JOB`` column, rows without a
        job removed, and a fresh 0..n-1 index.
    """
    df = pd.read_excel(path_annuaire, index_col=0)
    df['JOB'] = df['JOB'].apply(_first_job)
    return df.dropna(subset=['JOB']).reset_index(drop=True)
# Load the directory table from its Excel export (path is relative to the notebook).
df_table = load_df("data/table_1923_v3-1_geoloc.xlsx")

In [32]:
def classify_job_already_classified(df):
    """Pre-fill ``job_clf`` from the classifications cached in ``df_job_mapped``.

    Calling the LLM is slow and expensive, so every job that already appears
    in the module-level ``df_job_mapped`` table is labelled from it directly.
    Jobs that are not in the table get ``None``.

    Args:
        df (pd.DataFrame): table with a ``JOB`` column. The ``job_clf``
            column is also written onto this frame in place.

    Returns:
        pd.DataFrame: a deep copy of ``df`` including the ``job_clf`` column.
    """
    # Build the job -> label table once instead of scanning df_job_mapped for
    # every row. The label is the second column, and setdefault keeps the
    # first occurrence when a job is listed several times.
    known_labels = {}
    for job, label in zip(df_job_mapped['job'], df_job_mapped.iloc[:, 1]):
        known_labels.setdefault(job, label)

    # dict.get yields None (not NaN) for unknown jobs, which is the marker
    # classify_job checks for.
    df['job_clf'] = df.JOB.apply(lambda job: known_labels.get(job))
    return df.copy(deep=True)


In [None]:
# Pre-fill job_clf from the cached classifications, then display the table.
df_table = classify_job_already_classified(df_table)
df_table

Unnamed: 0,LASTNAME,FIRSTNAME,ORG,JOB,LOC,LOCJOB,CONDITION,line,LOC_pt,LOC_addr,LOCJOB_pt,LOCJOB_addr,job_clf
0,['Abbondioli'],['Célestin'],[],maçon,[],[],[],"['Abbondioli Célestin, maçon, Fraisse 8']",,,,,"construction,charpenterie"
1,['Abbondioli'],['Elvézia'],[],margeuse,['Maupas 23bis'],[],[],"['– Elvézia, margeuse, Maupas 23bis.']","(46.5254199, 6.621479799999999)","Av. de France 23 Bis, 1004 Lausanne, Switzerland",,,industrie textile
2,['Abbondioli-Pahud'],['Jos.'],[],maçon,['Martheray 34'],[],[],"['Abbondioli-Pahud Jos., maçon, Martheray 34.']",,,,,"construction,charpenterie"
3,['Abbondioli-Perrin'],['Quinto'],[],maçon,['rue du Lac 14'],[],[],"['Abbondioli-Perrin Quinto, maçon, rue du Lac ...","(46.5074873, 6.6257717)","Rue du Lac 14, 1007 Lausanne, Switzerland",,,"construction,charpenterie"
4,['Abbondioli-Perrin'],['Bertha'],[],mén.,['Ch.-de-Bourg 21'],[],[],"['– Bertha, mén., Ch.-de-Bourg 21.']","(46.5201595, 6.6349285)","Rue de Bourg 21, 1003 Lausanne, Switzerland",,,employées pour le ménage et la ferme
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24022,['Zweigart-Reymondin'],['Wilhelm-L.'],[],automobiles,"['pl. Chauderon 26', 'Garage Retraites 6', 'r....",[],[],"['Zweigart-Reymondin Wilhelm-L., automobiles, ...","(46.5237969, 6.6241783)","Pl. Chauderon 26, 1003 Lausanne, Switzerland",,,
24023,['Zweigart-Rubin'],['André'],[],man.,['St-Laurent 33'],[],[],"['Zweigart-Rubin André, man., St-Laurent 33.']","(46.5229104, 6.629782800000001)","Rue Saint-Laurent 33, 1003 Lausanne, Switzerland",,,"employés pour tous,célibataires"
24024,['Zweilli-Kurth'],['Walther'],[],serrur.,"['Ch. de la Fontaine', 'Malley']",[],[],"['Zweilli-Kurth Walther, serrur., Ch. de la Fo...","(46.5245249, 6.6152223)","Sébeillon / Malley, 1004 Lausanne, Switzerland",,,
24025,['Zwicky-Recordon'],['Emile'],[],pharm.,['Petit-Chêne 26'],[],[],"['Zwicky-Recordon Emile, pharm., Petit-Chêne 2...","(46.5187855, 6.631149799999999)","Rue du Petit-Chêne 26, 1003 Lausanne, Switzerland",,,


In [34]:
# Closed list of target categories: classify_job embeds this list in the LLM
# prompt and only accepts an answer that is exactly one of these strings.
# NOTE(review): the name suggests these are the sectors of a wage table — confirm.
# The strings are compared verbatim with model answers, so do not normalise
# their spelling or spacing here.
salaires = ["produits alimentaires","vêtements, lingerie,chaussures,literies", "industrie textile", "industrie du papier", 
            "arts graphiques", "industrie chimique", "bois,liège,meubles", "pierre et terre", "industrie des métaux",  
            "horlogerie","bijouterie,gravure,frappe","construction,charpenterie","gaz,eau,éléctricité","commerce de gros",
            "commerce de détail","banques,établissements financiers","assurances privées","agences,locations,consultations",
            "affaires immobilières,location","bureaux de consultation","hôtellerie,restauration","transports","administration publique",
            "réparations", "vachers célibataires","employés pour tous,célibataires","employées pour le ménage et la ferme",
            "journaliers,entretien","journalières,entretien"]

In [35]:
#Load the api_key
path_key = "../key_openai/key.txt"
# Reset first: if the read fails, api_key must not stay undefined (NameError
# later on) nor keep a stale value from an earlier run of this cell.
api_key = None
try:
    with open(path_key, 'r') as file:
        # An empty key file is treated the same as a missing key.
        api_key = file.read().strip() or None
except OSError as e:
    # Only file-access problems are expected here; anything else should surface.
    # NOTE(review): with api_key=None the OpenAI client is expected to fall back
    # to the OPENAI_API_KEY environment variable or raise — confirm for the
    # installed version.
    print(f"error : {e}")

In [43]:
client = openai.Client(api_key=api_key, base_url="https://fmapi.swissai.cscs.ch/")

output_file = "classification_results.csv"

# This file is also the cache that is read back at the top of the notebook, so
# it has to survive re-runs: write the header only when the file is missing or
# empty, and never truncate rows collected by earlier runs.
if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
    with open(output_file, mode='w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(["job", "answer"])

# classify_job is run from worker threads; this lock keeps the CSV appends
# from interleaving.
_CSV_WRITE_LOCK = threading.Lock()


def classify_job(job, job_clf, max_attempts=9):
    """Return the category of ``job``, asking the LLM only when needed.

    If ``job_clf`` is already set it is returned unchanged and nothing is
    written. Otherwise the model is queried until its answer is one of the
    entries of ``salaires`` or ``max_attempts`` calls have been made. The
    outcome is appended to ``output_file`` as a ``job, answer`` row.

    Args:
        job (str): job title to classify.
        job_clf: existing classification, or None if there is none yet.
        max_attempts (int): maximum number of model calls for this job.

    Returns:
        The existing ``job_clf``, the accepted category, or None when no
        valid answer was obtained.
    """
    # NOTE(review): a NaN job_clf (e.g. an empty answer read back from the
    # CSV) is not None and is therefore returned as-is — confirm this is wanted.
    if job_clf is not None:
        return job_clf

    # Both messages are the same for every attempt, so build them once.
    messages = [
        {
            "role": "system",
            "content": (
                f"You'll be given a job and you'll have to classify it according to the nearest job in a list we give you: {salaires}. Answer only with the profession in which you classify the job you are given"
            ),
        },
        {
            "role": "user",
            "content": f"{job}",
        },
    ]

    answer = None
    for _ in range(max_attempts):
        res = client.chat.completions.create(
            model="meta-llama/Llama-3.3-70B-Instruct",
            messages=messages,
            max_tokens=40,
            stream=True,
        )

        # Reassemble the streamed reply, skipping chunks without content.
        candidate = "".join(
            chunk.choices[0].delta.content
            for chunk in res
            if len(chunk.choices) > 0 and chunk.choices[0].delta.content
        )
        # Surrounding whitespace or quotes would make an otherwise correct
        # answer fail the exact-match test below.
        candidate = candidate.strip().strip("'\"").strip()
        if candidate in salaires:
            answer = candidate
            break

    with _CSV_WRITE_LOCK:
        with open(output_file, mode='a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow([job, answer])

    return answer



In [None]:
import concurrent.futures

def process_row(idx_row_tuple):
    """Classify one ``(index, row)`` pair, e.g. as yielded by ``iterrows()``.

    Returns:
        tuple: the row's index label and the category from ``classify_job``.
    """
    row_label, record = idx_row_tuple
    return row_label, classify_job(record.JOB, record.job_clf)

# Only rows without a label need work, and many rows share the same job:
# send one representative (index, row) pair per distinct job, so each job is
# classified (and written to the results file) once instead of once per row.
needs_label = df_table['job_clf'].isna()
rows_to_process = list(
    df_table[needs_label].drop_duplicates(subset='JOB').iterrows()
)

# Threads rather than processes: the work is waiting on network calls.
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(process_row, rows_to_process)

    # executor.map yields results in submission order, so they line up with
    # rows_to_process.
    job_to_clf = {
        row.JOB: job_clf
        for (_, row), (_, job_clf) in zip(rows_to_process, results)
    }

# Broadcast each job's label to every unlabelled row that has that job.
df_table.loc[needs_label, 'job_clf'] = df_table.loc[needs_label, 'JOB'].map(job_to_clf)