# Library imports, configurations

In [1]:
import os
import re
import s3fs
import pandas as pd
import requests
import json
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
from unidecode import unidecode
from wordcloud import WordCloud
import nltk
nltk.download('punkt')

from stop_words import get_stop_words
import unicodedata

from nltk.tokenize import word_tokenize
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch_dsl import connections
from bs4 import BeautifulSoup as bs

[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

In [32]:
BUCKET = "jplaton/diffusion"
fs.ls(BUCKET)

['jplaton/diffusion/.keep',
 'jplaton/diffusion/ted',
 'jplaton/diffusion/visio_mel']

In [4]:
BUCKET_CLEM = "clementg/diffusion"
fs.ls(BUCKET_CLEM)

['clementg/diffusion/.keep',
 'clementg/diffusion/Fiche de Poste DINUM - francetransfert-3595970061- pdf.csv',
 "clementg/diffusion/Référentiel de l'organisation administrative de l'Etat.csv"]

# Functions

In [5]:
# Suppression des accents

def remove_accent(s):

    s = unidecode(str(s))
    return(s)

# Suppression de la ponctuation

def remove_punctuation(s):
    # On retire la ponctuation
 
    s = re.sub(r'[^A-Za-z0-9]',' ',str(s))
    return(s)

# Réduction de la casse

def tolower(s):
    s = ' '.join(str(s).lower().split())
    return(s)


# Sur cette base, on crée une petite fonction qui retire les stop words
stopwords = get_stop_words('french')   
stopwords = [unicodedata.normalize('NFKD',m).encode('ASCII', 'ignore').decode() for m in stopwords]

def remove_stopwords(s):
    s = [w for w in word_tokenize(s) if w not in stopwords]
    s = ' '.join(s)
    return(s)

def remove_accent_from_df(df,list_cols):
    df_c =  df.copy()
    df_c[list_cols] = df_c[list_cols].map(remove_accent)
    return df_c

def remove_punctuation_from_df(df,list_cols):
    df_c =  df.copy()
    df_c[list_cols] = df_c[list_cols].map(remove_punctuation)
    return df_c

def tolower_df(df,list_cols):
    df_c =  df.copy()
    df_c[list_cols] = df_c[list_cols].map(tolower)
    return df_c

# Scraping de annuaire service public pour aller chercher les services à aprtir des ids
# def get_service_from_scraping(service_id):
#     try:
#         url=f'https://lannuaire.service-public.fr/gouvernement/{service_id}'
#         response = requests.get(url)
#         html = response.content
#         soup = bs(html, "lxml")
#         return soup.title.get_text().replace(' - Annuaire | Service-public.fr', '')
#     except:
#         return ''

# DATA Download

In [35]:
FILE_PATH_TED = "ted"
FILE_PATH_TED_S3 = BUCKET + "/" + FILE_PATH_TED

with fs.open(FILE_PATH_TED_S3+"/offres_num.json", mode="rb") as file_in:  
    offres =  pd.read_json(file_in) #json.load(file_in)

# with fs.open(FILE_PATH_TED_S3+"/offres-datagouv-20230409.csv", mode="rb") as file_in:
#     offres_2023_df = pd.read_csv(file_in, sep=";")

# with fs.open(FILE_PATH_TED_S3+"/offres-datagouv-20221225.csv", mode="rb") as file_in:   
#     offres_2022_df = pd.read_csv(file_in, sep=";")

# with fs.open(FILE_PATH_TED_S3+"/Bulloterie_10_Saison_2.xlsx", mode="rb") as file_in:  
#     bulloterie_df =  pd.read_excel(file_in, sheet_name=0,header=[0,1])


In [6]:
with fs.open(BUCKET_CLEM+"/Référentiel de l'organisation administrative de l'Etat.csv", mode="rb") as file_in:
    orga_csv_df = pd.read_csv(file_in, sep=";",encoding='utf-8').rename(columns={'sigle (construction)': 'sigle_const','service_parent.sigle (construction)':'service_parent_sigle_const'})

# with fs.open(BUCKET_CLEM+"/Fiche de Poste DINUM - francetransfert-3595970061- pdf.csv", mode="rb") as file_in:
#     fdp_dinum_df = pd.read_csv(file_in, sep=";",encoding='Windows-1252')
# # orga_sigle = orga_csv_df[['id','sigle (construction avec le nom)']].rename(columns={'sigle (construction avec le nom)':'sigle_calcul'})
# with fs.open(FILE_PATH_TED_S3+"/dila_refOrga_admin_Etat_fr_20230505.json", mode="rb") as file_in:  
#     orga =  json.load(file_in)
# orga_df['arbo_service_id'] = ''
# for k,v in ref_pere_fils.items():
#    orga_df.loc[(orga_df['id'].isin(v))|(orga_df['id']==k),'arbo_service_id'] = orga_df.loc[(orga_df['id'].isin(v))|(orga_df['id']==k),'arbo_service_id'].apply(lambda x: [k]+v)

  orga_csv_df = pd.read_csv(file_in, sep=";",encoding='utf-8').rename(columns={'sigle (construction)': 'sigle_const','service_parent.sigle (construction)':'service_parent_sigle_const'})


In [18]:
orga_csv_df.dropna(axis = 0, how = 'all', inplace = True)

### Traitement référentiel services

In [20]:
ref_service_id_nom = orga_csv_df.set_index('id').to_dict()['nom']

In [21]:
ref_service_id_sigle = orga_csv_df.set_index('id').to_dict()['sigle_const']

In [22]:
services_clean = ( orga_csv_df
    .pipe(remove_accent_from_df,['nom','sigle_const'])
    .pipe(remove_punctuation_from_df,['nom','sigle_const'])
    .pipe(tolower_df,['nom','sigle_const'])
)

In [23]:
services_clean = services_clean[['id','nom','sigle_const']] 

### Traitement offres

In [37]:
offres.columns = offres.columns.map(remove_accent).map(remove_punctuation).map(tolower).map(lambda s: s.replace(' ', '_'))

In [41]:
offres_clean = ( offres
    .pipe(remove_accent_from_df,['titre_du_poste','employeur','information_mission','profil_recherche','metier_de_reference'])
    .pipe(remove_punctuation_from_df,['titre_du_poste','employeur','information_mission','profil_recherche','metier_de_reference'])
    .pipe(tolower_df,['titre_du_poste','employeur','information_mission','profil_recherche','metier_de_reference'])
)

In [43]:
offres_clean = offres_clean[['titre_du_poste','employeur','information_mission','profil_recherche','metier_de_reference']]

# Insertion des données dans Elastic Search

In [44]:
# Paramétrage du client
es_client = connections.create_connection(hosts=['http://elasticsearch-master:9200/'])

In [45]:
# Fontion qui transforme un dataframe en document/index à stocker dans ElasticSearch
def doc_generator(df,index_name):
    df_iter = df.iterrows()
    for index, document in df_iter:
        yield {
                "_index": index_name,
                "_source": document,
            }

In [29]:
# Indexation des services
helpers.bulk(es_client, doc_generator(services_clean,"services"))

  helpers.bulk(es_client, doc_generator(services_clean,"services"))


(10913, [])

In [89]:
def get_services(es,query,nb_results):
    fullsearch = es.search(index = "services", # l'index dans lequel on cherche
                       q = query, # notre requête textuelle
                       size = nb_results)
    res = [fullsearch['hits']['hits'][x]['_source']['id'] for x in range(min(nb_results,len(fullsearch['hits']['hits'])))]
    score = [fullsearch['hits']['hits'][x]['_score'] for x in range(min(nb_results,len(fullsearch['hits']['hits'])))]
    return res, score

In [90]:
def add_services(df, list_cols,es,nb_results):
    df['service_id'] = df[list_cols].apply(lambda row: get_services(es,' '.join(row.values.astype(str)),nb_results)[0], axis=1)
    df['scores'] = df[list_cols].apply(lambda row: get_services(es,' '.join(row.values.astype(str)),nb_results)[1], axis=1)
    return df

In [91]:
list_cols = ['employeur']
offres_with_services = offres_clean.pipe(add_services,list_cols,es_client,3)
offres_with_services['service_nom'] = offres_with_services['service_id'].map(lambda x: [ref_service_id_nom[n] for n in x if n in ref_service_id_nom.keys()])

  fullsearch = es.search(index = "services", # l'index dans lequel on cherche
  fullsearch = es.search(index = "services", # l'index dans lequel on cherche


In [92]:
offres_with_services.to_csv('offres_with_services.csv',index=False,sep=';',encoding='utf-8')

In [101]:
temp = offres_with_services[['service_id','titre_du_poste','employeur','information_mission','profil_recherche','metier_de_reference']]
temp['mots_cles'] = temp[['information_mission','profil_recherche','metier_de_reference']].apply(lambda row: ' '.join(row.values.astype(str)),axis=1)
temp = (temp.drop(['employeur','information_mission','profil_recherche','metier_de_reference'],axis=1)
        .explode('service_id')
        .groupby('service_id')
        .agg({'mots_cles': lambda x: ' '.join(x),'titre_du_poste': lambda x : list(x)})
        .reset_index()
        .rename(columns={'service_id':'id','titre_du_poste':'liste_postes'})
       )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['mots_cles'] = temp[['information_mission','profil_recherche','metier_de_reference']].apply(lambda row: ' '.join(row.values.astype(str)),axis=1)


In [96]:
services_with_offres = pd.merge(orga_csv_df,
                               temp,on='id',how='left')

In [97]:
services_with_offres.to_csv('services_with_offres.csv',index=False,sep=';',encoding='utf-8')

In [57]:
# Indexation des services enrichies des offres
helpers.bulk(es_client, doc_generator(services_with_offres.fillna(''),"services_with_offres_ds_50"))

### Autres mode de recherche

In [55]:
from elasticsearch_dsl import Search

 # Recherche dans l'ensemble des champs le meilleur écho (le plus pertinent) #
fullsearch_easy = es_client.search(index = "services", # l'index dans lequel on cherche
                       q = "employeur secretariat general dir sg grand centre ministere de la justice secretariat general affectation delegation inter regionale grand centre dir sg gc departement informatique et telecommunications dit de dijon", # notre requête textuelle
                              size = 1) # taille de l'ensemble les échos souhaités


  fullsearch_easy = es_client.search(index = "services", # l'index dans lequel on cherche


In [86]:
fullsearch_easy['hits']['hits'][0]['_score']

40.246254

In [None]:
 # Recherche dans l'ensemble des champs le meilleur écho (le plus pertinent) #
fullsearch_scientist = es_client.search(index = "services", # l'index dans lequel on cherche
                       q = "centre hospitalier universitaire de tours	data scientist datascientist centre de donnees cliniques", # notre requête textuelle
                              size = 1) # taille de l'ensemble les échos souhaités
 # Recherche dans l'ensemble des champs le meilleur écho (le plus pertinent) #
fullsearch_engineer= es_client.search(index = "services", # l'index dans lequel on cherche
                       q = "service hydrographique et oceanographique de la marine	data engineer	ingenieur en traitement de l information conception developpement et experimentation d ia h f", # notre requête textuelle
                              size = 1) # taille de l'ensemble les échos souhaités
fullsearch_designer = es_client.search(index = "services", # l'index dans lequel on cherche
                       q = "caisse des depots et consignations	architecte technique expert conception et architecture de donnees d entreprise data designer h f	", # notre requête textuelle
                              size = 1) # taille de l'ensemble les échos souhaités

### Traitement Fiche de poste DNUM

In [14]:
# Traitement des colonnes
fdp_dinum_df.columns = fdp_dinum_df.columns.map(remove_accent).map(remove_punctuation).map(tolower).map(lambda s: s.replace(' ', '_'))
fdp_clean = ( fdp_dinum_df
    .pipe(remove_accent_from_df,['poste','intitule_du_service_demandeur_bureau_section','corps_grade','presentation','missions','competences'])
    .pipe(remove_punctuation_from_df,['poste','intitule_du_service_demandeur_bureau_section','corps_grade','presentation','missions','competences'])
    .pipe(tolower_df,['poste','intitule_du_service_demandeur_bureau_section','corps_grade','presentation','missions','competences'])
)
fdp_clean = fdp_clean[['poste','intitule_du_service_demandeur_bureau_section','corps_grade','presentation','missions','competences']] #fdp_clean.head(2)
