In [1]:
import os
os.chdir("..")

# Préparation 

## Imports

In [2]:
import pandas as pd

pd.options.display.max_columns = 100

In [3]:
from src.utils import get_all_schema_path
from tableschema import Schema

## Lecture et mise en forme

In [4]:
csv_files = [
    "Variables_PMSI_HAD_181214170822.csv",
"Variables_PMSI_MCO_181214170820.csv",
"Variables_PMSI_PSY_181214170824.csv",
"Variables_PMSI_SSR_181214170823.csv"
]

In [44]:
df_list = []
for csv_name in csv_files:
    df = pd.read_csv("notebooks/Envoi_Dico Alimentation/" + csv_name, 
                 encoding="latin1", sep=';', dtype='str')
    columns = list(df.columns)
    df["produit"] = csv_name[10:18]
    
    df = df[["produit"] + columns]
    df_list.append(df)

df = pd.concat(df_list)

In [45]:
df.columns = ['produit', 
              'name_table', 
              'title_table', 
              'name', 
              'type', 
              'length',
              'description',
              "variable_jointure",
              "date_created",
              "date_deleted",
              "observation_variable",
              'regle_gestion', 
              'famille_concept', 
              "date_created_table",
              "date_deleted_table",
              "sensible_ou_medicale"
             ]

In [46]:
df.head(2)

Unnamed: 0,produit,name_table,title_table,name,type,length,description,variable_jointure,date_created,date_deleted,observation_variable,regle_gestion,famille_concept,date_created_table,date_deleted_table,sensible_ou_medicale
0,PMSI_HAD,T_HADaaA,Table des actes,ACT_COD,Caractère,1,Code de l'activité,,2010,,,,,2010,,
1,PMSI_HAD,T_HADaaA,Table des actes,CCAM_COD,Caractère,7,Code de l'acte CCAM (hors extension PMSI),,2010,,,,,2010,,


In [47]:
df.duplicated().sum()

6

In [48]:
df = df.drop_duplicates()

## Standardize values

In [50]:
def map_with_mapping(s):
    if s in mapping:
        return mapping[s]
    return s

### fillna et strip


In [51]:
df = df.fillna("")

In [52]:
for column in df.columns:
    df[column] = df[column].str.strip()

### name_table

In [53]:
df.name_table = df.name_table.str.upper()

### produit

In [54]:
df.produit.value_counts()

PMSI_MCO    1178
PMSI_SSR     608
PMSI_HAD     539
PMSI_PSY     372
Name: produit, dtype: int64

In [55]:
mapping = {
    'PMSI_MCO': 'PMSI MCO',
    'PMSI_SSR' : 'PMSI SSR',
    "PMSI_HAD" : "PMSI HAD",
    "PMSI_PSY" : "PMSI RIM-P"
}
df.produit = df.produit.map(lambda s: mapping[s] if s in mapping else s)
df.produit.value_counts()

PMSI MCO      1178
PMSI SSR       608
PMSI HAD       539
PMSI RIM-P     372
Name: produit, dtype: int64

### type

In [57]:
df.type.value_counts()

Caractère    1887
Numérique     774
Date           20
               12
FAUX            2
date            2
Name: type, dtype: int64

In [58]:
df[df.type == "FAUX"]

Unnamed: 0,produit,name_table,title_table,name,type,length,description,variable_jointure,date_created,date_deleted,observation_variable,regle_gestion,famille_concept,date_created_table,date_deleted_table,sensible_ou_medicale
99,PMSI RIM-P,T_RIPAAC,chainage,EXE_SOI_DTD,FAUX,0,date d'entrée,,2009,,,,,2007,,
100,PMSI RIM-P,T_RIPAAC,chainage,EXE_SOI_DTF,FAUX,0,date de sortie,,2009,,,,,2007,,


In [59]:
mapping = {
    "Numérique": "number",
    "Caractère": "string",
    "Date": "date",
    "numérique": "number",
    "caractère": "string",
    "VARCHAR2": "string",
    "Varchar": "string",
    "date": "date",
    "FAUX": ""
}
df.type = df.type.map(lambda s: mapping[s] if s in mapping else s)

df.type.value_counts()

string    1887
number     774
date        22
            14
Name: type, dtype: int64

### lenght

In [60]:
mask = (df.length.str.contains('\.') | df.length.str.contains('\,'))
df[mask].head(2)

Unnamed: 0,produit,name_table,title_table,name,type,length,description,variable_jointure,date_created,date_deleted,observation_variable,regle_gestion,famille_concept,date_created_table,date_deleted_table,sensible_ou_medicale
144,PMSI MCO,T_MCOAADMIP,Fich comp dmi en sus,NBR_POS_PRI,number,103,Prix d'achat multiplié par le nombre posé,NON,2008,,,,,2008,,non
164,PMSI MCO,T_MCOAAFA,OQN Entete facture,HON_AM_MNR,number,82,Total honoraire remboursable AM,NON,2005,,Observations 2017 : Attention les montants son...,Utilisable par ETA_NUM||RSA_NUM,,2005,,non


In [61]:
df.length = df.length.str.replace('.', ',').str.strip(',')

## Create descriptor

In [79]:
df[df.name_table.str.startswith("T_SUP")]

Unnamed: 0,produit,name_table,title_table,name,type,length,description,variable_jointure,date_created,date_deleted,observation_variable,regle_gestion,famille_concept,date_created_table,date_deleted_table,sensible_ou_medicale
1086,PMSI MCO,T_SUPAAALD,Alternatives à la dialyse (extension de fichie...,ANN_MOI,date,,Mois et année,,2007,2009,,,,2007,2009,
1087,PMSI MCO,T_SUPAAALD,Alternatives à la dialyse (extension de fichie...,ETA_NUM,string,9.0,Numéro FINESS e-PMSI,,2007,2009,,,,2007,2009,
1088,PMSI MCO,T_SUPAAALD,Alternatives à la dialyse (extension de fichie...,FIC_TYP,number,3.0,Type de fichier,,2007,2009,,,,2007,2009,
1089,PMSI MCO,T_SUPAAALD,Alternatives à la dialyse (extension de fichie...,NBR_REA_EAL,number,8.0,Nombre réalisé en activité libérale,,2007,2009,,,,2007,2009,
1090,PMSI MCO,T_SUPAAALD,Alternatives à la dialyse (extension de fichie...,NBR_REA_HAL,number,8.0,Nombre réalisé hors activité libérale,,2007,2009,,,,2007,2009,
1091,PMSI MCO,T_SUPAAALD,Alternatives à la dialyse (extension de fichie...,PRS_COD,string,9.0,Code Prestation,,2007,2009,,,,2007,2009,
1092,PMSI MCO,T_SUPAAATU,Passage aux urgences (extension de fichier .atu),ANN_MOI,date,,Mois et année,NON,2007,2009,,,,2007,2009,non
1093,PMSI MCO,T_SUPAAATU,Passage aux urgences (extension de fichier .atu),AUT_TYP,string,6.0,Type d'autorisation,NON,2007,2009,Observations 2011 : Code : AUT_TYP : POSU >>P...,,,2007,2009,non
1094,PMSI MCO,T_SUPAAATU,Passage aux urgences (extension de fichier .atu),ETA_NUM,string,9.0,Numéro FINESS e-PMSI,OUI,2007,2009,,,,2007,2009,non
1095,PMSI MCO,T_SUPAAATU,Passage aux urgences (extension de fichier .atu),ETA_NUM_GEO,string,9.0,Premier numéro FINESS géographique,NON,2007,2009,,,,2007,2009,non


In [77]:
df.name_table.str[:7].value_counts()

T_MCOAA    1126
T_SSRAA     608
T_HADAA     539
T_RIPAA     372
T_SUPAA      52
Name: name_table, dtype: int64

In [80]:
df.name_table.str[:5] + 'aa_nn'

0      T_HAD
1      T_HAD
2      T_HAD
3      T_HAD
4      T_HAD
5      T_HAD
6      T_HAD
7      T_HAD
8      T_HAD
9      T_HAD
10     T_HAD
11     T_HAD
12     T_HAD
13     T_HAD
14     T_HAD
15     T_HAD
16     T_HAD
17     T_HAD
18     T_HAD
19     T_HAD
20     T_HAD
21     T_HAD
22     T_HAD
23     T_HAD
24     T_HAD
25     T_HAD
26     T_HAD
27     T_HAD
28     T_HAD
29     T_HAD
       ...  
578    T_SSR
579    T_SSR
580    T_SSR
581    T_SSR
582    T_SSR
583    T_SSR
584    T_SSR
585    T_SSR
586    T_SSR
587    T_SSR
588    T_SSR
589    T_SSR
590    T_SSR
591    T_SSR
592    T_SSR
593    T_SSR
594    T_SSR
595    T_SSR
596    T_SSR
597    T_SSR
598    T_SSR
599    T_SSR
600    T_SSR
601    T_SSR
602    T_SSR
603    T_SSR
604    T_SSR
605    T_SSR
606    T_SSR
607    T_SSR
Name: name_table, Length: 2697, dtype: object

In [74]:
for i, (produit, name_table) in df[["produit", 'name_table']].drop_duplicates().iterrows():
    # Restriction table
    sdf = df[(df.produit == produit) & (df.name_table == name_table)]
    
    assert sdf.title_table.nunique() == 1
    title = sdf.title_table.unique()[0]

   
    # Lecture schema
    schema_path = "schemas/PMSI/{}/{}.json".format(produit, name_table)
    print(schema_path)
    assert os.path.exists(schema_path)
    schema = Schema(schema_path)

    
    # MAJ table
    schema.descriptor.update({
        "name": name_table, 
        "title": title, 
        "produit": produit
    })
    
    
    # Différences d'ensembles de variables
    schema_names = set([name.upper() for name in schema.field_names]) 
    dico_names = set(sdf.name)
    set_schema_dico = schema_names - dico_names
    set_dico_schema = dico_names - schema_names
      

    if set_schema_dico or set_dico_schema:
        print()
        print(produit, name_table, title) 
        if set_dico_schema:
            for field in set_dico_schema:
                print("- Variables absente dans le schéma", field)
    
    # MAJ variables
    for field in schema.descriptor['fields']:
        name = field['name']
        if not name.upper() in dico_names:
            print('- Variable absente dans le dico cnam', name)
            continue
        
        columns_to_update = ['description', 'type', 'nomenclature', 'length']
        columns_to_update = ['nomenclature']
        record = sdf[sdf.name == name.upper()][columns_to_update].to_dict(orient="records")[0]
        for key in columns_to_update:
            if record[key] == "":
                record.pop(key)
        field.update(record)
    
    try:
        schema.commit(strict=True)
    except Exception as e:
        print(e.errors)
        raise e
    
    schema.save(schema_path, ensure_ascii=False)

schemas/PMSI/PMSI HAD/T_HADAAA.json


AssertionError: 