# **Libraries importing and drive mounting**


In [1]:
import pandas as pd
from datetime import datetime
from sqlalchemy import create_engine


# **Method Definition**


### checkValue function 

This function gets an attribute of a specific table and validate its values against a given string collection that contains valid values and returns the result in a dataframe format which contains the same columns as the CellsMetadata table.

In [2]:
def checkValue(attribute, table, string_collection):
    tableName= table#get the table name 
    table= globals()[table] # convert the table name given as argument to a variable which represent the global variable table
    result= pd.DataFrame(columns=['appMethID', 'tableID','rowID', 'colID', 'timestamp', 'DQValue']) # create a dataframe named result with the same columns of CellsMetadata
    result['rowID']=table[table.columns[0]] # fill the column rowID with the row IDs
    result['DQValue']= 1*table[attribute].isin(string_collection) #check whether attribute values are validated against string_collection. Return 1 if they are valid and 0 otherwise 
    result['appMethID']=checkValue.__name__ #fill the appMethID column of the result dataframe with the name of the current function 
    result['tableID']=tableName #fill the tableID column with the name of the table given as argument
    result['colID']=attribute #fill the colID column  with the attribute name given as argument 
    result['timestamp']=datetime.now() #fill the timestamp column with the current datetime 
    return result #return the final result
  

### check_format function

In [3]:
def check_format(date_string,date_format): 
        try: 
            datetime.strptime(date_string,date_format) #If the string (first argument) and the date format (second argument)
            return 1                                         #passed to the strptime() doesn't match, we will get ValueError and it returns 0
        except ValueError:                                           #otherwise it will return 1
            return 0 
        
            

### checkFormatDate function

In [4]:
def checkFormatDate(attribute, table, date_format):
    tableName=table #get the table name
    table= globals()[table] # convert the table name given as argument to a variable which represent the global variable table
    result= pd.DataFrame(columns=['appMethID', 'tableID','rowID', 'colID', 'timestamp', 'DQValue']) # create a dataframe named result with the same columns of CellsMetadata
    result['rowID']=table[table.columns[0]] # fill the  rowID column with the row IDs
    result['DQValue']=  table[attribute][table[attribute].notna()].apply(lambda x: check_format(x, date_format) ) #fill the column DQValue by applying the check_format function to check if every attribute value, except NAN values, respects the given date format. 
    result['appMethID']=checkFormatDate.__name__ #fill the appMethID column  of the dataframe result with the name of the current function 
    result['tableID']=tableName  #fill the tableID column with the name of the table given as argument
    result['colID']=attribute #fill the colID column with the attribute name given as argument
    result['timestamp']=datetime.now() #fill the column timestamp with the current datetime 
    return result #return the final result

# **Data Loading**

In [5]:
patients = pd.read_excel('C:/imen/pfe_on/CHRU/Sources/CHRU_source.xlsx',sheet_name="Patients", dtype={'DATE_DIAG': str})

In [6]:
patients #printing the dataframe

Unnamed: 0,ID,SEXE,DDN,ACT_PROF,ACT_PROF_PREC,DIAG,AUTR_DIAG,DIAG_PROBA,DATE_DIAG,PREM_SYM,...,Détail de la cause du décès : [ DET_DCD_V ] [M6 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M8 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M10 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M14 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M17 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M18 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M19 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M20 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M21 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M30 - F0]
0,1,Féminin,23/07/1939,Retraité,Agriculteur exploitant,Non encore déterminé,,,,juil-18,...,,,,,,,,,,
1,2,Masculin,18/11/1941,,,Non encore déterminé,,,,janv-18,...,,,,,,,,,,
2,3,Masculin,27/10/1941,Retraité,Cadre d'entreprise,Non encore déterminé,,,,déc-16,...,,,,,,,,,,
3,4,Masculin,09/09/1959,"Contremaître, agent de maîtrise",,SLA,,Forme certaine,juin-17,févr-17,...,,,,,,,,,,
4,5,Masculin,27/05/1960,,,SLA,,Forme probable,mai-17,oct-15,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,1656,Féminin,18/01/1936,,,SLA,,Forme certaine,mars-06,juil-05,...,,,,,,,,,,
1656,1657,Féminin,23/01/1949,Ouvrier non qualifié,,SLA,,Forme certaine,déc-09,juil-09,...,,,,,,,,,,
1657,1658,Féminin,28/11/1947,,,SLA,,Forme probable,nov-21,avr-21,...,,,,,,,,,,
1658,1659,Féminin,20/07/1940,,,SLA,,Forme certaine,mai-21,mai-20,...,,,,,,,,,,


# **Connection to database**

In [13]:
#create an engine that will allow us to connect to the postgres database by specifying the database characteristics (user=postgres,password=enisgi1,host=localhost,port=5432,database_name=new_db)
engine = create_engine('postgresql+psycopg2://postgres:test123@localhost:5432/new_db')


# **Method Execution and result storage**

### Executing the first function and put the result in the variable result_1

In [7]:
result_1= checkValue('LIEU_DEBUT','patients', ['Bulbaire','Spinal','Respiratoire']) #you should enter the table that we will work with as a string and with the same name of the dataframe

In [8]:
result_1 #printing the result_1

Unnamed: 0,appMethID,tableID,rowID,colID,timestamp,DQValue
0,checkValue,patients,1,LIEU_DEBUT,2023-04-14 14:24:50.879260,1
1,checkValue,patients,2,LIEU_DEBUT,2023-04-14 14:24:50.879260,0
2,checkValue,patients,3,LIEU_DEBUT,2023-04-14 14:24:50.879260,0
3,checkValue,patients,4,LIEU_DEBUT,2023-04-14 14:24:50.879260,0
4,checkValue,patients,5,LIEU_DEBUT,2023-04-14 14:24:50.879260,0
...,...,...,...,...,...,...
1655,checkValue,patients,1656,LIEU_DEBUT,2023-04-14 14:24:50.879260,0
1656,checkValue,patients,1657,LIEU_DEBUT,2023-04-14 14:24:50.879260,0
1657,checkValue,patients,1658,LIEU_DEBUT,2023-04-14 14:24:50.879260,0
1658,checkValue,patients,1659,LIEU_DEBUT,2023-04-14 14:24:50.879260,1


### Storing the result of the method checkValue in CellsMetadata Table

In [14]:
result_1.to_sql('CellsMetadata',engine, if_exists='append', index=False) # insert the dataframe result_1 (the result of the first executed function)  in the database table called "CellsMetadata" 
#CellsMetadata will be created only once

### Executing the second function and put the result in the variable result_2

In [9]:
result_2= checkFormatDate('DATE_DIAG','patients','%d-%m-%Y') 



In [10]:
result_2 #printing the result_2

Unnamed: 0,appMethID,tableID,rowID,colID,timestamp,DQValue
0,checkFormatDate,patients,1,DATE_DIAG,2023-04-14 14:24:50.963224,
1,checkFormatDate,patients,2,DATE_DIAG,2023-04-14 14:24:50.963224,
2,checkFormatDate,patients,3,DATE_DIAG,2023-04-14 14:24:50.963224,
3,checkFormatDate,patients,4,DATE_DIAG,2023-04-14 14:24:50.963224,0.0
4,checkFormatDate,patients,5,DATE_DIAG,2023-04-14 14:24:50.963224,0.0
...,...,...,...,...,...,...
1655,checkFormatDate,patients,1656,DATE_DIAG,2023-04-14 14:24:50.963224,0.0
1656,checkFormatDate,patients,1657,DATE_DIAG,2023-04-14 14:24:50.963224,0.0
1657,checkFormatDate,patients,1658,DATE_DIAG,2023-04-14 14:24:50.963224,0.0
1658,checkFormatDate,patients,1659,DATE_DIAG,2023-04-14 14:24:50.963224,0.0


### Storing the result of the method checkFormatDate in CellsMetadata Table

In [15]:
result_2.to_sql('CellsMetadata',engine, if_exists='append', index=False) # insert the dataframe result_2 (the result of the second executed function) in the database table called "CellsMetadata" 