# **Libraries importing**


In [1]:
import pandas as pd
from datetime import datetime
from sqlalchemy import create_engine
import numpy as np
import re 



# **Method Definition**


### checkValueInCollection function 

This function checks if values of an attribute are contained in a given collection (which lists all acceptable values).

Input :
   - attribute : an attribute name
   - table : a table name; the one containing the input attribute
   - BD : a dictionnary containing the table name as key and the variable that contains the dataframe (that represents a table             in excel file) as a value of the key.
   - string collection : a collection of strings listing all acceptable values for the input attribute
   
Output : The list of quality values (and identifiers), for each cell of the input attribute (granularity: cell) organized in a dataframe with the following colums :
   - rowid: identifier of the row in the input table, i.e. the value of the third argument
   - colID: identifier of the column in the input table, i.e. the value of the second argument
   - tableID: table name which represents the excel sheet that we worked with, i.e. the value of the third argument
   - appMethID: represents the function name
   - timestamp: represents the time at which we executed the function 
   - DQValue: represents a boolean value, which is the result of applying the algorithm 
    


In [2]:
def checkValueInCollection(BD,attributeName, tableName, stringCollection):
    
    #retrieve the dataframe that represents the tableName given as an argument
    table= BD[tableName] 
    
    # create a dataframe named result with the same columns of CellsMetadata
    result= pd.DataFrame(columns=['appMethID', 'tableID','rowID', 'colID', 'timestamp', 'DQValue']) 
    
    #calculate quality value
    result['DQValue'] = 1*table[attributeName].isin(stringCollection) #check whether attribute values are validated against string_collection. 
                                                                      #Return 1 if they are valid and 0 otherwise 
                                                                      #multiplying table[attributeName].isin(stringCollection) to convert True/False to 1/0 
                                                                      #because la méthode isin() returns True/False
    #set identifiers, method and timestamp
    result['tableID'] = tableName #fill the tableID column with the name of the table given as argument
    result['colID'] = attributeName #fill the colID column  with the attribute name given as argument 
    result['rowID'] = table[table.columns[0]] # fill the column rowID with the row IDs   
    result['appMethID']= checkValueInCollection.__name__ #fill the appMethID column of the result dataframe with the name of the current function 
    result['timestamp']= datetime.now() #fill the timestamp column with the current datetime 
    
    return result #return the final result which is a dataframe
  

### checkDateFormat function

This function checks if every value of a date attribute of a given table respects the date format provided as argument or not.

Input :
   - attribute : an attribute name
   - table : a table name; the one containing the input attribute
   - BD : a dictionnary containing the table name as key and the variable that contains the dataframe (that represents a table             in excel file) as a value of the key.
   - dateFormat: a string containing a format of date that we will verify our dates against.
   
Output : The list of quality values (and identifiers), for each cell of the input attribute (granularity: cell) organized in a dataframe with the following columns :
   - rowid: identifier of the row in the input table, i.e. the value of the third argument
   - colID: identifier of the column in the input table, i.e. the value of the second argument
   - tableID: table name which represents the excel sheet that we worked with, i.e. the value of the third argument
   - appMethID: represents the function name
   - timestamp: represents the time at which we executed the function 
   - DQValue: represents a boolean value, which is the result of applying the algorithm 

In [25]:
def checkDateFormat(BD,attributeName, tableName, dateFormat):
    
    #Here below an intern function checkFormat that has two inputs a date in any string format and a specific dateFormat to check if the given string respects the date format provided as argument returns 1 in this case and 0 otherwise.
    def checkFormat(dateString,dateFormat): 
        try: 
            datetime.strptime(dateString, dateFormat) #If the string (first argument) and the date format (second argument)
            return 1                                         #passed to the strptime() doesn't match, we will get ValueError and it returns 0
        except ValueError:                                           #otherwise it will return 1
            return 0 
        
    #retrieve the dataframe that represents the tableName given as an argument
    table = BD[tableName]  
    
    #create the schemas of the resulting dataframe
    result = pd.DataFrame(columns=['appMethID', 'tableID','rowID', 'colID', 'timestamp', 'DQValue']) # create a dataframe named result with the same columns of CellsMetadata
    
    
    #set identifiers, method and timestamp
    result['rowID'] =table[table.columns[0]] # fill the  rowID column with the row IDs
    result['appMethID'] = checkDateFormat.__name__ #fill the appMethID column  of the dataframe result with the name of the current function 
    result['tableID'] = tableName  #fill the tableID column with the name of the table given as argument
    result['colID'] = attributeName #fill the colID column with the attribute name given as argument
    result['timestamp'] = datetime.now() #fill the column timestamp with the current datetime 
    
    #calculate quality value
    result['DQValue'] =  table[attributeName][table[attributeName].notna()].apply(lambda x: checkFormat(x, dateFormat) ) #fill the column DQValue by applying the checkFormat function to check if every attribute value, 
                                                                                                                         #except NAN values, respects the given date format. 
    
   
    
    return result #return the final result which is a dataframe

### checkValue

This function calculate the ratio of rows that have a specific value in the given argument "attributeName" and that have a not null id.

Input :
   - value : a specific value  
   - attribute : an attribute name that may contain the input value
   - table : a table name; the one containing the input attribute
   - BD : a dictionnary containing the table name as key and the variable that contains the dataframe (that represents a table             in excel file) as a value of the key.
     
Output : A dataframe containing the calculated quality value (and identifiers), organized in a line (granularity: column)  with the following columns :
  
   - colID: identifier of the column in the input table, i.e. the value of the second argument
   - tableID: table name which represents the excel sheet that we worked with, i.e. the value of the third argument
   - appMethID: represents the function name
   - timestamp: represents the time at which we executed the function 
   - DQValue: a ratio, which is the result of applying the algorithm 

In [4]:
def checkValue(BD, attributeName, tableName, value):

    #retrieve the dataframe that represents the tableName given as an argument
    table = BD[tableName]
    
    #retrieve the table shape for further use 
    x,y = table.shape   # x is the number of lines and y is the number of columns
    
    #calculate quality value
    totalValidTuples= table.loc[(table[attributeName]==value) & (~table['ID'].isnull())].shape[0]
    DQMet= totalValidTuples*100/x
    
    #formulate the result into a dataframe 
    result= pd.DataFrame({'appMethID': checkValue.__name__ , 'tableID':tableName, 'colID':attributeName, 'timestamp': datetime.now(), 'DQValue':DQMet},index=[0])

    return result #return the final result which is a dataframe

### checkNotNull function

This function calculates the percentage of non-null values of a given argument "attributeName" considering only the rows whose id is not null. 

Input :
   
   - attribute : an attribute name 
   - table : a table name; the one containing the input attribute
   - BD : a dictionnary containing the table name as key and the variable that contains the dataframe (that represents a table             in excel file) as a value of the key.
     
Output : A dataframe containing the calculated quality value (and identifiers), organized in a line (granularity: columns)  with the following columns :
  
   - colID: identifier of the column in the input table, i.e. the value of the second argument
   - tableID: table name which represents the excel sheet that we worked with, i.e. the value of the third argument
   - appMethID: represents the function name
   - timestamp: represents the time at which we executed the function 
   - DQValue: a percentage, which is the result of applying the algorithm 

In [5]:
def checkNotNull(BD,attributeName, tableName):
    
    #retrieve the dataframe that represents the tableName given as an argument
    table = BD[tableName]
    
    #retrieve the table shape for further use
    x,y = table.shape # x is the number ol lines and y is the number of columns
    
    #calculate quality value
    totalValidTuples = table.loc[(~table[attributeName].isnull()) & (~table['ID'].isnull())].shape[0]
    DQMet= totalValidTuples*100/x
    
    #formulate the result into a dataframe
    result= pd.DataFrame({'appMethID': checkNotNull.__name__ , 'tableID':tableName, 'colID':attributeName, 'timestamp': datetime.now(), 'DQValue':DQMet},index=[0])
    
    return result #return the final result which is a dataframe

### checkMinMaxDomainRule function

This function checks whether the values of a given attribute are between a given min value and max value or not.

Input :
   
   - attribute : an attribute name 
   - table : a table name; the one containing the input attribute
   - BD : a dictionnary containing the table name as key and the variable that contains the dataframe (that represents a table             in excel file) as a value of the key.
   - MIN : a minimum value 
   - MAX : a maximum value
   
Output : A dataframe containing the calculated quality value (and identifiers), organized in a line (granularity: cell)  with the following columns :

   - rowID: identifier of the row in the input table, i.e. the value of the third argument
   - colID: identifier of the column in the input table, i.e. the value of the second argument
   - tableID: table name which represents the excel sheet that we worked with, i.e. the value of the third argument
   - appMethID: represents the function name
   - timestamp: represents the time at which we executed the function 
   - DQValue: represents a boolean value, which is the result of applying the algorithm 

In [7]:
def checkMinMaxDomainRule(BD,attributeName, tableName, MIN, MAX):
    
   '''Here below an intern function checkValueInDomain that has three inputs:
      - a value which will be the value of the given attribute
      - a minValue: minimum value 
      - a maxValue: maximum value.
   
   outputs:
      - nan in case value is null
      - 1 if the value is between minValue and maxValue
      - 0 otherwise                                          
   '''
   def checkValueInDomain(value,minValue,maxValue):
      
      if pd.isna(value):
        return np.nan  # Retourner NaN pour les valeurs manquantes
      if minValue <= value <= maxValue:
        return 1
      else:
        return 0
   #retrieve the dataframe that represents the tableName given as an argument
   table = BD[tableName]
   #create the schema of the resulting dataframe
   result= pd.DataFrame(columns=['appMethID', 'tableID','rowID', 'colID', 'timestamp', 'DQValue'])
   #calculate quality value
   result['DQValue']= table[attributeName].apply(lambda x: checkValueInDomain(x,MIN,MAX))
   #set identifiers, method and timestamp of the resulting dataframe
   result['rowID']= table[table.columns[0]]
   result['appMethID']= checkMinMaxDomainRule.__name__
   result['tableID']= tableName
   result['colID']= attributeName
   result['timestamp']= datetime.now()
   
   return result  #return the final result which is a dataframe

### checkIncreasingDatesInSeries function

This function checks whether the values (dates) of attributes that begin with a given prefix (attributeNamePrefix) are in increasing order or not.

Input :
   
   - attributeNamePrefix : a prefix for a set of attributes.
   - table : a table name; the one containing the input attribute.
   - BD : a dictionnary containing the table name as key and the variable that contains the dataframe (that represents a table             in excel file) as a value of the key.
   
   
Output : A dataframe containing the calculated quality value (and identifiers), organized in entries (granularity: row) with the following columns :

   - rowID: identifier of the row in the input table, i.e. the value of the third argument
   - colID: identifier of the column in the input table, i.e. the value of the second argument
   - tableID: table name which represents the excel sheet that we worked with, i.e. the value of the third argument
   - appMethID: represents the function name
   - timestamp: represents the time at which we executed the function 
   - DQValue: represents a boolean value, which is the result of applying the algorithm 

In [9]:
def checkIncreasingDatesInSeries(BD, attributeNamePrefix, tableName):
  #retrieve the dataframe that represents the tableName given as an argument
  table= BD[tableName]
  #filter the columns that begin with the given prefix (attributeNamePrefix) and store them in a variable 
  table_with_only_dates_columns = table.filter(regex="^"+attributeNamePrefix)
  #retrieve the list of columns name 
  columns= list(table_with_only_dates_columns.columns)
  #create the result dataframe schema
  result= pd.DataFrame({'appMethID':checkIncreasingDatesInSeries.__name__, 'tableID': tableName},index=range(table.shape[0]))
  #create an empty list that will store lists of index,timestamp and the quality value of a single row : [[index,timestamp,qualityValue],...]
  result_list=[]
  for index, row in table_with_only_dates_columns.iterrows(): #iterating every row of the retrieved dataframe "table_with_only_dates_columns"
    is_valid=-1 # quality value
    #check if the row is empty, in this case it will return nan as a quality value
    if row.count()==0:
      is_valid = np.nan
      result_list.append([table.at[index,'ID'], datetime.now(),is_valid]) # adding the rowID, the timestamp and the quality value to the list
      continue # pass to the next row regardless to the following code 
    #check if the row contains one value and in the first column, in this case it will return 1 (valid order) as a quality value
    elif row.count()==1 and row[columns[0]]!= np.nan :
      #valid order
      is_valid=1  
      # adding the rowID, the timestamp and the quality value to the list
      result_list.append([table.at[index,'ID'], datetime.now(),is_valid]) 
      continue # pass to the next row regardless to the following code 
    else : # in other cases do the following 
      for i,j in [(columns[i], columns[i+1]) for i in range(0, table_with_only_dates_columns.shape[1]-1)]: #iterating each pair of columns for each row
          #check if the date of the column i and the date of the column j are not nan 
          if str(row[i])!='nan' and str(row[j])!='nan' :
            date_i= datetime.strptime(str(row[i]), '%d/%m/%Y') #convert the string type of the date in the column i into a date format type 
            date_j= datetime.strptime(str(row[j]), '%d/%m/%Y') #convert the string type of the date in the column j into a date format type 
            if date_i>date_j:  #check if the date in the column i is posterior to the the date in the column j
              #not valid order 
              is_valid=0       
              break
                 
          elif str(row[i])=='nan' and str(row[j]) != 'nan': #check if the date in the column i is nan and the date in the column j is not
            #not valid order 
            is_valid=0
            break  
          else :  
            #valid order 
              is_valid=1
   
    result_list.append([table.at[index,'ID'],datetime.now(),is_valid]) # adding the rowID, the timestamp and the quality value to the list
  #convert the result_list to a dataframe
  df = pd.DataFrame(result_list,columns=['rowID','timestamp','DQValue'])
  #create the resulting dataframe 
  result=pd.concat([result,df],axis=1)
  return result #return the final result which is a dataframe

### checkNbMinAppointment

This function checks if each row of the given table (tableName argument) has a minimum value (given as an argument) of appointments.

Input :
   
   - MINValue : an integer representing a minimum value.
   - table : a table name; the one containing the input attribute.
   - BD : a dictionnary containing the table name as key and the variable that contains the dataframe (that represents a table             in excel file) as a value of the key.
   
   
Output : A dataframe containing the calculated quality value (and identifiers), organized in entries (granularity: row) with the following columns :

   - rowID: identifier of the row in the input table, i.e. the value of the third argument
   - colID: identifier of the column in the input table, i.e. the value of the second argument
   - tableID: table name which represents the excel sheet that we worked with, i.e. the value of the third argument
   - appMethID: represents the function name
   - timestamp: represents the time at which we executed the function 
   - DQValue: represents a boolean value, which is the result of applying the algorithm 

In [10]:
def checkIntraRelationIntegrity_2(BD, MINValue, tableName):
  #retrieve the dataframe that represents the tableName given as an argument
  table= BD[tableName]
  #create a dataframe with a single column DQValue
  nb_appoint=pd.DataFrame(columns=['DQValue'])
  #filter the columns that begin with the prefix date regardless of the case and store them in a variable "cols"
  cols = table.filter(regex='(?i)date').columns
  #retrieve from the previous dataframe all dates excepting the death dates
  cols = cols[~cols.str.contains('décès', case=False)]
  #calculate quality value
  nb_appoint['DQValue']= table[cols].count(axis=1).apply(lambda x: 1 if x >=MINValue else 0)
  #create the schema of the resulting dataframe 
  result= pd.DataFrame({'appMethID':checkIntraRelationIntegrity_2.__name__, 'tableID': tableName,'rowID': table['ID'],'timestamp': datetime.now()},index=range(table.shape[0]))
  #create the resulting dataframe 
  result= pd.concat([result,nb_appoint['DQValue']],axis=1)
  return result #return the final result 

# **Data Loading**

In [11]:
patients = pd.read_excel('C:/imen/pfe_on/CHRU/Sources/CHRU_source.xlsx',sheet_name="Patients", dtype={'DATE_DIAG': str})

In [12]:
patients #printing the dataframe

Unnamed: 0,ID,SEXE,DDN,ACT_PROF,ACT_PROF_PREC,DIAG,AUTR_DIAG,DIAG_PROBA,DATE_DIAG,PREM_SYM,...,Détail de la cause du décès : [ DET_DCD_V ] [M6 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M8 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M10 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M14 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M17 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M18 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M19 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M20 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M21 - F0],Détail de la cause du décès : [ DET_DCD_V ] [M30 - F0]
0,1,Féminin,23/07/1939,Retraité,Agriculteur exploitant,Non encore déterminé,,,,juil-18,...,,,,,,,,,,
1,2,Masculin,18/11/1941,,,Non encore déterminé,,,,janv-18,...,,,,,,,,,,
2,3,Masculin,27/10/1941,Retraité,Cadre d'entreprise,Non encore déterminé,,,,déc-16,...,,,,,,,,,,
3,4,Masculin,09/09/1959,"Contremaître, agent de maîtrise",,SLA,,Forme certaine,juin-17,févr-17,...,,,,,,,,,,
4,5,Masculin,27/05/1960,,,SLA,,Forme probable,mai-17,oct-15,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,1656,Féminin,18/01/1936,,,SLA,,Forme certaine,mars-06,juil-05,...,,,,,,,,,,
1656,1657,Féminin,23/01/1949,Ouvrier non qualifié,,SLA,,Forme certaine,déc-09,juil-09,...,,,,,,,,,,
1657,1658,Féminin,28/11/1947,,,SLA,,Forme probable,nov-21,avr-21,...,,,,,,,,,,
1658,1659,Féminin,20/07/1940,,,SLA,,Forme certaine,mai-21,mai-20,...,,,,,,,,,,


In [13]:
travaux = pd.read_excel('C:/imen/pfe_on/CHRU/Sources/CHRU_source.xlsx',sheet_name="Travaux")
travaux

Unnamed: 0,ID,Date de prélèvement,Sodium,Potassium,Chlorures,Créatinine,Urée,Acide urique,Albumine,Bilirubine totale,...,Polynucléaires basophiles1,Polynucléaires neutrophiles,Polynucléaires neutrophiles1,Polynucl<e9>aires basophiles_1,Polynucl<e9>aires basophiles1_1,Monocytes,Monocytes1,Polynucléaires éosinophiles,Polynucléaires éosinophiles1,Rapport IgG(LCR) ALB(LCR)
0,440932,16/03/2018 22:45,,,,,,,,,...,,81.3,,0.4,,4.0,,0.8,,
1,1807520561,16/03/2018 22:45,136,4.0,92,82,8.7,,,6,...,,,,,,,,,,
2,1807810044,18/03/2018 08:00,,,,,,,,,...,,,,,,,,,,
3,1807720221,18/03/2018 08:00,136,3.8,92,59,5.1,,42,7,...,,,,,,,,,,
4,387640,18/03/2018 08:00,,,,,,,,,...,,71.6,,0.6,,5.3,,3.2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33166,2201110616,11/01/2022 12:25,141,4.1,100,69,,,43.4,10,...,,,,,,,,,,
33167,2201110615,11/01/2022 12:25,,,,,,,,,...,,,,,,,,,,
33168,2209510578,05/04/2022 10:00,,,,,,,,11,...,,,,,,,,,,
33169,2209510576,05/04/2022 10:00,,,,,,,,,...,,,,,,,,,,


In [14]:
excelFile = pd.ExcelFile('C:/imen/pfe_on/CHRU/Sources/CHRU_source.xlsx')

In [15]:
excelFile.sheet_names

['Raw_Data',
 'DIST_ACT_PROF',
 'Patients',
 'Travaux',
 'Patients_Travaux',
 'Stats',
 'DIST_LIEU_DEBUT',
 'DIST_AT_PERSO',
 'Distributions']

In [16]:
BD = {'Raw_Data':None, 'DIST_ACT_PROF':None, 'Patients':patients, 'Travaux':travaux,'Patients_Travaux':None, 'Stats':None, 'DIST_LIEU_DEBUT':None, 'DIST_AT_PERSO':None, 'Distributions':None}

# **Connection to database**

In [17]:
#create an engine that will allow us to connect to the postgres database by specifying the database characteristics (user=postgres,password=enisgi1,host=localhost,port=5432,database_name=new_db)
engine = create_engine('postgresql+psycopg2://postgres:test123@localhost:5432/new_db')


# **Method Execution and result storage**

### Executing the first function and put the result in the variable result_1

In [20]:
result_1= checkValueInCollection(BD,'LIEU_DEBUT','Patients', ['Bulbaire','Spinal','Respiratoire']) #you should enter the table that we will work with as a string and with the same name of the dataframe

In [21]:
result_1 #printing the result_1

Unnamed: 0,appMethID,tableID,rowID,colID,timestamp,DQValue
0,checkValueInCollection,Patients,1,LIEU_DEBUT,2023-04-20 13:00:25.883531,1
1,checkValueInCollection,Patients,2,LIEU_DEBUT,2023-04-20 13:00:25.883531,0
2,checkValueInCollection,Patients,3,LIEU_DEBUT,2023-04-20 13:00:25.883531,0
3,checkValueInCollection,Patients,4,LIEU_DEBUT,2023-04-20 13:00:25.883531,0
4,checkValueInCollection,Patients,5,LIEU_DEBUT,2023-04-20 13:00:25.883531,0
...,...,...,...,...,...,...
1655,checkValueInCollection,Patients,1656,LIEU_DEBUT,2023-04-20 13:00:25.883531,0
1656,checkValueInCollection,Patients,1657,LIEU_DEBUT,2023-04-20 13:00:25.883531,0
1657,checkValueInCollection,Patients,1658,LIEU_DEBUT,2023-04-20 13:00:25.883531,0
1658,checkValueInCollection,Patients,1659,LIEU_DEBUT,2023-04-20 13:00:25.883531,1


### Storing the result of the method checkValue in CellsMetadata Table

In [22]:
result_1.to_sql('CellsMetadata',engine, if_exists='append', index=False) # insert the dataframe result_1 (the result of the first executed function)  in the database table called "CellsMetadata" 
#CellsMetadata will be created only once

### Executing the second function and put the result in the variable result_2

In [26]:
result_2= checkDateFormat(BD,'DATE_DIAG','Patients','%d-%m-%Y') 



In [27]:
result_2 #printing the result_2

Unnamed: 0,appMethID,tableID,rowID,colID,timestamp,DQValue
0,checkDateFormat,Patients,1,DATE_DIAG,2023-04-20 13:03:05.876086,
1,checkDateFormat,Patients,2,DATE_DIAG,2023-04-20 13:03:05.876086,
2,checkDateFormat,Patients,3,DATE_DIAG,2023-04-20 13:03:05.876086,
3,checkDateFormat,Patients,4,DATE_DIAG,2023-04-20 13:03:05.876086,0.0
4,checkDateFormat,Patients,5,DATE_DIAG,2023-04-20 13:03:05.876086,0.0
...,...,...,...,...,...,...
1655,checkDateFormat,Patients,1656,DATE_DIAG,2023-04-20 13:03:05.876086,0.0
1656,checkDateFormat,Patients,1657,DATE_DIAG,2023-04-20 13:03:05.876086,0.0
1657,checkDateFormat,Patients,1658,DATE_DIAG,2023-04-20 13:03:05.876086,0.0
1658,checkDateFormat,Patients,1659,DATE_DIAG,2023-04-20 13:03:05.876086,0.0


### Storing the result of the method checkFormatDate in CellsMetadata Table

In [28]:
result_2.to_sql('CellsMetadata',engine, if_exists='append', index=False) # insert the dataframe result_2 (the result of the second executed function) in the database table called "CellsMetadata" 

### Executing the third function and put the result in the variable result_3


In [29]:
result_3 = checkValue(BD,'DIAG', 'Patients', 'SLA')

In [30]:
result_3

Unnamed: 0,appMethID,tableID,colID,timestamp,DQValue
0,checkValue,Patients,DIAG,2023-04-20 13:07:17.503491,60.180723


### Storing the result of the method checkTotalPatient in ColumnsMetadata Table

In [31]:
result_3.to_sql('ColumnsMetadata',engine, if_exists='append', index=False) # insert the dataframe result_3 (the result of the third executed function) in the database table called "ColumnsMetadata" 

### Executing the fourth function and put the result in the variable result_4

In [32]:
result_4 = checkNotNull(BD,'Cholestérol','Travaux')

In [33]:
result_4

Unnamed: 0,appMethID,tableID,colID,timestamp,DQValue
0,checkNotNull,Travaux,Cholestérol,2023-04-20 13:07:31.818313,11.850713


### Storing the result of the method checkTotalPatient in ColumnsMetadata Table

In [34]:
result_4.to_sql('ColumnsMetadata',engine, if_exists='append', index=False) # insert the dataframe result_4 (the result of the fourth executed function) in the database table called "ColumnsMetadata" 

### Executing the fifth function and put the result in the variable result_5

In [35]:
result_5 = checkMinMaxDomainRule(BD,'Score ALS FRS-R : [ ALS_V ] [M0 - F0]','Patients',0,48)

In [36]:
result_5

Unnamed: 0,appMethID,tableID,rowID,colID,timestamp,DQValue
0,checkMinMaxDomainRule,Patients,1,Score ALS FRS-R : [ ALS_V ] [M0 - F0],2023-04-20 13:09:23.523332,1.0
1,checkMinMaxDomainRule,Patients,2,Score ALS FRS-R : [ ALS_V ] [M0 - F0],2023-04-20 13:09:23.523332,1.0
2,checkMinMaxDomainRule,Patients,3,Score ALS FRS-R : [ ALS_V ] [M0 - F0],2023-04-20 13:09:23.523332,1.0
3,checkMinMaxDomainRule,Patients,4,Score ALS FRS-R : [ ALS_V ] [M0 - F0],2023-04-20 13:09:23.523332,1.0
4,checkMinMaxDomainRule,Patients,5,Score ALS FRS-R : [ ALS_V ] [M0 - F0],2023-04-20 13:09:23.523332,
...,...,...,...,...,...,...
1655,checkMinMaxDomainRule,Patients,1656,Score ALS FRS-R : [ ALS_V ] [M0 - F0],2023-04-20 13:09:23.523332,1.0
1656,checkMinMaxDomainRule,Patients,1657,Score ALS FRS-R : [ ALS_V ] [M0 - F0],2023-04-20 13:09:23.523332,1.0
1657,checkMinMaxDomainRule,Patients,1658,Score ALS FRS-R : [ ALS_V ] [M0 - F0],2023-04-20 13:09:23.523332,1.0
1658,checkMinMaxDomainRule,Patients,1659,Score ALS FRS-R : [ ALS_V ] [M0 - F0],2023-04-20 13:09:23.523332,1.0


In [37]:
result_5.to_sql('CellsMetadata',engine, if_exists='append', index=False) # insert the dataframe result_5 (the result of the fifth executed function) in the database table called "CellsMetadata" 

### Executing the sixth function and put the result in the variable result_6

In [38]:
result_6 = checkIncreasingDatesInSeries(BD,"Date de l'examen",'Patients')

In [39]:
result_6

Unnamed: 0,appMethID,tableID,rowID,timestamp,DQValue
0,checkIncreasingDatesInSeries,Patients,1,2023-04-20 13:13:45.890653,1.0
1,checkIncreasingDatesInSeries,Patients,2,2023-04-20 13:13:45.890653,1.0
2,checkIncreasingDatesInSeries,Patients,3,2023-04-20 13:13:45.890653,0.0
3,checkIncreasingDatesInSeries,Patients,4,2023-04-20 13:13:45.891655,1.0
4,checkIncreasingDatesInSeries,Patients,5,2023-04-20 13:13:45.891655,
...,...,...,...,...,...
1655,checkIncreasingDatesInSeries,Patients,1656,2023-04-20 13:13:46.431706,0.0
1656,checkIncreasingDatesInSeries,Patients,1657,2023-04-20 13:13:46.431706,0.0
1657,checkIncreasingDatesInSeries,Patients,1658,2023-04-20 13:13:46.432658,1.0
1658,checkIncreasingDatesInSeries,Patients,1659,2023-04-20 13:13:46.432658,1.0


In [40]:
result_6.to_sql('RowsMetadata',engine, if_exists='append', index=False)

### Executing the seventh function and put the result in the variable result_7

In [42]:
result_7 = checkIntraRelationIntegrity_2(BD,5,"Patients")

In [43]:
result_7

Unnamed: 0,appMethID,tableID,rowID,timestamp,DQValue
0,checkIntraRelationIntegrity_2,Patients,1,2023-04-20 13:14:54.776445,1
1,checkIntraRelationIntegrity_2,Patients,2,2023-04-20 13:14:54.776445,0
2,checkIntraRelationIntegrity_2,Patients,3,2023-04-20 13:14:54.776445,1
3,checkIntraRelationIntegrity_2,Patients,4,2023-04-20 13:14:54.776445,1
4,checkIntraRelationIntegrity_2,Patients,5,2023-04-20 13:14:54.776445,0
...,...,...,...,...,...
1655,checkIntraRelationIntegrity_2,Patients,1656,2023-04-20 13:14:54.776445,1
1656,checkIntraRelationIntegrity_2,Patients,1657,2023-04-20 13:14:54.776445,1
1657,checkIntraRelationIntegrity_2,Patients,1658,2023-04-20 13:14:54.776445,1
1658,checkIntraRelationIntegrity_2,Patients,1659,2023-04-20 13:14:54.776445,1


In [44]:
result_7.to_sql('RowsMetadata',engine, if_exists='append', index=False)

In [45]:
%ldir

 Le volume dans le lecteur C n'a pas de nom.
 Le num‚ro de s‚rie du volume est 5C76-0D46

 R‚pertoire de C:\Users\DELL

20/04/2023  13:17    <DIR>          .
20/04/2023  13:17    <DIR>          ..
15/05/2022  20:20    <DIR>          .anaconda
29/11/2022  00:26    <DIR>          .android
13/04/2022  21:44    <DIR>          .cache
15/05/2022  20:22    <DIR>          .conda
27/11/2022  23:07    <DIR>          .config
04/05/2022  13:53    <DIR>          .continuum
30/03/2023  13:19    <DIR>          .DbSchema
15/11/2022  01:15    <DIR>          .eclipse
13/12/2022  20:11    <DIR>          .ganttproject.d
15/11/2022  01:32    <DIR>          .gradle
04/11/2022  16:06    <DIR>          .groovy
19/04/2023  16:58    <DIR>          .ipynb_checkpoints
04/05/2022  13:39    <DIR>          .ipython
18/11/2022  16:03    <DIR>          .jenkins
14/05/2022  10:19    <DIR>          .jupyter
27/11/2022  18:25    <DIR>          .lemminx
14/10/2022  16:12    <DIR>          .m2
04/05/2022  13:39    <DIR>   