In [1]:
import re
import types
import warnings
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from time import time
from functools import partial
from typing import List, Callable
from sklearn.cluster import KMeans
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import networkx 

from pyvis.network import Network
from pomegranate import BayesianNetwork

warnings.filterwarnings('ignore') 

In [2]:
def read_file(filename: str="data/corona.csv") -> pd.core.frame.DataFrame:
    """
    Read a file 
    
    :param filename: File's name
    
    :return: The dataframe
    """
    try:
        df = pd.read_csv(filename).dropna(how="all").drop_duplicates()
    except:
        print(f"Error occurred while reading the file {filename}")
    return df    

In [3]:
class ProcessingOrder:
    """
    This function imposes an ORDER of execution (some operations must be executed before others).
    It allows the execution of several processing's method entered bu a user in the form of list, 
    The input list might be in the wrong order, this function will manage the order for us 
    Order of priority : DESCENDING (order(1) must be exceuted before order(2)) 
    """
    # variable static
    execution_plan = {}
    private_methods = []
    
    def __init__(self, o):
        self.o = o
    
    def __call__(self, f):
        ProcessingOrder.execution_plan[self.o] = f
        assert f.__name__.startswith('__')
        ProcessingOrder.private_methods.append(f.__name__[2:])
        return f

order = ProcessingOrder

class ProcessingC:
    
    def __private_wrapper(self, fname, **kwargs):
        self.__m[fname] = kwargs
        return self


    def __init__(self, steps={}):
        self.__m = {} #__methods_arguments
        
        # Create public interfaces of the private methods
        for m in ProcessingOrder.private_methods:
            self.__dict__[m] = types.MethodType(partial(ProcessingC.__private_wrapper, fname=m), self)
       
        # Add arguments of stemps to the methods arguments memory (or update if already exists)
        for method, kwargs in steps.items():
            self.__getattribute__(method)(**kwargs)
            print(method.upper(), "append", kwargs)
        

    @order(1)
    def __read_csv(self, df) -> None:
        """
        We can provide directly a Dataframe, or the path of file 
        """
        self.data = read_file(filename=df) if type(df) == str else df
    
    @order(2)
    def __rename_columns(self, columns: dict = {}, uppercase: bool = True) -> None:
        """
        Column renaming
        :param columns: Concerned column
        :param uppercase: if "True" uppercase columns else put only the first letter in uppercase 
        """
        self.data.rename(columns=columns, inplace=True)
        self.data.columns = self.data.columns.str.strip().str.upper() if uppercase \
                            else self.data.columns.str.strip().str.title()
            
    @order(3)
    def __drop_useless_feature(self, columns: List[str] = [], frac_col: float = .7, frac_row: float = .45) -> None:
        """
        This function removes a list of input columns and the columns/rows, 
        that contain many missing values from the dataframe
        :param columns: List of columns to be removed
        :param frac_col: Delete columns that have a factor of frac_col * len(df) of missing values
        :param frac_row: Delete rows that have a factor of frac_row * len(df) of missing values
        
        """
        # Find the columns that contain more than frac_col * len(df)
        drop_columns = [column for column, value in self.data.isnull().sum(0).to_dict().items() 
                            if value >= frac_col * self.data.shape[0]]
        # Delete both the list of the input columns and the computed column above
        self.data.drop(columns=columns + drop_columns, inplace=True)
        # Find the rows that contain more than frac_row * len(df)
        self.data.drop(self.data[self.data.isnull().sum(1) >= frac_row * self.data.shape[1]].index, inplace=True)
        
    @order(4)
    def __to_date(self, pattern: str = r'\d{2,3}\W{1}\d{2}\W{1}\d{2,3}\s*(\d{2}:\d{2}:\d{2})?', n: int = 3) -> None:
        """
        Often the dates in a dataframe are in string format when they should be in datatime format 
        This function finds the columns that contain dates according to a pattern.
        
        :param pattern: How a date format can be found in a df
        :param n: When you find columns that contain potential dates (here, date_col), we apply a health check 
        to a random set of values to ensure that the contains looks like a date format
        
        After retrieving the relevant columns, we convert them into a cyclic feature to allow the Machine Learning 
        algorithm to better understand this feature (it occurs in cycles rather than ascenfing values)
        
        A common method for encoding cyclical data is to transform the data into 
        two dimensions using a sine and consine transformation over years, months, days, hours ...
        Here I choose to deal with months and days 
        
        For more details, see: https://www.kaggle.com/avanwyk/encoding-cyclical-features-for-deep-learning
        """              
        
        health_check = lambda column: (sum([bool(re.search(pattern, value)) for value in self.data[column].sample(n)])) == n     
        date_columns = [column for column in self.data.columns 
                        if re.search("date", column.lower()) and health_check(column)]
        
        for col in date_columns:            
            # Convert to datetime format
            self.data[col] = pd.to_datetime(self.data[col]) #, errors="NoteDate")  
            # The date is a cycling feature
            self.data[f"{col}_month_sin"] = np.sin(2 * np.pi * self.data[col].dt.month / 12)
            self.data[f"{col}_month_con"] = np.cos(2 * np.pi * self.data[col].dt.month / 12)
            self.data[f"{col}_day_sin"]   = np.sin(2 * np.pi * self.data[col].dt.day / 31)
            self.data[f"{col}_day_con"]   = np.cos(2 * np.pi * self.data[col].dt.day / 31)
            
        # Save the original data 
        self.date_columns_copy = self.data[date_columns].copy()
        # Delete the column, because it's useless afterwards
        self.data.drop(columns=date_columns, inplace=True)
        
    @order(5)
    def __missing_values(self, method: str = "median", n_clusters: int = 4):
        """"
        Replace the missing values according to a input method
        :param method: The strategy used to replace the missing values, it can be: median, mean or other 
        """
        # Save the id of nan index df[np.isnan(df['b'])] 
        self.mv = {col: self.data[np.isnan(self.data[col])].index.tolist() 
                   for col in self.data.columns[self.data.isna().any()].tolist()}
        if method == "mean":
            self.data.fillna(self.data.mean(), inplace=True)
        elif method == "median":
            self.data.fillna(self.data.median(), inplace=True)       
        else:
            try:
                df[col].fillna(method=method, inplace=True)
            except: print(f"Error Fillna: method {method}, isn't implemented")
                
        # We cannot apply median or mean on string columns
        for col in self.data.select_dtypes(include=['object']).columns: # Find string type columns
            self.data[col] = self.data[col].fillna("NaN") # Replace with a string, the NaN type is not supported by the algorithms. 
        
        # Go furtherto 
        # Kmeans ???? 
        # How should I do with sting type columns ? One Hot Encoding ??
        # Pandas provides OHE with pd.get_dummies(df,prefix=[col]), but it will create a column for each values
        # But, our models are sensitive to the number of variables        
        
    @order(6)
    def __continuous_to_categorial(self, transform: dict = {}, threshold: int = 10, 
                                 interval: int = 5, nb_point: int = 4):
        """
        Transform continuous variables to categorial variables 
        :param transform: A processing method is specified for each set of columns  
        :param threshold: The minimum number of states, that a variable can has
        :param interval : How much value are there in a bins
        :param nb_points: Minimum number of points in a cluster
        """
        # Find all continuous columns that should be converted, according to a givin threshold
        all_cc = list(filter(lambda col: self.data[col].nunique() >= threshold, self.data.columns))
        # Find string type columns 
        str_cc = list(self.data.select_dtypes(include=['object']).columns)
        # Put in the right format the list of columns to convert 
        #print("columns", self.data.columns)
        print("tran", transform)
        input_c = np.array(list(itertools.chain.from_iterable(transform.values())))[:, 0].tolist() if len(transform) else []
        print("str", str_cc)
        print("input", input_c)
        # Find the rest of the variables that have not been givin 
        to_discret = list(filter(lambda col: col not in input_c and col not in str_cc, all_cc))
        print("rest", to_discret)
        
        for method, columns in transform.items():
            if method[0] == "qcut":
                for col in columns:                    
                    # Compte the appropriate number of bins for a given column
                    bins = int(len(self.data[col]) / interval) if not method[1] else method[1]
                    # If an Erro occurs here, check the bins and interval
                    #print(f"Col: {col}, Error bins = {bins}, distinct values {self.data[col].nunique()}, decrease the number of interval {interval}")
                    self.data[col] = pd.qcut(self.data[col].values, bins)                        
            elif method[0] == "kmeans":
                for col in columns:
                    # Compte the appropriate number of cluster for a given column
                    n_clusters = int(len(self.data[col]) / nb_points) if not method[1] else method[1]
                    kmeans = KMeans(n_clusters=n_clusters).fit(self.data[col].values.reshape(-1, 1))
                    self.data[col] = kmeans.predict(self.data[col].values.reshape(-1, 1)) 
            # else EM
        # The remain column to be converted, the default method is qcut
        for col in to_discret:
            bins = int(len(self.data[col]) / interval)
            # If an Erro occurs here, check the bins and interval
            #print(f"Col: {col}, Error bins = {bins}, distinct values {self.data[col].nunique()}, decrease the number of interval {interval}")
    
            self.data[col] = pd.qcut(self.data[col].values, bins, duplicates="drop")

    def fit(self):
        # Excetute the plan in the right order (only the function specified by the builder or the steps)
        for o, f in sorted(ProcessingOrder.execution_plan.items()):
            if f.__name__[2:] in self.__m:
                self.__getattribute__(f"_{self.__class__.__name__}{f.__name__}")(**self.__m[f.__name__[2:]])
        return self

In [4]:
# Example execution
p = ProcessingC(steps={
    'read_csv': 
        {'df': "data/heart.csv"},
    'rename_columns': 
        {'columns': {"trestbps": "TREST_BPS", "restecg":"REST_ECG", "exang":"ANG"}, 'uppercase': "upper"},
    'drop_useless_feature': 
        {'columns': ['TREST_BPS', 'REST_ECG', 'SLOPE',  'THAL', 'CHOL', 'FBS']},
    'to_date': {'n': 3},
    'missing_values': {'method': "median"}, 
    'continuous_to_categorial': {'transform': {'qcut': [("THALACH", 3)]}, 'interval': 30, 'threshold': 10}
}).fit()
p.data

READ_CSV append {'df': 'data/heart.csv'}
RENAME_COLUMNS append {'columns': {'trestbps': 'TREST_BPS', 'restecg': 'REST_ECG', 'exang': 'ANG'}, 'uppercase': 'upper'}
DROP_USELESS_FEATURE append {'columns': ['TREST_BPS', 'REST_ECG', 'SLOPE', 'THAL', 'CHOL', 'FBS']}
TO_DATE append {'n': 3}
MISSING_VALUES append {'method': 'median'}
CONTINUOUS_TO_CATEGORIAL append {'transform': {'qcut': [('THALACH', 3)]}, 'interval': 30, 'threshold': 10}
tran {'qcut': [('THALACH', 3)]}
str []
input ['THALACH']
rest ['AGE', 'OLDPEAK']


Unnamed: 0,AGE,SEX,CP,THALACH,ANG,OLDPEAK,CA,TARGET
0,"(62.0, 66.0]",1,3,150,0,"(1.9, 2.8]",0,1
1,"(28.999, 42.0]",1,2,187,0,"(2.8, 6.2]",0,1
2,"(28.999, 42.0]",0,1,172,0,"(1.16, 1.4]",0,1
3,"(55.5, 58.0]",1,1,178,0,"(0.4, 0.8]",0,1
4,"(55.5, 58.0]",0,0,163,1,"(0.4, 0.8]",0,1
...,...,...,...,...,...,...,...,...
298,"(55.5, 58.0]",0,0,123,1,"(-0.001, 0.4]",0,0
299,"(42.0, 45.0]",1,3,132,0,"(1.16, 1.4]",0,0
300,"(66.0, 77.0]",1,0,141,0,"(2.8, 6.2]",2,0
301,"(55.5, 58.0]",1,0,115,1,"(1.16, 1.4]",1,0


In [5]:
# Example execution
p = ProcessingC(steps={
    'read_csv': 
        {'df': "data/corona.csv"},
    'rename_columns': 
        {'columns': {}, 'uppercase': "upper"},
    'drop_useless_feature': 
        {'columns': ['STATE']},
    'to_date': {'n': 3},
    'missing_values': {'method': "median"}, 
    'continuous_to_categorial': {'interval': 100, 'threshold': 10}
}).fit()
p.data

READ_CSV append {'df': 'data/corona.csv'}
RENAME_COLUMNS append {'columns': {}, 'uppercase': 'upper'}
DROP_USELESS_FEATURE append {'columns': ['STATE']}
TO_DATE append {'n': 3}
MISSING_VALUES append {'method': 'median'}
CONTINUOUS_TO_CATEGORIAL append {'interval': 100, 'threshold': 10}
tran {}
str ['STATE ABBREVIATION']
input []
rest ['POPULATION', 'TOTAL GDP LAST Q (X 1 MIL.)', 'GDP/CAPITA', '# RESIDENTS/SQUARE MILE', '# OF CONFIRMED CASES', '# OF CONFIRMED CASES PER 100K', 'DAYS SINCE BAR/RESTAURANT LIMITS', 'DAYS SINCE COST-SHARING WAIVER (TREATMENT)', 'DAYS SINCE EARLY RX REFILL', 'DAYS SINCE EMERGENCY DECLARATION', 'DAYS SINCE EXECUTIVE ORDER MANDATING COMPLIANCE WITH STATE GUIDANCE', 'DAYS SINCE FREE VACCINE', 'DAYS SINCE LARGE GATHERINGS BAN', 'DAYS SINCE MANDATORY QUARANTINE', 'DAYS SINCE MARKETPLACE SEP', 'DAYS SINCE NON-ESSENTIAL BUSINESS CLOSURES', 'DAYS SINCE PAID SICK LEAVE', 'DAYS SINCE PEACETIME EMERGENCY', 'DAYS SINCE PRIMARY ELECTION POSTPONEMENT', 'DAYS SINCE PUBLIC H

Unnamed: 0,STATE ABBREVIATION,POPULATION,TOTAL GDP LAST Q (X 1 MIL.),GDP/CAPITA,IS BORDER STATE,# RESIDENTS/SQUARE MILE,# OF CONFIRMED CASES,# OF CONFIRMED CASES PER 100K,DAYS SINCE BAR/RESTAURANT LIMITS,DAYS SINCE CIVIL PREPAREDNESS EMERGENCY,...,PUBLIC HEALTH EMERGENCY,SCHOOL CLOSURES,SECTION 1135 WAIVER,WAIVE PRIOR AUTH,NEW CASES PER 100K 15 DAYS LATER,NUMBER OF ACTIVE NPIS,DATE_month_sin,DATE_month_con,DATE_day_sin,DATE_day_con
0,AL,"(4467673.0, 5639632.0]","(195858.0, 247711.0]","(40151.999, 47346.0]",0,"(74.6, 111.0]","(-1.001, 0.0]","(-0.0335, 0.0]","(-0.001, 3.0]",0,...,0,0,0,0,"(-0.0204, 0.133]","(-0.001, 1.0]",0.866025,5.000000e-01,"(-0.651, -0.394]","(0.689, 0.919]"
1,AL,"(4467673.0, 5639632.0]","(195858.0, 247711.0]","(40151.999, 47346.0]",0,"(74.6, 111.0]","(-1.001, 0.0]","(-0.0335, 0.0]","(-0.001, 3.0]",0,...,0,0,0,0,"(0.133, 0.354]","(-0.001, 1.0]",1.000000,6.123234e-17,"(-0.101, 0.25]","(0.919, 0.98]"
2,AL,"(4467673.0, 5639632.0]","(195858.0, 247711.0]","(40151.999, 47346.0]",0,"(74.6, 111.0]","(-1.001, 0.0]","(-0.0335, 0.0]","(-0.001, 3.0]",0,...,0,0,0,0,"(0.133, 0.354]","(-0.001, 1.0]",1.000000,6.123234e-17,"(0.25, 0.485]","(0.689, 0.919]"
3,AL,"(4467673.0, 5639632.0]","(195858.0, 247711.0]","(40151.999, 47346.0]",0,"(74.6, 111.0]","(-1.001, 0.0]","(-0.0335, 0.0]","(-0.001, 3.0]",0,...,0,0,0,0,"(0.133, 0.354]","(-0.001, 1.0]",1.000000,6.123234e-17,"(0.485, 0.651]","(0.689, 0.919]"
4,AL,"(4467673.0, 5639632.0]","(195858.0, 247711.0]","(40151.999, 47346.0]",0,"(74.6, 111.0]","(-1.001, 0.0]","(-0.0335, 0.0]","(-0.001, 3.0]",0,...,0,0,0,0,"(0.354, 0.491]","(-0.001, 1.0]",1.000000,6.123234e-17,"(0.651, 0.791]","(0.347, 0.689]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1269,WV,"(1344212.0, 1792147.0]","(57106.0, 78270.0]","(40151.999, 47346.0]",0,"(63.4, 74.6]","(2.0, 4.0]","(0.195, 0.381]","(-0.001, 3.0]",0,...,1,1,0,0,"(0.354, 0.491]","(3.0, 4.0]",1.000000,6.123234e-17,"(-0.968, -0.898]","(-0.44, -0.251]"
1270,WV,"(1344212.0, 1792147.0]","(57106.0, 78270.0]","(40151.999, 47346.0]",0,"(63.4, 74.6]","(0.0, 1.0]","(0.0461, 0.102]","(-0.001, 3.0]",0,...,1,1,0,0,"(0.354, 0.491]","(3.0, 4.0]",1.000000,6.123234e-17,"(-1.0, -0.968]","(-0.44, -0.251]"
1271,WV,"(1344212.0, 1792147.0]","(57106.0, 78270.0]","(40151.999, 47346.0]",0,"(63.4, 74.6]","(2.0, 4.0]","(0.195, 0.381]","(-0.001, 3.0]",0,...,1,1,0,0,"(0.354, 0.491]","(5.0, 7.0]",1.000000,6.123234e-17,"(-1.0, -0.968]","(-0.251, -0.0506]"
1272,WV,"(1344212.0, 1792147.0]","(57106.0, 78270.0]","(40151.999, 47346.0]",0,"(63.4, 74.6]","(2.0, 4.0]","(0.195, 0.381]","(-0.001, 3.0]",0,...,1,1,0,0,"(0.354, 0.491]","(5.0, 7.0]",1.000000,6.123234e-17,"(-1.0, -0.968]","(-0.0506, 0.151]"
