
# Libraries and Tools used

In [1]:
"""Our graphical elements are going to be done with matplotlib, inline graphs"""
%matplotlib inline
import matplotlib.pyplot as plt

"""CSV: utility to work with the most common format for datastorage : Comma Separated Values"""
import csv

"""JSON: utility to work with JavaScript Object Notation, the format in which our knowledge is originally stored"""
import json
from collections import namedtuple

"""Pandas: python's most used library to work with datasets"""
import pandas as pd 

""" Type detection for our dataframe """
from pandas.api.types import (
    is_categorical_dtype,
    is_datetime64_dtype,
    is_object_dtype,
    is_numeric_dtype,
    is_string_dtype
)

"""Numpy: python's most used library to work with large amounts of numbers"""
import numpy as np
"""OS: utility to work with paths and file openings independently of operating system"""
import os
"""Glob: used to look for file extensions inside given folders"""
import glob 

"""Candidates for ML predictive model implementations"""
import tensorflow as tf
import sklearn
import catboost

  from ._conv import register_converters as _register_converters


# The Storage module

### Methods 

#### __init__
#### print_info
#### from_json
#### to_json
#### update_stats
#### load_from_dataframe


In [216]:
class StoreDomain():
    """Keeps the knowledge gathered about a domain and persists it as JSON.

    A domain is identified by its ``attribute_list``; the stored file that
    carries the same list is considered to describe the same domain.

    Attributes set by ``load``:
        attribute_list: the value identifying the domain.
        domain_name: label of the domain, also used to build the file name.
        knowledge: dict with the ``dataset_stats`` and ``column_stats`` keys.
        fnm: path (a string) of the JSON file backing this object.
    """

    def __init__(self, attribute_list, path=None, domain_name=None):
        """Load the matching domain from ``path`` or start an empty one.

        ``path`` defaults to the working directory at call time (not at
        class-definition time, which is what a ``os.getcwd()`` default does).
        """
        if path is None:
            path = os.getcwd()
        self.load(attribute_list, path, domain_name)

    def load(self, attribute_list, path, domain_name):
        """Search ``path`` for a stored domain with the same attribute list.

        If one is found it is loaded into this object; otherwise an empty
        template is set up and ``fnm`` points at the file ``save`` will create.
        """
        # Start from the empty template so every attribute exists even when
        # the stored file only carries part of the fields.
        self.attribute_list = attribute_list
        self.domain_name = domain_name or str(attribute_list)
        self.knowledge = {
            "dataset_stats": {},
            "column_stats": {}
        }

        # sorted() makes the choice deterministic when several files match.
        for fnm in sorted(glob.glob(os.path.join(path, "*.json"))):
            with open(fnm) as f:
                data = json.load(f)
            if (isinstance(data, dict)
                    and "attribute_list" in data
                    and data["attribute_list"] == attribute_list):
                self.modify(data)
                # The file we actually read wins over any path stored inside it.
                self.fnm = fnm
                break
        else:
            print("No matching knowledge found for your domain, setting an empty one")
            # A plain path string: glob.glob() would return a list, which
            # open() cannot use when saving.
            self.fnm = os.path.join(path, str(self.domain_name) + ".json")

    def save(self):
        """Write the object's attributes to ``self.fnm`` in JSON format."""
        folder = os.path.dirname(self.fnm)
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(self.fnm, 'w') as f:
            json.dump(self.__dict__, f)

    def modify(self, data):
        """Update the object from a dict representation of it.

        Only the keys present in ``data`` are applied; the other attributes
        keep their current value.
        """
        for key in ("domain_name", "attribute_list", "knowledge", "fnm"):
            if key in data:
                setattr(self, key, data[key])

    def print_info(self):
        """Print every attribute of the object."""
        print(self.__dict__)


# The JSON
&nbsp;

 This object is used to represent the raw information stored.

 It contains statistics and properties of both columns and datasets, which differ between column types, as well as how they were measured (by saving the function to be called on a pandas dataset in the case of a dataset stat, or on the column values themselves in the case of a column stat).

 Saving how they're measured is important to measure new datasets and to be able to compare metrics effectively.

 Both the column-specific stats and the dataset stats are subject to change, and the knowledge domain object can be modified and adapted to fit new parameters easily.

 This structure is generated for a single dataset and then combined with the domain one to take account of the new one.



In [None]:
# Layout of the knowledge dictionary: one section for stats about the dataset
# as a whole and one section with an entry per column.
# NOTE(review): the concrete numbers and the "a"/"b" columns look like
# illustrative sample values rather than real measurements - confirm.
KNOWLEDGE_STRUCTURE = {
    # Stats describing the dataset as a whole.
    "dataset_stats": {
        "individuals": 0,
        # "metric" holds the name of the function used to measure the value.
        "max_rows": {
            "value": 1400,
            "metric" : "count_rows_null",
        }, # alert if new rows are <<<, as different results are skewed
        "avg_rows": {
            "value": 1400,
            "metric" : "count_rows_null"
        },
        "full_rows": {
            "value": 0.97
        }
    },
    # Stats per column, keyed by column name; their shape depends on "type".
    "column_stats": {
        "a": {
            "type": "numeric",
            "stats": {
                "median": 12.6,
                "std_dev": 1.3,
                "max": 25.5,
                "min": 2.3,
                "NaNs": 0.02 # as % of dataset
            }     
        },
        "b": {
            "type": "categorical",
            "stats": {
                "most_frequent": "cat",
                # Category -> frequency.
                "values": {
                    "cat": 0.2,
                    "dog": 0.6
                },
                "nan": 0.1
            }
            
        }

    }
}


# The analysis module
## Metrics

In [3]:
""" The purpose of these functions is to provide a way to measure the properties of a given dataset or knowledge domain.
    We can categorize them as follows:
    First the "measurement" metrics, used to get the information of a single dataset or domain.
    
    - Dataset Metrics : they concern the dataset as a whole, like number of rows with missing values.
        dm :: (ds) --> num
        
    - Single Column Metrics : they concern a certain column, and are based on the type of the column.
      For numerical columns we will have things like median, averages, deviations, distributions...
      For categorical columns we'll work with frequencies and things of the sort.
        scm :: (col) --> num
        
    - Multiple Column Metrics : we will be looking for correlations and things of that sort.
       mcm :: (col,col) --> num
       Time based metrics will be defined from this construct.
    
    We will also have "comparison" metrics, used to compare datasets against their domains.
    These metrics will compare the output of two measurement metrics, both will have to
    spawn from the same function.
        comp_m :: (metric) --> num
    
    Note that these metrics are not to provide "meaning" or any human-readable input, nor to be
    inherently comparable between each other outside of a framework of understanding of the domain
    (metric importance).
    
    A means to convert these cold machine metrics into human understanding will be provided in later
    modules. For now, we're not taking humans into account.
"""

""" Dataset Metrics """

def count_rows_null(df):
    """Return how many rows of the DataFrame ``df`` hold at least one null.

    The previous ``df.isnull().shape(0)`` called a tuple and always raised a
    TypeError.
    """
    # any(axis=1) flags a row as soon as one of its cells is null.
    return int(df.isnull().any(axis=1).sum())

""" Columns """

""" For consistency, every column is accepted as a pandas Series """
""" Numeric, accepted as pandas Series """

def median(col):
    """Return the median of the values held by the pandas Series ``col``."""
    values = col.values
    return np.median(values)

def average(col):
    """Return the arithmetic mean of the values held by the pandas Series ``col``."""
    values = col.values
    return np.average(values)



""" All types """

def count_not_null(col):
    """Return the number of non-null values in the pandas Series ``col``.

    Works for any column type: the null test is done by pandas, so it also
    covers object/string columns where ``np.isnan`` raises. The previous
    body referenced an undefined name (``data``) instead of ``col``.
    """
    return int(col.notnull().sum())

def count_null(col):
    """Return the number of null values in ``col``: its size minus the non-null count."""
    total = col.size
    present = count_not_null(col)
    return total - present



""" Categorical """

def count_freqs(col):
    """Return the relative frequency of every value of the Series ``col``.

    The result maps each distinct non-null value to its share of the whole
    column (nulls included in the total, so the shares may add up to less
    than 1). An empty column gives an empty dict.

    This replaces a stub that always returned the constant 1.2.
    """
    total = col.size
    if total == 0:
        return {}
    counts = col.value_counts(dropna=True)
    return {value: float(count) / total for value, count in counts.items()}
def count_nans(col):
    """Return the fraction (0.0 - 1.0) of null values in the Series ``col``.

    An empty column gives 0.0. This replaces a stub that always returned
    the constant 0.9.
    """
    total = col.size
    if total == 0:
        return 0.0
    return float(col.isnull().sum()) / total



## Utility functions

In [217]:
def median_raw(a):
    """Return the median of the raw values ``a`` (any array-like numpy accepts)."""
    middle = np.median(a)
    return middle
def most_frequent(a):
    """Return the value that appears most often in the array-like ``a``.

    ``np.bincount`` only accepts non-negative integers, so the previous
    version failed on negative numbers, floats and strings. ``np.unique``
    handles any sortable values and keeps the same tie-breaking rule: among
    equally frequent values the smallest one is returned.

    Raises:
        ValueError: if ``a`` is empty.
    """
    values, counts = np.unique(np.asarray(a), return_counts=True)
    if values.size == 0:
        raise ValueError("most_frequent() needs at least one value")
    # np.unique returns sorted values, argmax returns the first maximum.
    return values[counts.argmax()]
# Type label -> pandas predicate that tells whether a dtype belongs to it.
# DatasetAnalyzer.get_column_types only uses the "number" entry.
IS_DTYPE = {
    "number": is_numeric_dtype,
    "category": is_categorical_dtype,
    "datetime": is_datetime64_dtype,
}

# Metric applied to a column when the domain knowledge does not name one.
# The "numerical"/"categorical" keys are the labels produced by
# DatasetAnalyzer.get_column_types; their values are called on the raw
# column values.
DEFAULT_METRICS = { #(type,comparison_metric)
    "numerical" : median_raw,
    "categorical": most_frequent,
    # NOTE(review): the "metric" values below are plain integers, not
    # function names - they look like placeholders; confirm before use.
    "dataset" : {
        "count_rows":{
            "metric": 2
        },
        "rows_with_nulls":{
            "metric": 3
        }
    }
}

# Still empty: meant to map a metric name to the function comparing two of
# its results (see the inline note).
COMPARISON_METRICS = { #(metric(name of a function) , comparison_metric)
    
}

In [218]:
class DatasetAnalyzer():
    """Measures a dataset and compares it against the knowledge of its domain.

    Attributes:
        dataset: the pandas DataFrame being analysed.
        std: the StoreDomain holding the stored knowledge of the domain.
        domain_knowledge: the ``knowledge`` dict of ``std``.
        dataset_knowledge: stats measured on ``dataset`` (see ``get_stats``).
    """

    def __init__(self, df, path):
        """Load the domain of ``df`` from ``path`` and measure ``df``."""
        self.dataset = df
        # ``name`` is a custom attribute; a column called "name" would make
        # ``df.name`` a Series, so only a real string is used as the label.
        name = getattr(df, "name", None)
        if not isinstance(name, str):
            name = None
        # The column list identifies the domain and the name only labels it
        # (the two arguments used to be passed the other way round).
        self.std = StoreDomain(list(df.columns), path=path, domain_name=name)
        self.domain_knowledge = self.std.knowledge
        self.dataset_knowledge = self.get_stats(df)

    @staticmethod
    def _resolve_metric(name):
        """Return the module-level function called ``name``."""
        # NOTE(review): metric names are read from the stored JSON, so any
        # module-level callable can be invoked by editing that file.
        try:
            func = globals()[name]
        except KeyError:
            raise KeyError("unknown metric %r" % (name,))
        if not callable(func):
            raise TypeError("metric %r is not callable" % (name,))
        return func

    @staticmethod
    def _as_builtin(value):
        """Turn numpy scalars into Python scalars so stats can go to JSON."""
        return value.item() if isinstance(value, np.generic) else value

    @staticmethod
    def _is_number(value):
        """Tell whether ``value`` can take part in a running average."""
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def get_stats(self, df):
        """Return the column and dataset stats measured on ``df``."""
        return {
            "column_stats": self.get_column_stats(df),
            "dataset_stats": self.get_dataset_stats(df)
        }

    def get_column_stats(self, df):
        """Measure every column of ``df``.

        A column uses the metric named in the domain knowledge when there is
        one, otherwise the default metric of its type. Each result is
        ``{"metric": <function name>, "value": <measured value>}``.
        """
        stats = {}
        known = self.domain_knowledge.get("column_stats", {})

        for c, col_type in self.get_column_types(df).items():
            entry = known.get(c)
            if isinstance(entry, dict) and "metric" in entry:
                func = self._resolve_metric(entry["metric"])
            else:
                func = DEFAULT_METRICS[col_type]

            stats[c] = {
                "metric": func.__name__,
                "value": self._as_builtin(func(df[c].values))
            }

        return stats

    def get_dataset_stats(self, df):
        """Measure ``df`` with every dataset metric named in the domain.

        Entries of the domain without a ``metric`` field are skipped. The
        metric is stored by name (not as a function object) so the result
        stays JSON-serializable and comparable with the stored knowledge.
        """
        stats = {}
        for m, entry in self.domain_knowledge.get("dataset_stats", {}).items():
            if not (isinstance(entry, dict) and "metric" in entry):
                continue
            func = self._resolve_metric(entry["metric"])
            stats[m] = {
                "metric": entry["metric"],
                "value": self._as_builtin(func(df))
            }

        return stats

    def get_column_types(self, df):
        """Map every column of ``df`` to ``"numerical"`` or ``"categorical"``."""
        return {
            c: "numerical" if IS_DTYPE['number'](df[c].dtype) else "categorical"
            for c in df.columns
        }

    def fill_d1_with_d2(self, d1, d2):
        """Merge ``d2`` into ``d1`` in place.

        Nested dicts are merged recursively; for any other key the value of
        ``d2`` is written into ``d1``.
        """
        for key in d2:
            if (key in d1 and isinstance(d1[key], dict) and isinstance(d2[key], dict)):
                # Used to call a ``deep_merge_dicts`` method that does not exist.
                self.fill_d1_with_d2(d1[key], d2[key])
            else:
                d1[key] = d2[key]

    def get_analysis(self):
        """Return the comparison between the dataset and the domain knowledge.

        The result has a ``column_stats`` and a ``dataset_stats`` section,
        each produced by ``compare_stats``.
        """
        return {
            "column_stats": self.compare_stats(
                self.domain_knowledge["column_stats"],
                self.dataset_knowledge["column_stats"]),
            "dataset_stats": self.compare_stats(
                self.domain_knowledge["dataset_stats"],
                self.dataset_knowledge["dataset_stats"]),
        }

    def compare_stats(self, stats1, stats2):
        """Compare the domain stats ``stats1`` with the dataset stats ``stats2``.

        With no domain stats there is nothing to compare against and
        ``stats2`` is returned untouched. Otherwise every stat present on
        both sides and measured with the same metric gives
        ``{"metric": name, "value": dataset value - domain value}``, the same
        entry shape as the stats themselves.
        """
        if not stats1:
            return stats2

        comparison = {}
        for c, known in stats1.items():
            measured = stats2.get(c)
            if not (isinstance(known, dict) and isinstance(measured, dict)):
                continue
            if "metric" in known and known["metric"] == measured.get("metric"):
                #TODO: use a real comparison metric
                comparison[c] = {
                    "metric": known["metric"],
                    "value": measured["value"] - known["value"]
                }
        return comparison

    def update_stats(self):
        """Fold the stats of this dataset into the domain knowledge.

        Numeric values become the running average over the ``n`` datasets
        seen so far plus this one; stats that are new to the domain, or not
        numeric, are copied as measured. ``n`` is stored in the knowledge.
        """
        #TODO : customize this so weights can be other things aswell
        n = self.domain_knowledge.get("n", 0)
        for section, measured in self.dataset_knowledge.items():
            known_section = self.domain_knowledge.setdefault(section, {})
            for key, entry in measured.items():
                known = known_section.get(key)
                if (isinstance(known, dict)
                        and self._is_number(known.get("value"))
                        and self._is_number(entry.get("value"))):
                    known["value"] = (n * known["value"] + entry["value"]) / (n + 1)
                else:
                    known_section[key] = dict(entry)

        #TODO : customize this so you're able to take into account the number of rows
        self.domain_knowledge["n"] = n + 1

        #Save the information
        self.std.knowledge = self.domain_knowledge


# CBR Database

In [219]:
class StoreCBR(StoreDomain):
    """StoreDomain for human knowledge, organised by profile.

    Extends StoreDomain with profile functionality and CBR-based behaviour.
    Each StoreCBR is tied to one profile and can only modify that one.

    Attributes set by ``load``:
        profile: the profile this object is tied to.
        profiles: dict mapping a profile name to its stored information.
        domain_name, attribute_list, fnm: same meaning as in StoreDomain.
    """

    def __init__(self, attribute_list, profile, path, domain_name=None):
        """Load the human knowledge of the domain for ``profile``."""
        self.load(attribute_list, profile, path, domain_name)

    def load(self, attribute_list, profile, path, domain_name):
        """Search ``path`` for the stored domain with this attribute list.

        When a matching file is found its profiles are loaded, otherwise
        the defaults of ``set_defaults`` are used.
        """
        for fnm in sorted(glob.glob(os.path.join(path, "*.json"))):
            with open(fnm) as f:
                data = json.load(f)
            if (isinstance(data, dict)
                    and "attribute_list" in data
                    and data["attribute_list"] == attribute_list):
                self.profile = profile
                self.domain_name = data.get("domain_name", str(attribute_list))
                self.attribute_list = data["attribute_list"]
                profiles = data.get("profiles")
                if profiles is None:
                    # Files saved from the defaults keep the profiles under
                    # the "knowledge" key.
                    profiles = data.get("knowledge", {}).get("profiles", {})
                self.profiles = profiles
                self.fnm = fnm
                break
        else:
            self.set_defaults(domain_name, attribute_list, path, profile)

    def set_defaults(self, domain_name, attribute_list, path, profile):
        """Set up an empty knowledge base holding only ``profile``."""
        # str() so the name can be turned into a file name even when it
        # falls back to the attribute list.
        self.domain_name = domain_name or str(attribute_list)
        self.attribute_list = attribute_list
        self.profile = profile

        # A plain path string: glob.glob() would return a list, which
        # save() cannot open.
        self.fnm = os.path.join(path, str(self.domain_name) + ".json")
        self.knowledge = {
            "profiles": {}
        }
        self.knowledge["profiles"][profile] = {
            "profile_knowledge" : {}
        }
        # modify() and the users of this class read ``profiles``; it shares
        # the dict stored under ``knowledge`` so both views stay in sync.
        self.profiles = self.knowledge["profiles"]

    def modify(self, new_info):
        """Replace the information stored for the current profile."""
        self.profiles[self.profile] = new_info

    def run_tournament(self):
        """ELO (not implemented yet)."""
        pass

    def interaction_learning(self):
        """To be done later (not implemented yet)."""
        pass

# Reporter

In [220]:
""" 
    Another kind of information is stored about the domains. This is the 
    information concerning the **human** side of things, that is, **how** 
    to interpret these stats and turn them into something that humans with different levels of 
    familiarity can understand.
    To do this, we use another storage class that will contain human-relevant data that
    will modify the objective comparison delivered by the analysis module.
"""

class Reporter():
    """Builds the report of a dataset for a given profile.

    The objective analysis of the dataset (DatasetAnalyzer) is weighted with
    the human knowledge stored for the profile (StoreCBR).

    Attributes:
        profile: name of the profile the report is built for.
        dsa: the DatasetAnalyzer of the dataset.
        scbr: the StoreCBR holding the human knowledge.
        analysis: result of ``dsa.get_analysis()``.
        human_knowledge: knowledge dict of the current profile.
        report: dict with the ``dataset``, ``columns`` and ``colors`` sections.
    """

    def __init__(self, df, profile, domain_path, human_knowledge_path):
        """Analyse ``df``, load the human knowledge and build the report."""
        self.profile = profile
        self.dsa = DatasetAnalyzer(df, path=domain_path)
        # ``name`` is a custom attribute that the DataFrame may not carry.
        name = getattr(df, "name", None)
        if not isinstance(name, str):
            name = None
        self.scbr = StoreCBR(list(df.columns), profile, path=human_knowledge_path, domain_name=name)
        self.generate()

    def _profile_knowledge(self):
        """Return the knowledge dict of the current profile ({} if none)."""
        profiles = getattr(self.scbr, "profiles", None)
        if profiles is None:
            profiles = getattr(self.scbr, "knowledge", {}).get("profiles", {})
        entry = profiles.get(self.scbr.profile, {})
        # Freshly created stores use the "profile_knowledge" key.
        return entry.get("knowledge", entry.get("profile_knowledge", {}))

    def _weighted(self, section):
        """Multiply every stat of ``section`` by its human weight.

        A stat without a stored weight keeps its value (weight 1).
        """
        weights = self.human_knowledge.get(section, {})
        weighted = {}
        for k, entry in self.analysis.get(section, {}).items():
            # The analysis holds either {"value": x} entries or bare numbers.
            value = entry["value"] if isinstance(entry, dict) else entry
            weight = weights.get(k, {}).get("value", 1)
            weighted[k] = {"value": value * weight}
        return weighted

    def generate(self):
        """Build ``self.report`` from the analysis and the human knowledge."""
        self.analysis = self.dsa.get_analysis()
        self.human_knowledge = self._profile_knowledge()
        self.report = {
            "dataset": self._weighted("dataset_stats"),
            "columns": self._weighted("column_stats"),
            "colors": {}
        }

        # Every column with colour information gets the same placeholder.
        for k in self.human_knowledge.get("colors", {}):
            self.report["colors"][k] = {
                "value" : "blue"
            }

    def modify(self, new_info):
        """Replace the human knowledge of the current profile."""
        # The store lives in ``self.scbr``; ``self.human_info`` never existed.
        self.scbr.modify(new_info)

    def save_human_info(self, new_info):
        """Replace the human knowledge of the current profile and persist it."""
        self.scbr.modify(new_info)
        self.scbr.save()

# Cardio Test

In [224]:
%%time
# Columns holding codes rather than measurements are analysed as categories.
cat_cols = ["cholesterol", "gluc", "smoke",  "alco", "active", "cardio", "gender"]

df = pd.read_csv("data/cardio_train.csv", sep=";").astype(dict.fromkeys(cat_cols, "category"))
# Set last, so the name sits on the frame that is handed to the Reporter.
df.name = "medical_data"
df.head()
repo = Reporter(df, profile="patient", domain_path="./domain_storage/", human_knowledge_path="./human_storage/")


No matching knowledge found for your domain, setting an empty one
CPU times: user 135 ms, sys: 35.3 ms, total: 170 ms
Wall time: 176 ms


In [225]:
repo.report

{'colors': {'active': {'value': 'blue'},
  'age': {'value': 'blue'},
  'alco': {'value': 'blue'},
  'ap_hi': {'value': 'blue'},
  'ap_lo': {'value': 'blue'},
  'cardio': {'value': 'blue'},
  'cholesterol': {'value': 'blue'},
  'gender': {'value': 'blue'},
  'gluc': {'value': 'blue'},
  'height': {'value': 'blue'},
  'id': {'value': 'blue'},
  'smoke': {'value': 'blue'},
  'weight': {'value': 'blue'}},
 'columns': {'active': {'value': 0.3},
  'age': {'value': 6758.129000000001},
  'alco': {'value': 0.0},
  'ap_hi': {'value': 27.6},
  'ap_lo': {'value': 18.400000000000002},
  'cardio': {'value': 0.0},
  'cholesterol': {'value': 0.3},
  'gender': {'value': 0.245},
  'gluc': {'value': 0.3},
  'height': {'value': 23.924999999999997},
  'id': {'value': 23500.704999999998},
  'smoke': {'value': 0.0},
  'weight': {'value': 10.44}},
 'dataset': {}}

# Diabetic Test

In [221]:
%%time
# Columns holding codes rather than measurements are analysed as categories.
cat_cols = ["week","weekday"]

df = (
    pd.read_csv("data/diabetic.csv")
    .fillna(0)                         # missing readings count as 0
    .drop(columns=["Notes", "ID"])     # not part of the analysis
    .astype(dict.fromkeys(cat_cols, "category"))
)
# Set last, so the name sits on the frame that is handed to the Reporter.
df.name = "diabetic"
df.head()
repo = Reporter(df, profile="patient", domain_path="./domain_storage/", human_knowledge_path="./human_storage/")


No matching knowledge found for your domain, setting an empty one
CPU times: user 14 ms, sys: 3.95 ms, total: 17.9 ms
Wall time: 20.3 ms


In [222]:
repo.report

{'colors': {'M0700': {'value': 'blue'},
  'M0930': {'value': 'blue'},
  'M1300': {'value': 'blue'},
  'M1500': {'value': 'blue'},
  'M1800': {'value': 'blue'},
  'M2000': {'value': 'blue'},
  'M2300': {'value': 'blue'},
  'i0700': {'value': 'blue'},
  'i0930': {'value': 'blue'},
  'i1300': {'value': 'blue'},
  'i1500': {'value': 'blue'},
  'i1800': {'value': 'blue'},
  'i2300': {'value': 'blue'},
  'week': {'value': 'blue'},
  'weekday': {'value': 'blue'}},
 'columns': {'M0700': {'value': 24.0},
  'M0930': {'value': 15.899999999999999},
  'M1300': {'value': 19.5},
  'M1500': {'value': 18.9},
  'M1800': {'value': 12.299999999999999},
  'M2000': {'value': 18.6},
  'M2300': {'value': 0.0},
  'i0700': {'value': 12.0},
  'i0930': {'value': 0.0},
  'i1300': {'value': 14.399999999999999},
  'i1500': {'value': 0.0},
  'i1800': {'value': 18.0},
  'i2300': {'value': 0.0},
  'week': {'value': 3},
  'weekday': {'value': 3}},
 'dataset': {}}

# Heart test

In [205]:
%%time
# Columns holding codes rather than measurements are analysed as categories.
cat_cols = ["sex","cp", "fbs", "restecg", "exang", "slope", "ca", "thal", "target"]

df = pd.read_csv("data/heart.csv").astype(dict.fromkeys(cat_cols, "category"))
# Set last, so the name sits on the frame that is handed to the Reporter.
df.name = "heart"
df.head()
repo = Reporter(df, profile="patient", domain_path="./domain_storage/", human_knowledge_path="./human_storage/")


No matching knowledge found for your domain, setting an empty one
CPU times: user 17.8 ms, sys: 3.12 ms, total: 20.9 ms
Wall time: 19.9 ms


In [206]:
repo.report

{'colors': {'M0700': {'value': 'blue'},
  'M0930': {'value': 'blue'},
  'M1300': {'value': 'blue'},
  'M1500': {'value': 'blue'},
  'M1800': {'value': 'blue'},
  'M2000': {'value': 'blue'},
  'M2300': {'value': 'blue'},
  'i0700': {'value': 'blue'},
  'i0930': {'value': 'blue'},
  'i1300': {'value': 'blue'},
  'i1500': {'value': 'blue'},
  'i1800': {'value': 'blue'},
  'i2300': {'value': 'blue'},
  'week': {'value': 'blue'},
  'weekday': {'value': 'blue'}},
 'columns': {'age': {'value': 165.0},
  'ca': {'value': 0},
  'chol': {'value': 720.0},
  'cp': {'value': 0},
  'exang': {'value': 0},
  'fbs': {'value': 0},
  'oldpeak': {'value': 2.4000000000000004},
  'restecg': {'value': 3},
  'sex': {'value': 3},
  'slope': {'value': 6},
  'target': {'value': 3},
  'thal': {'value': 6},
  'thalach': {'value': 459.0},
  'trestbps': {'value': 390.0}},
 'dataset': {}}

# Frontend module
Logic-free, just turns an information json to a user-readable report with images, which then prints out for the user to see.

In [None]:
class TextTemplates:
    """Placeholder for the text templates of the front end.

    Nothing is implemented yet; the methods only have bodies so the class
    is valid Python (they used to be bare ``def`` lines, a syntax error).
    """

    def __init__(self):
        """Create the object; there is no state to set up yet."""
        pass

    def red_json(self,):
        """Not implemented yet; returns ``None``."""
        pass
# Layout of the configuration with a "graphs" section (one entry per column
# to draw) and a "text" section (one entry per sentence to write).
# NOTE(review): the entries look like illustrative sample values - confirm.
PRETTY_JSON = {
    "graphs": {
        "some_col":{
            "type" : "bars",
            # Category -> colour used to draw it.
            "cat_colors" : {
                "cat" : "red",
                "dog" : "blue",
            },
            "bg_color" : "white"
        }
    },
    "text" : {
        "some_col_avg" : {
            "reason" : "high",
            "number_ds" : 1.3,
            "number_domain": 0.2
        }
    }
}

In [10]:
# Graph type name -> what draws it.
# NOTE(review): the values are dummy strings for now - presumably to be
# replaced by the real graph classes; confirm.
GRAPH_CLASSES = {
    "bars" : "asdasd",
    "pie" : "asdasd",
    "timeline" : ""
    #etc
}
class FrontEnd:
    """Turns a report configuration into its user-readable representation.

    Attributes:
        graphs_config: the ``"graphs"`` section of the configuration.
        text_config: the ``"text"`` section of the configuration.
    """

    def __init__(self, config):
        """Keep the graph and text sections of ``config``.

        A missing section (or a ``None`` config) is treated as empty. The
        two attributes used to be left unset, so both ``generate_*`` methods
        raised AttributeError.
        """
        config = config or {}
        self.graphs_config = config.get("graphs", {})
        self.text_config = config.get("text", {})

    def generate_graphs(self):
        """Print the name of every configured graph (drawing is still to do)."""
        for config in self.graphs_config:
            print(config)

    def generate_text(self):
        """Print one empty line per configured text (wording is still to do)."""
        for config in self.text_config:
            print()
        #text things
        #graphical things

In [13]:
# Execution parameters; the keys are the fields a StoreDomain reads from
# its stored representation (domain_name, attribute_list, knowledge).
# NOTE(review): not used by the code below - confirm whether it is still needed.
EXEC_PARAMS = {
    "domain_name" : "animals",
    "attribute_list": [] ,
    "knowledge" : {}
}
# Entry point when the file is run as a script.
if __name__ == "__main__":
    """First we read our data"""
    # Raises if "nonsense.csv" is not in the working directory.
    df = pd.read_csv("nonsense.csv")
    #Do everything from the reporter class to get the report
    #Feed the report to the frontend class for the pretty representation
    print("henlo")

henlo
