
# Libraries and Tools used

In [78]:
"""Our graphical elements are going to be done with matplotlib, inline graphs"""
%matplotlib inline
import matplotlib.pyplot as plt
"""CSV: utility to work with the most common format for datastorage : Comma Separated Values"""
import csv
"""JSON: utility to work with JavaScript Object Notation, the original information format storage we """
import json
from collections import namedtuple
"""Pandas: python's most used library to work with datasets"""
import pandas as pd 
"""Numpy: python's most used library to work with large amounts of numbers"""
import numpy as np
"""OS: utility to work with paths and file openings independently of operating system"""
import os
"""Glob: used to look for file extensions inside given folders"""
import glob 

"""Candidates for ML predictive model implementations"""
import tensorflow as tf
import sklearn
import catboost


# The Storage module

### Methods 

#### __init__
#### print_info
#### from_json
#### to_json
#### update_stats
#### load_from_dataframe
### TODO
- Set paths as environment variables (?)

In [123]:
class StoreDomain:
    
    def __init__(self, domain_name=None, attribute_list=None, knowledge=None):
        """ Handy initialization, if no values are provided everything is set to None
            This way we can check wether an attribute has been set by doing 
            if not self.{attribute} 
        """
        
        self.domain_name = domain_name
        self.attribute_list = attribute_list
        self.knowledge = knowledge

    def from_json(self,data):
        """ Loads the class from a physical representation (dict) of it
        """
        
        self.domain_name = data["domain_name"]
        self.domain_name = data["attribute_list"]
        self.knowledge = data["knowledge"]

    def print_info(self):
        """ Util to print attributes
        """
        print(self.domain_name, self.attribute_list, self.knowledge)
        

    def load_from_dataframe(self, df, paths=[os.getcwd()]):
        """ Given a dataset searches for a domain in the specified folder(s)
        """
        found = False
        for path in paths:
            for fnm in glob.glob(os.path.join(path,"*.json")):
                with open(fnm) as f:
                    data = json.load(f)
                if data["attribute_list"] == list(df.columns):
                    self.from_json(data)
                    found = True
                    break;
        
        if not found:
            print("No matching knowledge found for your domain, setting an empty one")
            self.attribute_list = list(df.columns)
            self.domain_name = df.name
            self.knowledge = {}

    # Open all files 
    def update_stats(self, knowledge):
        """ Syntactic sugar used by the analyzer to update the domain knowledge before saving it
        """
        self.knowledge = knowledge
        
    def to_json(self,fnm=None):
        """ Saves itself as an object in .json format given in the specificed fnm
        """
        if fnm is None:
            fnm = df.name + ".json"
        with open(fnm,'w') as f:
            json.dump(self.__dict__, f)

In [124]:
## StoreDomain tests

df = pd.read_csv("nonsense.csv")
del df['Unnamed: 0']

df.name = "nonzenze"
std = StoreDomain()

std.load_from_dataframe(df)
std.knowledge = KNOWLEDGE_STRUCTURE

std.to_json()
std.print_info()
#dom = sd.domain_from_dataset(ds,paths)

['/Users/brick/Desktop/Uni/tfg/notebooks']
['a', 'b'] None {'dataset_stats': {'individuals': 0, 'max_rows': {'value': 1400, 'metric': 'count_rows_null'}, 'avg_rows': {'value': 1400, 'metric': 'count_rows_null'}, 'full_rows': {'value': 0.97}}, 'column_stats': {'a': {'type': 'numeric', 'color': 'blue', 'stats': {'median': 12.6, 'std_dev': 1.3, 'max': 25.5, 'min': 2.3, 'NaNs': 0.02}}, 'b': {'type': 'categorical', 'color': 'red', 'stats': {'most_frequent': 'cat', 'values': {'cat': 0.2, 'dog': 0.6}, 'nan': 0.1}}}}


In [93]:
data = json.loads(json.dumps({"a":[1,2], "b":2}))
b = {}
for k,v in data.items():
    b[k] = data[k]
b

{'a': [1, 2], 'b': 2}

# The JSON
&nbsp;

 This object is used to represent the raw information stored.

 It contains statistics and properties from both statistics and datasets, different to different types, as well as how they were measured (by saving the function to be called on a pandas dataset in the case of a dataset stat, or on the values themselves in the case of a function stat).

 Saving how they're measured is important to measure new datasets and to be able to compare metrics effectively.

 Both the column specific stats and the dataset stats are subjective to change, and the knowledgedomain object can be modified and adapted to fit new parameters easily.

 This structure is generated for a single dataset and then combined with the domain one to take account of the new one.



In [34]:
KNOWLEDGE_STRUCTURE = {
    "dataset_stats": {
        "individuals": 0,
        "max_rows": {
            "value": 1400,
            "metric" : "count_rows_null",
        }, # alert if new rows are <<<, as different results are skewed
        "avg_rows": {
            "value": 1400,
            "metric" : "count_rows_null"
        },
        "full_rows": {
            "value": 0.97
        }
    },
    "column_stats": {
        "a": {
            "type": "numeric",
            "color": "blue",
            "stats": {
                "median": 12.6,
                "std_dev": 1.3,
                "max": 25.5,
                "min": 2.3,
                "NaNs": 0.02 # as % of dataset
            }
            
        },
        "b": {
            "type": "categorical",
            "color": "red",
            "stats": {
                "most_frequent": "cat",
                "values": {
                    "cat": 0.2,
                    "dog": 0.6
                },
                "nan": 0.1
            }
            
        }

    }
}


# The analysis module


In [None]:
""" Metrics """
def count_rows_null(df):
    return df.isnull().shape(0)

In [28]:
class DatasetAnalyzer():
    def __init__(self, ds, knowledge=None):
        self.dataset = ds
        if not knowledge:
            # now search and load information from domain
            self.domain_knowledge = StoreDomain(ds,paths=[])
        else:
            self.domain_knowledge = knowledge
        self.dataset_knowledge = self.get_stats(ds)

    def __init__(self,ds,knowledge):
        self.dataset = ds
        self.domain_knowledge = knowledge
        
    def load_knowledge(self, dataset):
        std = StoreDomain()
    def get_stats(self, dataset):
        stats = {}
        for c in dataset.columns:
            #for every column save on the 
            stats.column_stats[c] = globals()[self.knowledge.column_stats[c].metric](dataset[c].values)
        
        for k in self.knowledge.dataset_stats:
            stats.dataset_stats[k] = globals()[self.knowledge.dataset_stats[k].metric](dataset)
        
        return stats
    
    def compare(self):
        # compare self.domain_knowledge and self.dataset_knowledge
        # store analysis result in self.result
        return 0

# Report generation

In [8]:
class ReportGenerator:
    def __init__(self, analysis_result, domain_info):
        self.analysis_result = analysis_result
        print("  henlo")
        
    def generate():
        # compare the result analysis with our knowledge of the domain, then select the relevant stuff, store json
        
        #in self.report_json
        return 0

# Frontend module
Logic-free, just turns an information json to a user-readable report with images, which then prints out for the user to see.

In [None]:
class TextTemplates:
    def __init__(self):
    def red_json(self,):
PRETTY_JSON = {
    "graphs": {
        "some_col":{
            "type" : "bars",
            "cat_colors" : {
                "cat" : "red",
                "dog" : "blue",
            },
            "bg_color" : "white"
        }
    },
    "text" : {
        "some_col_avg" : {
            "reason" : "high",
            "number_ds" : 1.3,
            "number_domain": 0.2
        }
    }
}

In [10]:
class FrontEnd:
    def __init__(self, config):
        """Self explaining, associate graph with """
        self.graphs_config = config.graphs
        self.text_config = config.text
    def generate_graphs(self):
        for config in self.graphs_config:
            print(config)
    def generate_text(self):
        for config in self.text_config:
            print()
        #text things
        #graphical things
        return 0

# Program flow

In [31]:
EXEC_PARAMS = {
    "domain_name" : "animals",
    "attribute_list": [] ,
    "knowledge" : KNOWLEDGE_STRUCTURE
}
if __name__ == "__main__":
    """First we read our data"""
    df = pd.read_csv("nonsense.csv")
    dsa = DatasetAnalyzer(df)
    #load dataset into analyzer
    #get report generator
    #use the froentend module to present it to the user
    #if the user gives feedback, frontend saves it as important in the store (?) information
    print("henlo")

TypeError: __init__() missing 1 required positional argument: 'knowledge'