# Importing all necessary packages

In [86]:
import numpy as np
import pandas as pd
import scipy.stats
import random
from fitter import Fitter
import attr
from collections import defaultdict

# Class Definition

In [138]:
class FitDistribution(object):
    """Fit data to known distributions and simulate new data from the fit.

    Input:
    ------
    - data: array-like or dataframe.
        Use the ``*Array*`` methods for a single 1-D array/Series and the
        ``*Df*`` methods for a DataFrame.
    - dists: list.
        This parameter contains a list of distributions to be explored.
        It is forwarded to ``Fitter`` as-is; when None, ``Fitter`` picks
        its own default set of scipy distributions.
    - timeout: int.
        Maximum time for a given distribution. If timeout is reached,
        the distribution is skipped.
    - verbose: bool.
        Stored on the instance; not used by any method of this class.
    - bins: int.
        Numbers of bins to be used for the cumulative histogram. This has
        an impact on the quality of the fit.

    Attributes:
    -----------
    - ArrayDistDict: result of the last ``FindArrayDist`` call.
    - params_dict: per-column results accumulated by ``FindDfDist``.
    """

    def __init__(self, data, dists=None, timeout=30, verbose=False, bins=100):
        self.data = data
        self.dists = dists
        self.timeout = timeout
        self.verbose = verbose
        self.bins = bins
        self.ArrayDistDict = defaultdict(dict)
        self.params_dict = defaultdict(dict)

    def _fit_one(self, data, categorical):
        """Fit a single 1-D array/Series and return its description.

        Returns a dict with the keys 'distribution' and 'parameters'.
        """
        if categorical:
            values = np.asarray(data)
            if values.size == 0:
                raise ValueError(
                    "cannot fit a categorical distribution to empty data")
            # Categories are assumed to be coded as non-negative integers
            # (np.bincount requirement); codes 0..max form the support.
            support = np.arange(values.max() + 1)
            probs = np.bincount(values) / len(values)
            return {'distribution': 'categorical',
                    'parameters': [support, probs]}

        fitter_object = Fitter(data=data,
                               distributions=self.dists,
                               timeout=self.timeout,
                               bins=self.bins)
        fitter_object.fit()
        # get_best() returns a one-item dict: {distribution name: parameters}.
        name, params = next(iter(fitter_object.get_best().items()))
        return {'distribution': name, 'parameters': params}

    @staticmethod
    def _draw(fit, size):
        """Draw ``size`` samples from a fit produced by ``_fit_one``."""
        name = fit.get('distribution')
        params = fit.get('parameters')
        if not isinstance(name, str) or params is None:
            raise RuntimeError(
                "no fitted distribution available; run FindArrayDist or "
                "FindDfDist before simulating")

        if name == 'categorical':
            return np.random.choice(a=params[0], p=params[1], size=size)

        dist = getattr(scipy.stats, name)
        # Depending on the fitter version, parameters come back either as a
        # positional tuple or as a {name: value} mapping; support both.
        if isinstance(params, dict):
            return dist.rvs(size=size, **params)
        return dist.rvs(*params, size=size)

    def FindArrayDist(self, cat_var):
        """Function to extract the best distribution for ``self.data``.
        Uses the fit method from the Fitter module in the fitter library
        for continuous data and observed frequencies for categorical data.
        Inputs:
        -------
        - cat_var: boolean
            Boolean to signify whether the variable to be simulated
            is discrete/categorical or continuous.

        Outputs:
        -------
        Dictionary with the best distribution name ('distribution') and
        the parameters associated with it ('parameters'). For categorical
        data the parameters are [category codes, probabilities]. The same
        dictionary is stored in ``self.ArrayDistDict``."""
        # Equality (not truthiness) is kept on purpose so that non-boolean
        # arguments such as strings still select the continuous path.
        categorical = bool(cat_var == True)  # noqa: E712
        self.ArrayDistDict = self._fit_one(self.data, categorical)
        return self.ArrayDistDict

    def SimArray(self, size=100):
        """Function to simulate data for an array based on the best fitted
        distribution found by ``FindArrayDist``.
        Input:
        -----
        - size : int
                size of the array to be simulated.
        Outputs:
        -------
        Simulated array based on the best fit distribution.
        Raises:
        -------
        RuntimeError if no distribution has been fitted yet."""
        return self._draw(self.ArrayDistDict, size)

    def FindDfDist(self, var_types):
        """Function to extract the best distribution for every column of the
        dataframe in ``self.data``.
        Inputs:
        -------
        - var_types: dictionary
            Dictionary with keys as column names for dataset variables, the value
            of each key is a string showing whether the variable is discrete/cat
            or continuous. Only the value 'categorical' selects the categorical
            treatment; any other value is fitted as continuous.

        Outputs:
        -------
        Dictionary (``self.params_dict``) mapping each column name to a
        dictionary with the best distribution name ('distribution') and the
        parameters associated with it ('parameters').
        """
        for column in self.data.columns:
            categorical = var_types[column] == 'categorical'
            self.params_dict[column] = self._fit_one(self.data[column],
                                                     categorical)
        return self.params_dict

    def SimDf(self, size=1000):
        """Function to simulate ``size`` rows based on the per-column
        distributions/parameters stored by ``FindDfDist``.
        Inputs:
        -------
        - size : int
                number of rows to be simulated.
        Outputs:
        -------
        DataFrame object with one simulated column per fitted column.
        """
        simulated = {column: self._draw(fit, size)
                     for column, fit in self.params_dict.items()}
        return pd.DataFrame(simulated)