#### This notebook is now primarily used as a scrap notebook for quick code-testing purposes.

In [None]:
# Format verifier.

In [94]:
import pandas as pd
import numpy as np

In [95]:
# Load the raw listings CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/raw_listings.csv')

In [96]:
# Raise both display limits to 500 so long/wide frames are shown without truncation.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
)

In [97]:
#historical_dataset_info.json

In [98]:
import os
import json
import logging

from scipy import stats

In [178]:
class NpEncoder(json.JSONEncoder):
    """
    JSON encoder that converts NumPy values to their plain Python equivalents.

    Handles NumPy booleans, integers, floating-point scalars and arrays; any
    other unsupported object is passed on to ``json.JSONEncoder.default``,
    which raises ``TypeError``.
    """

    def default(self, obj):
        """
        Return a JSON-serialisable stand-in for ``obj``.

        Args:
            obj: The object the standard encoder could not serialise.

        Returns:
            bool, int, float or list, depending on the NumPy type of ``obj``.

        Raises:
            TypeError: if ``obj`` is not one of the supported NumPy types.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [203]:
# Scratch check: the square root of a negative real evaluates to nan (NumPy emits a warning)
# rather than raising, which is why the running-std update has to guard against it.
np.sqrt(-1)

  np.sqrt(-1)


nan

In [205]:

class FormatVerifier:
    def __init__(self, p_value=0.05, missing_value_deviation=0.1):
        """
        TODO: Descr.
        """
        self.p_value = p_value
        self.missing_deviation = missing_value_deviation
        
        # Setup Logging.
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s [%(levelname)s] %(message)s",
            handlers=[
                logging.FileHandler("format_verifier.log", encoding='utf-8'),
                logging.StreamHandler()
            ]
        )
        # Load config file.
        with open('config_verifier.json', 'r', encoding='utf-8') as f:
                self.config = json.load(f)
        
        # Load, if historical_dataset_info file exists.
        if os.path.exists('historical_dataset_info.json'):
            with open('historical_dataset_info.json', 'r', encoding='utf-8') as f:
                self.historical_info = json.load(f)
        else:
            self.historical_info = {
                'names': {
                    'variable_names': [],
                    'value_names': []
                },
                'statistics': {}
            }
            
        pass
            
    def t_test(self, x_stats, y_stats):
        """
        Customized t-test.

        Sources: 
            [1] https://www.medcalc.org/calc/comparison_of_means.php
            [2] https://towardsdatascience.com/inferential-statistics-series-t-test-using-numpy-2718f8f9bf2f
        """
        # Get the pooled standard deviation.
        s = np.sqrt(
            ((x_stats['samples'] - 1)*x_stats['std']**2 + (y_stats['samples'] - 1)*y_stats['std']**2)  / (x_stats['samples'] + y_stats['samples'] - 2)
        )

        # Get the t-statistic. + 0.0001 is to avoid dividing by 0.
        t = (x_stats['mean'] - y_stats['mean'])/(s*np.sqrt(2/(x_stats['samples'] + y_stats['samples'])) + 0.0001)

        # Get the degrees-of-freedom.
        df = 2*(x_stats['samples'] + y_stats['samples']) - 2

        # Get the p-value.
        p = 1 - stats.t.cdf(t,df=df)

        return p
        
    def check_names(self, df):
        """
        Checks for any old and any new formats.
        """
        column_names = df.columns.values

        # Split pseudo-categorical variables to get their Variable and Value information.
        column_names_split = [name.split('_') for name in column_names]
        variable_names = [name[0] for name in column_names_split if len(name) == 1]
        value_names = [name[1] for name in column_names_split if len(name) == 2]

        # Check and report if all of the old variables are present. 
        # Values are not checked because they can vary day-to-day.
        variable_names_not_found = [name for name in self.historical_info['names']['variable_names'] if name not in variable_names]
        
        if len(variable_names_not_found) > 0:
            logging.warning('Variables expected, but not found in the dataset: {}'.format(variable_names_not_found))
        else:
            logging.info('Found all of the expected Variables.')
        
        variable_names_new = [name for name in variable_names if name not in self.historical_info['names']['variable_names']]
        value_names_new = [name for name in variable_names if name not in self.historical_info['names']['value_names']]
        
        # Report any new Variable/Value names if any were found.
        if len(variable_names_new) > 0:
            logging.warning('Found previously unseen Variables: {}'.format(variable_names_new))
            self.historical_info['names']['variable_names'].extend(variable_names_new)
        else:
            logging.info('Found no new Variables')
        
        if len(value_names_new) > 0:
            logging.warning('Found previously unseen value names: {}'.format(value_names_new))
            self.historical_info['names']['value_names'].extend(value_names_new)
        else:
            logging.info('Found no new Values.')
        
        pass
    
    def check_types(self, df):
        """
        Checks if all the data types match up. Everything should be numeric except strings in config.
        """
        
        names_strings = df.select_dtypes('object').columns.values
        names_strings_unexpected = [name for name in names_strings if name not in self.config['types']['string']]
        
        if len(names_strings_unexpected) > 0:
            logging.warning('Found variables are not expected to be "object" type: {}.'.format(names_strings_unexpected))
        else:
            logging.info('Found no new "ojbect" type variables.')
        pass
    
    
    def check_statistics(self, df):
        """
        Checks for statistical differences between historical_statistics and current batch.
        """
        
        # Get current batch statistics, add missing value percentage as well as number of samples.
        statistics = df.select_dtypes(exclude='object').describe().T
        statistics = statistics[['mean', 'std', 'min', 'max']]
        statistics['missing'] = df.isna().mean()
        statistics['samples'] = df.count()
        statistics['samples_total'] = df.shape[0]
        statistics['sum'] = df.sum()
        statistics['sum_squares'] = (df.select_dtypes(exclude='object') ** 2).sum()
        
        # Split variables into ones with historical statistical data and ones without.
        variables_existing = [name for name in statistics.index.values if name in self.historical_info['statistics'].keys()]
        variables_new = [name for name in statistics.index.values if name not in self.historical_info['statistics'].keys()]
        
        # Save new variable statistics and report. 
        for variable in variables_new:
            self.historical_info['statistics'][variable] = statistics.loc[variable].to_dict()
        
        logging.info('Saved new statistics for Variables: {}'.format(variables_new))

            
        # Statistical tests.
        variables_failed_test = []
        variables_failed_missing = []
        for variable in variables_existing:
            
            # Perform a t-test, add to variables_failed if it failed the test.
            p_value = self.t_test(self.historical_info['statistics'][variable], statistics.loc[variable].to_dict())
            if p_value <= self.p_value:
                variables_failed_test.append(variable)
                
            # Compare missing values with self.missing_deviation to see if it's more than expected.
            missing_difference =abs(self.historical_info['statistics'][variable]['missing'] - statistics.loc[variable, 'missing'])
            if missing_difference >= self.missing_deviation:
                variables_failed_missing.append(variable)
                
                
            # Update the historical info of existing variables.
            # Mean, sample sizes.
            samples_total_new = self.historical_info['statistics'][variable]['samples_total'] + statistics.loc[variable, 'samples_total']
            samples_new = self.historical_info['statistics'][variable]['samples'] + statistics.loc[variable, 'samples']
            mean_new = (self.historical_info['statistics'][variable]['mean'] + statistics.loc[variable, 'mean']) / samples_new
            
            # Update the standard deviation. Source: https://stackoverflow.com/questions/1174984/how-to-efficiently-calculate-a-running-standard-deviation
            sum_total = statistics.loc[variable, 'sum'] + self.historical_info['statistics'][variable]['sum']
            sum_total_squares = statistics.loc[variable, 'sum_squares']  + self.historical_info['statistics'][variable]['sum_squares']
            n_samples = statistics.loc[variable, 'samples'] + statistics.loc[variable, 'samples']
            
            # Get standard error if sigma is > 0, otherwise set it to nan.
            sigma = (sum_total_squares / n_samples) - (sum_total / n_samples) ** 2
            std_new = np.sqrt(sigma) if sigma >= 0 else np.nan
            
            # Update min and max values.
            if statistics.loc[variable, 'min'] < self.historical_info['statistics'][variable]['min']:
                min_new = statistics.loc[variable, 'min']
            else:
                min_new = self.historical_info['statistics'][variable]['min']
                
            if statistics.loc[variable, 'max'] > self.historical_info['statistics'][variable]['max']:
                max_new = statistics.loc[variable, 'max']
            else:
                max_new = self.historical_info['statistics'][variable]['max']
            
            # Update the missing value percentage.
            missing_weight_old = self.historical_info['statistics'][variable]['samples_total'] / (self.historical_info['statistics'][variable]['samples_total'] + statistics.loc[variable, 'samples_total'])
            missing_weight_new = statistics.loc[variable, 'samples_total'] / (self.historical_info['statistics'][variable]['samples_total'] + statistics.loc[variable, 'samples_total'])
            
            missing_new = self.historical_info['statistics'][variable]['missing'] * missing_weight_old + statistics.loc[variable, 'missing'] * missing_weight_new
            
            
            # Set the updated values to historical_info.
            self.historical_info['statistics'][variable] = {
                'std': std_new,
                'mean': mean_new,
                'min': min_new,
                'max': max_new,
                'missing': missing_new,
                'samples': samples_new,
                'samples_total': samples_total_new,
                'sum': sum_total,
                'sum_squares': sum_total_squares
            }
            
        
        # Log the results.
        logging.info('Updated statistics for all existing Variables.')
        
        if len(variables_failed_test) > 0:
            logging.warning('Found Variables that have failed the statistical test with p-value of {0}: {1}.'.format(self.p_value, variables_failed_test))
        else:
            logging.info('All variables passed the statistical tests succesfully with a p-value of {0}.'.format(self.p_value))
               
        if len(variables_failed_missing) > 0:
            logging.warning('Found Variables that have failed the missing value check with missing value percentage deviation of {0}: {1}.'.format(self.missing_deviation, variables_failed_missing))
        else:
            logging.info('All variables passed the missing value check with missing value percentage deviation of {0}.'.format(self.missing_deviation))
            
        
        # Saving the updated historical info into a file.
        with open('historical_dataset_info.json', 'w') as f:
            json.dump(self.historical_info, f, cls=NpEncoder)
        
        logging.info('Succesfully updated historical_dataset_info.json file.')
        pass
    
    def verify(self, df):
        """
        Final function combining all of the merging.
        """
        
        logging.info('Executing data checks.')
        self.check_names(df)
        self.check_types(df)
        self.check_statistics(df)
        logging.info('Succesfully executed all the data checks.')
        
        pass
        

# Build the verifier (this reads config_verifier.json from the working directory)
# and run all checks on the frame loaded earlier in the notebook.
format_verifier = FormatVerifier()
format_verifier.verify(df)

2020-05-26 15:31:57,977 [INFO] Found all of the expected Variables.
2020-05-26 15:31:57,979 [INFO] Found no new Variables
2020-05-26 15:31:57,980 [INFO] Found no new Values.
2020-05-26 15:31:57,985 [INFO] Found no new "ojbect" type variables.
2020-05-26 15:31:58,317 [INFO] Saved new statistics for Variables: []
2020-05-26 15:31:58,368 [INFO] Updated statistics for all existing Variables.
2020-05-26 15:31:58,369 [INFO] All variables passed the statistical tests succesfully with a p-value of 0.05.
2020-05-26 15:31:58,370 [INFO] All variables passed the missing value check with missing value percentage deviation of 0.1.
2020-05-26 15:31:58,383 [INFO] Succesfully updated historical_dataset_info.json file.


In [13]:
import numpy as np

In [133]:
# Scratch check for pooling missing-value shares across batches.
# (missing count, total rows) for the historical data and for the incoming batch.
missing_old, total_old = 2, 4
missing_new, total_new = 3, 20

# Historical share on its own.
missing_p_old = missing_old / total_old

# Pooled share: every missing value over every row.
missing_p_new = sum((missing_new, missing_old)) / sum((total_new, total_old))

missing_p_new

0.20833333333333334

In [134]:
# Display the historical missing share for comparison with the pooled value.
missing_p_old

0.5

In [137]:
# Weighted form of the pooled share: each batch's missing share times its fraction of all rows.
# Agrees with missing_p_new up to floating-point rounding.
missing_p_old*(total_old / (total_old + total_new)) + (missing_new / total_new) * (total_new / (total_old + total_new))

0.20833333333333331

In [None]:
{
    "types": {
        "string": ["ObjectDescription", "ListingUrl", "RealtorName", "RealtorOrganization",
                   "BuildingEnergyClass", "BuildingEnergyClassCategory", "BuildingCity",
                   "BuildingNeighbourhood", "BuildingStreet", "PastatoTipas", "Šildymas", 
                   "Įrengimas", "NamoNumeris", "ButoNumeris", "VidutiniškaiTiekKainuotųŠildymas1Mėn",
                   "PastatoEnergijosSuvartojimoKlasė", "Įrengimas", "Šildymas",
                   "PastatoEnergijosSuvartojimoKlasė"]
    }
}