#### This notebook is now primarily used as a scrap notebook for quick code-testing purposes.

In [None]:
# Format verifier.

In [67]:
import pandas as pd
import numpy as np

In [68]:
df = pd.read_csv('../data/raw_listings.csv')

In [69]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [70]:
#historical_dataset_info.json

In [106]:
import os
import json
import logging

from scipy import stats

In [109]:

class FormatVerifier:
    def __init__(self, p_value=0.05, missing_value_deviation=0.1):
        """
        TODO: Descr.
        """
        self.p_value = p_value
        self.missing_deviation = missing_value_deviation
        
        # Setup Logging.
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s [%(levelname)s] %(message)s",
            handlers=[
                logging.FileHandler("format_verifier.log", encoding='utf-8'),
                logging.StreamHandler()
            ]
        )
        
        # Load, if historical_dataset_info file exists.
        if os.path.exists('historical_dataset_info.json'):
            with open('historical_dataset_info.json', 'r') as f:
                self.historical_info = json.load(f)
        else:
            self.historical_info = {
                'names': {
                    'variable_names': [],
                    'value_names': []
                },
                'types': {
                    'string': [],
                },
                'statistics': {
                    "ArtimiausiaMokymoĮstaiga": {
                        'std': 203.43218318364976,
                        "mean": 294.328542,
                        "min": 1.00,
                        "max": 980.0, 
                        "missing": .026,
                        "samples": 500
                    }
                }
            }
            
        pass
            
    def t_test(self, x_stats, y_stats):
        """
        Customized t-test.

        Sources: 
            [1] https://www.medcalc.org/calc/comparison_of_means.php
            [2] https://towardsdatascience.com/inferential-statistics-series-t-test-using-numpy-2718f8f9bf2f
        """
        # Get the pooled standard deviation.
        s = np.sqrt(
            ((x_stats['samples'] - 1)*x_stats['std']**2 + (y_stats['samples'] - 1)*y_stats['std']**2)  / (x_stats['samples'] + y_stats['samples'] - 2)
        )

        # Get the t-statistic.
        t = (x_stats['mean'] - y_stats['mean'])/(s*np.sqrt(2/(x_stats['samples'] + y_stats['samples'])))

        # Get the degrees-of-freedom.
        df = 2*(x_stats['samples'] + y_stats['samples']) - 2

        # Get the p-value.
        p = 1 - stats.t.cdf(t,df=df)

        return p
        
    def check_names(self, df):
        """
        Checks for any old and any new formats.
        """
        column_names = df.columns.values

        # Split pseudo-categorical variables to get their Variable and Value information.
        column_names_split = [name.split('_') for name in column_names]
        variable_names = [name[0] for name in column_names_split if len(name) == 1]
        value_names = [name[1] for name in column_names_split if len(name) == 2]

        # Check and report if all of the old variables are present. 
        # Values are not checked because they can vary day-to-day.
        variable_names_not_found = [name for name in self.historical_info['names']['variable_names'] if name not in variable_names]
        
        if len(variable_names_not_found) > 0:
            logging.warning('Variables expected, but not found in the dataset: {}'.format(variable_names_not_found))
        else:
            logging.info('Found all of the expected Variables.')
        
        variable_names_new = [name for name in variable_names if name not in self.historical_info['names']['variable_names']]
        value_names_new = [name for name in variable_names if name not in self.historical_info['names']['value_names']]
        
        # Report any new Variable/Value names if any were found.
        if len(variable_names_new) > 0:
            logging.warning('Found previously unseen Variables: {}'.format(variable_names_new))
            self.historical_info['names']['variable_names'].extend(variable_names_new)
        else:
            logging.info('Found no new Variables')
        
        if len(value_names_new) > 0:
            logging.warning('Found previously unseen value names: {}'.format(value_names_new))
            self.historical_info['names']['variable_names'].extend(value_names_new)
        else:
            logging.info('Found no new Values.')
        
        pass
    
    def check_types(self, df):
        """
        Checks if all the data types match up. Everything should be numeric except strings in config.
        """
        
        names_strings = df.select_dtypes('object').columns.values
        names_strings_unexpected = [name for name in names_strings if name not in self.historical_info['types']['string']]
        
        if len(names_strings_unexpected) > 0:
            logging.warning('Found variables are note expected to be "object" type: {}.'.format(names_strings_unexpected))
        else:
            logging.info('Found no new "ojbect" type variables.')
        pass
    
    def check_statistics(self, df):
        """
        Checks for statistical differences between historical_statistics and current batch.
        """
        
        # Get current batch statistics, add missing value percentage as well as number of samples.
        statistics = df.select_dtypes(exclude='object').describe().T
        statistics = statistics[['mean', 'std', 'min', 'max']]
        statistics['missing'] = df.isna().mean()
        statistics['samples'] = df.shape[0]
        
        # Split variables into ones with historical statistical data and ones without.
        variables_existing = [name for name in statistics.index.values if name in self.historical_info['statistics'].keys()]
        variables_new = [name for name in statistics.index.values if name not in self.historical_info['statistics'].keys()]
        
        # Save new variable statistics and report. 
        for variable in variables_new:
            self.historical_info['statistics'][variable] = statistics.loc[variable].to_dict()
        
        logging.info('Saved new statistics for Variables: {}'.format(variables_new))

            
        # STATISTICAL TESTS.
        variables_failed_test = []
        variables_failed_missing = []
        for variable in variables_existing:
            
            # Perform a t-test, add to variables_failed if it failed the test.
            p_value = self.t_test(self.historical_info['statistics'][variable], statistics.loc[variable].to_dict())
            if p_value <= self.p_value:
                variables_failed_test.append(variable)
                
            # Compare missing values with self.missing_deviation to see if it's more than expected.
            missing_difference =abs(self.historical_info['statistics'][variable]['missing'] - statistics.loc[variable]['missing'])
            if missing_difference >= self.missing_deviation:
                variables_failed_missing.append(variable)
                
                
            # Update the historical info of existing variables.
            # Mean.
            samples_new = (self.historical_info['statistics'][variable]['samples'] + statistics.loc[variable]['samples'])
            mean_new = (self.historical_info['statistics'][variable]['mean'] + statistics.loc[variable]['mean']) / samples_new
            
            # Update the standard deviation. Source: https://math.stackexchange.com/questions/775391/can-i-calculate-the-new-standard-deviation-when-adding-a-value-without-knowing-t
            std_new = np.sqrt(
                (samples_new -2) * statistics.loc[variable]['std'] + ()
            )
            
        # SAVE THE EXISTING VARIABLE STATS
    
        pass
        

format_verifier = FormatVerifier()
#format_verifier.check_names(df)
#format_verifier.check_types(df)
format_verifier.check_statistics(df)

2020-05-22 14:51:47,231 [INFO] Saved new statistics for Variables: ['Apsauga_KodinėLaiptinėsSpyna', 'Apsauga_Videokameros', 'Apsauga_ŠarvuotosDurys', 'ArtimiausiaParduotuvė', 'ArtimiausiasDarželis', 'Aukštas', 'AukštųSk', 'KainaMėn', 'KambariųSk', 'Metai', 'Nusikaltimai500MSpinduliuPraėjusįMėnesį', 'PapildomaĮranga_DušoKabina', 'PapildomaĮranga_Indaplovė', 'PapildomaĮranga_PlastikiniaiVamzdžiai', 'PapildomaĮranga_SkalbimoMašina', 'PapildomaĮranga_SuBaldais', 'PapildomaĮranga_VirtuvėsKomplektas', 'PapildomaĮranga_Viryklė', 'PapildomaĮranga_Šaldytuvas', 'PapildomosPatalpos_Balkonas', 'PapildomosPatalpos_Rūsys', 'Plotas', 'ViešojoTransportoStotelė', 'Ypatybės_Internetas', 'Ypatybės_KabelinėTelevizija', 'Ypatybės_NaujaElektrosInstaliacija', 'Ypatybės_NaujaKanalizacija', 'Ypatybės_VirtuvėSujungtaSuKambariu', 'ListingViewsToday', 'ListingViewsTotal', 'PapildomosPatalpos_VietaAutomobiliui', 'VidutiniškaiTiekKainuotųŠildymas1Mėn', 'Ypatybės_UždarasKiemas', 'PapildomaĮranga_Vonia', 'Ypatybės_Au

0.0


In [110]:
np.mean([1, 2, 3, 4])

2.5

In [114]:
np.sqrt(6/4)

1.224744871391589

In [116]:
np.std([1, 2, 3, 4], ddof=0)

1.118033988749895

In [None]:
#std deviation
s = np.sqrt((var_a + var_b)/2)
s



## Calculate the t-statistics
t = (a.mean() - b.mean())/(s*np.sqrt(2/N))



## Compare with the critical t-value
#Degrees of freedom
df = 2*N - 2

#p-value after comparison with the t 
p = 1 - stats.t.cdf(t,df=df)


print("t = " + str(t))
print("p = " + str(2*p))
### You can see that after comparing the t statistic with the critical t value (computed internally) we get a good p value of 0.0005 and thus we reject the null hypothesis and thus it proves that the mean of the two distributions are different and statistically significant.


## Cross Checking with the internal scipy function
t2, p2 = stats.ttest_ind(a,b)
print("t = " + str(t2))
print("p = " + str(p2))

In [73]:
"string": = ['ObjectDescription', 'ListingUrl', 'RealtorName', 'RealtorOrganization', 'BuildingEnergyClass', 'BuildingEnergyClassCategory', 'BuildingCity',  'BuildingNeighbourhood', 'BuildingStreet', 'PastatoTipas', 'Šildymas', 'Įrengimas', 'NamoNumeris', 'ButoNumeris', 'VidutiniškaiTiekKainuotųŠildymas1Mėn', 'PastatoEnergijosSuvartojimoKlasė']

SyntaxError: invalid syntax (<ipython-input-73-ffe0b0c2bc56>, line 1)