# Data input preparation

- Get a sample if the input dataframe is too large.
- Reduce number of possible values in a categorical variable.

In [2]:
%matplotlib inline
import os
import sys
sys.path.append(os.path.abspath('../'))
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import load_iris
import pandas as pd
pd.options.display.max_rows = 999
pd.options.display.max_columns = 99
import numpy as np
from datetime import datetime
from eda import describe_info, describe_numeric, describe_categorical, describe_datetime, describe_bivariate, describe_missing, describe_duplicates

## LOAD *Weather* data

In [53]:
# Download the weather dataset and parse its timestamp column.
path = 'https://raw.githubusercontent.com/jmquintana79/utilsDS/master/scripts/datasets/data/dataset.weather.csv.gz'
data = pd.read_csv(path)
data['datetime'] = pd.to_datetime(data['datetime'])
# Constant datetime column: the same fixed timestamp repeated on every row.
data['dtnow'] = [datetime(2022,1,1,12,0,0)] * len(data)
# Random datetime column: hourly timestamps drawn without replacement
# from 2015-01-01..2018-01-01 and sorted in ascending order.
data['dtrandom'] = pd.to_datetime(
    np.sort(
        np.random.choice(
            pd.date_range('2015-01-01', '2018-01-01', freq='H'),
            len(data),
            replace=False,
        )
    )
)
data.shape

(17544, 16)

### DATA PROCESSING

In [52]:
## data preparation prior to analysis
def preparation(df:pd.DataFrame, max_num_rows:int = 5000, max_size_cats:int = 5, verbose:bool = True)->pd.DataFrame:
    """
    Prepare a dataframe before it is analyzed.

    Two steps are applied:
    1. If `df` has more than `max_num_rows` rows, a random sample of
       `max_num_rows` rows is taken (fixed random_state, so it is reproducible).
    2. For every column of dtype object, int64, category or bool that has more
       than `max_size_cats` distinct non-null values, the least frequent values
       are replaced by the label "other", keeping the `max_size_cats - 1` most
       frequent values plus "other". Missing values are left untouched.

    The input dataframe is not modified; a new dataframe is returned.

    df -- data to be prepared.
    max_num_rows -- maximum number of rows allowed without considering a sample (default, 5000).
    max_size_cats -- maximum number of possible values in a categorical variable to be allowed (default, 5).
    verbose -- display extra information (default, True).
    return -- processed data.
    """

    ## get random sample if there are too much data

    if len(df) > max_num_rows:
        # sampling already returns a new dataframe
        df = df.sample(max_num_rows, random_state = 8)
        if verbose:
            print(f"[warning] It has taken a random sample with {len(df)} records.")
    else:
        # work on a copy so the caller's dataframe is never modified in place
        df = df.copy()

    ## get simplified categorical columns reducing the number of possible values

    # names of the columns handled as categorical
    cols_cat = df.select_dtypes(include=['object', 'int64', 'category', 'bool']).columns.values
    for col in cols_cat:
        # frequencies are computed on the dataframe being processed (not on any
        # global object), so they match the rows that are actually returned
        freqs = df[col].value_counts(normalize=True, sort=True, ascending=False, dropna=True)
        # distinct non-null values ordered from most to least frequent
        values = freqs.index.values
        # nothing to simplify for this column
        if len(values) <= max_size_cats:
            continue
        # least frequent values to be merged; a set gives O(1) membership tests
        values_to_replace = set(values[max_size_cats-1:])
        # replace less frequent values by "other"
        df[col] = df[col].apply(lambda x: "other" if x in values_to_replace else x)
        # validate: kept values + "other" must add up to max_size_cats
        if df[col].nunique(dropna=True) != max_size_cats:
            print(f"[error] something was wrong in column '{col}' reducing its possible values.")
        elif verbose:
            print(f"[info] it was simplified the categorical variable '{col}'.")
    # return
    return df

In [54]:
data_processed = preparation(data)

[info] it was simplified the categorical variable 'WD'.
[info] it was simplified the categorical variable 'cloud_coverage'.


In [55]:
data_processed.shape

(5000, 16)

In [58]:
data_processed['WD'].unique()

array(['SSE', 'other', 'NNW', 'NW', 'S', nan], dtype=object)

In [59]:
data_processed['cloud_coverage'].unique()

array([nan, 'other', '10-', '10', '0', '0+'], dtype=object)