In [1]:
def clean_data(fi, fo, header, suffix):
    """Copy a raw CSV from ``fi`` to ``fo``, repairing rows split by commas inside 'nomprov'.

    Lines are split naively on ",", so a 'nomprov' value that itself contains
    commas yields surplus fields. The surplus pieces are assumed to belong to
    'nomprov' and are concatenated back together (the commas themselves are
    dropped), so every written row has exactly as many fields as the header.

    Args:
        fi: readable text file-like object positioned at the header line.
        fo: writable text file-like object that receives the cleaned rows.
        header: when true, the cleaned header line is written to ``fo`` first.
        suffix: string appended verbatim to every data row (not to the header).

    Raises:
        ValueError: if the header has no 'nomprov' column.
        AssertionError: if a row still has the wrong number of fields after repair
            (e.g. it has fewer fields than the header).
    """
    head = fi.readline().strip("\n").split(",")
    head = [h.strip('"') for h in head]

    # Locate the 'nomprov' column (last match wins, as before).
    ip = None
    for i, h in enumerate(head):
        if h == "nomprov":
            ip = i
    if ip is None:
        raise ValueError("header has no 'nomprov' column")
    print(ip)

    n = len(head)
    if header:
        fo.write("%s\n" % ",".join(head))

    print(n)
    for line in fi:
        fields = line.strip("\n").split(",")
        # Some 'nomprov' values contain commas; glue every surplus piece back
        # into the single 'nomprov' field, however many commas there were.
        extra = len(fields) - n
        if extra > 0:
            fields[ip:ip + extra + 1] = ["".join(fields[ip:ip + extra + 1])]
        assert len(fields) == n
        fields = [field.strip() for field in fields]
        fo.write("%s%s\n" % (",".join(fields), suffix))

# Build one combined CSV: cleaned train rows (with header) followed by cleaned
# test rows. Every test row gets 24 trailing commas, i.e. 24 extra empty fields
# (presumably padding for the columns only the train file has -- verify against the data).
# Input files are opened in `with` blocks so their handles are closed deterministically.
with open("data/8th.clean.all.csv", "w") as f:
    with open("data/train_ver2.csv") as train_file:
        clean_data(train_file, f, True, "")
    comma24 = "," * 24
    with open("data/test_ver2.csv") as test_file:
        clean_data(test_file, f, False, comma24)

20
48
20
24


In [3]:
import gzip
import math
import pickle
import zlib
import io

import pandas as pd
import numpy as np

# import scipy.stats

from sklearn.preprocessing import LabelEncoder

import engines
from utils import *

# Seed NumPy's global random generator with a fixed value.
np.random.seed(2016)

# Registry of fitted LabelEncoder objects keyed by column name; label_encode()
# fits and stores an encoder on first use and reuses it on later calls.
transformers = {}


# print unique count for single series
def assert_uniq(series, name):
    """Print the distinct values of ``series`` and their counts, labelled with ``name``.

    Despite the name nothing is asserted; this is a diagnostic print only.
    """
    print("assert_uniq", name, np.unique(series, return_counts=True))

# LEARNT one hot encode only designated values in column
def custom_one_hot(df, features, name, names, dtype=np.int8, check=False):
    """Add 0/1 indicator columns to ``df`` for selected values of column ``name``.

    Args:
        df: DataFrame to extend in place.
        features: list that receives the name of every indicator column created.
        name: source column.
        names: mapping ``suffix -> value``; each entry creates the column
            ``"<name>_<suffix>"`` holding 1 where the source equals ``value``, else 0.
        dtype: dtype of the indicator columns.
        check: when true, print the value counts of each new column via assert_uniq.
    """
    for suffix, target in names.items():
        indicator_col = "%s_%s" % (name, suffix)
        print(name, indicator_col)

        def is_target(value):
            # Anything not equal to the target (including missing values) maps to 0.
            if value == target:
                return 1
            return 0

        flags = df[name].map(is_target)
        df[indicator_col] = flags.astype(dtype)

        if check:
            assert_uniq(df[indicator_col], indicator_col)
        features.append(indicator_col)


# preprocessor for categorical columns
def label_encode(df, features, name):
    """Replace column ``name`` of ``df`` with integer labels and record it as a feature.

    The column is first converted to strings. The first call for a given column
    fits a new LabelEncoder and stores it in the module-level ``transformers``
    dict; later calls for the same column reuse that fitted encoder.
    """
    df[name] = df[name].astype('str')
    if name not in transformers:
        # First time this column is seen: register the encoder, then fit it.
        encoder = LabelEncoder()
        transformers[name] = encoder
        df[name] = encoder.fit_transform(df[name])
    else:
        # Encoder already fitted on an earlier call: only transform.
        df[name] = transformers[name].transform(df[name])
    features.append(name)


# extract top 100 frequent values and map it to original dataframe, out-of-rank is set to 0
def encode_top(s, count=100, dtype=np.int8):
    """Map each value of ``s`` to its frequency rank among the ``count`` most common values.

    Rank 1 is the most frequent value; values outside the top ``count`` map to 0.
    Values with equal frequency keep the ascending order produced by ``np.unique``.
    The result is a Series cast to ``dtype``.
    """
    values, counts = np.unique(s, return_counts=True)
    by_frequency = sorted(zip(values, counts), key=lambda pair: pair[1], reverse=True)
    rank_of = {}
    for rank, (value, _) in enumerate(by_frequency[:count], start=1):
        rank_of[value] = rank
    ranked = s.map(lambda x: rank_of.get(x, 0))
    return ranked.astype(dtype)


# preprocessing of each columns
def apply_transforms(train_df):
    """Encode the raw columns of ``train_df`` in place and collect the feature names.

    Categorical columns are label-encoded or turned into selected 0/1 indicators,
    numeric columns get their missing values filled and are cast to compact
    integer dtypes, and the two date columns are split into month/year parts.
    Helper columns ``fecha_dato_float``, ``fecha_alta_float`` and ``int_date``
    are added but not registered as features.

    Args:
        train_df: DataFrame holding the raw columns referenced below; it is mutated.

    Returns:
        ``(train_df, features)``: the same DataFrame object and a tuple of the
        feature column names in the order they were created.
    """
    features = []
    with Timer("apply transforms"):
        label_encode(train_df, features, "canal_entrada") # simple label encode
        # label_encode(train_df, features, "nomprov") # use cod_prov only
        label_encode(train_df, features, "pais_residencia") # simple label encode

        train_df["age"] = train_df["age"].fillna(0.0).astype(np.int16) # simple int conversion
        features.append("age")

        # Assign the filled column back instead of calling fillna(inplace=True) on
        # the column selection: under pandas Copy-on-Write a chained in-place call
        # does not update the parent frame, which would leave NaNs in "renta".
        train_df["renta"] = train_df["renta"].fillna(1.0)
        train_df["renta_top"] = encode_top(train_df["renta"]) # rank by top 100 frequency
        assert_uniq(train_df["renta_top"], "renta_top")
        features.append("renta_top")
        # log transform renta; missing values were filled with 1.0, so they become 0.0
        train_df["renta"] = train_df["renta"].map(math.log)
        features.append("renta")

        # negative or missing -> 0, everything else shifted up by one
        train_df["antiguedad"] = train_df["antiguedad"].map(lambda x: 0.0 if x < 0 or math.isnan(x) else x+1.0).astype(np.int16)
        features.append("antiguedad")

        train_df["tipodom"] = train_df["tipodom"].fillna(0.0).astype(np.int8)
        features.append("tipodom")

        train_df["cod_prov"] = train_df["cod_prov"].fillna(0.0).astype(np.int8)
        features.append("cod_prov")

        train_df["fecha_dato_month"] = train_df["fecha_dato"].map(lambda x: int(x.split("-")[1])).astype(np.int8)
        features.append("fecha_dato_month")
        train_df["fecha_dato_year"] = train_df["fecha_dato"].map(lambda x: float(x.split("-")[0])).astype(np.int16)
        features.append("fecha_dato_year")
        # a float entry in fecha_alta is treated as a missing date and mapped to 0
        train_df["fecha_alta_month"] = train_df["fecha_alta"].map(lambda x: 0.0 if x.__class__ is float else float(x.split("-")[1])).astype(np.int8)
        features.append("fecha_alta_month")
        train_df["fecha_alta_year"] = train_df["fecha_alta"].map(lambda x: 0.0 if x.__class__ is float else float(x.split("-")[0])).astype(np.int16)
        features.append("fecha_alta_year")

        # change date column to Y*12 + M float column
        train_df["fecha_dato_float"] = train_df["fecha_dato"].map(date_to_float)
        train_df["fecha_alta_float"] = train_df["fecha_alta"].map(date_to_float)

        train_df["dato_minus_alta"] = train_df["fecha_dato_float"] - train_df["fecha_alta_float"]
        features.append("dato_minus_alta")

        # int_date is the join key used later to build the lag features
        train_df["int_date"] = train_df["fecha_dato"].map(date_to_int).astype(np.int8)

        custom_one_hot(train_df, features, "indresi", {"n":"N"})
        custom_one_hot(train_df, features, "indext", {"s":"S"})
        custom_one_hot(train_df, features, "conyuemp", {"n":"N"})
        custom_one_hot(train_df, features, "sexo", {"h":"H", "v":"V"})
        custom_one_hot(train_df, features, "ind_empleado", {"a":"A", "b":"B", "f":"F", "n":"N"})
        custom_one_hot(train_df, features, "ind_nuevo", {"new":1})
        custom_one_hot(train_df, features, "segmento", {"top":"01 - TOP", "particulares":"02 - PARTICULARES", "universitario":"03 - UNIVERSITARIO"})
        custom_one_hot(train_df, features, "indfall", {"s":"S"})

        # missing -> 0, observed values shifted up by one
        train_df["ind_actividad_cliente"] = train_df["ind_actividad_cliente"].map(lambda x: 0.0 if math.isnan(x) else x+1.0).astype(np.int8)
        features.append("ind_actividad_cliente")
        custom_one_hot(train_df, features, "indrel", {"1":1, "99":99})
        # the letter code "P" becomes 5, missing becomes 0
        train_df["indrel_1mes"] = train_df["indrel_1mes"].map(lambda x: 5.0 if x == "P" else x).astype(float).fillna(0.0).astype(np.int8)
        assert_uniq(train_df["indrel_1mes"], "indrel_1mes")
        features.append("indrel_1mes")
        custom_one_hot(train_df, features, "tiprel_1mes", {"a":"A", "i":"I", "p":"P", "r":"R"}, check=True)

    return train_df, tuple(features)


# make lag features by adding step to int_date. Lags of the product columns only
def make_prev_df(train_df, step):
    """Build a lag table of the product columns shifted ``step`` periods forward.

    Each row keeps its ``ncodpers`` but has ``int_date`` increased by ``step``,
    so joining the result on (ncodpers, int_date) attaches the product values
    observed ``step`` periods earlier. Product columns are renamed
    ``"<product>_prev<step>"``.

    Returns:
        ``(prev_df, prod_features)``: the lag DataFrame and a tuple of its
        renamed product column names.
    """
    with Timer("make prev%s DF" % step):
        shifted_dates = train_df["int_date"].map(lambda d: d+step).astype(np.int8)

        prev_df = pd.DataFrame()
        prev_df["ncodpers"] = train_df["ncodpers"]
        prev_df["int_date"] = shifted_dates

        prod_features = []
        for prod in products:
            lag_name = "%s_prev%s" % (prod, step)
            prod_features.append(lag_name)
            prev_df[lag_name] = train_df[prod]
    return prev_df, tuple(prod_features)


# load train data and apply transforms
def load_data(fname="data/8th.clean.all.csv"):
    """Load the cleaned CSV, apply the column transforms and build five lag tables.

    Returns:
        ``(train_df, prev_dfs, features, prod_features)`` where
        - train_df: the loaded data with all transforms applied,
        - prev_dfs: list of the lag DataFrames for steps 1 through 5,
        - features: transform feature names followed by the lag-1 and lag-2
          product column names,
        - prod_features: the lag-1 product column names.
    """
    with Timer("load train csv"):
        train_df = pd.read_csv(fname, dtype=dtypes)

    with Timer("fill products NA"):
        for prod in products:
            filled = train_df[prod].fillna(0.0)
            train_df[prod] = filled.astype(np.int8)

    train_df, features = apply_transforms(train_df)

    # One (lag DataFrame, lag column names) pair per step, built in step order.
    lagged = [make_prev_df(train_df, step) for step in range(1,6)]
    prev_dfs = [lag_df for lag_df, _ in lagged]

    # Only the lag-1 and lag-2 product columns are used directly as features.
    for step in (1, 2):
        features += lagged[step - 1][1]

    prod_features = lagged[0][1]

    return train_df, prev_dfs, features, prod_features

# join the existing dataframe with the product columns of one lag dataframe (called once per lag, 1 to 5)
def join_with_prev(df, prev_df, how):
    """Merge ``prev_df`` into ``df`` on (ncodpers, int_date) and return the result.

    The two frames must share exactly the two key columns and nothing else.
    After the merge, every column that came from ``prev_df`` is cast to float16.
    Row counts before and after the merge are printed.

    Args:
        df: left DataFrame.
        prev_df: lag DataFrame to attach.
        how: merge type passed to ``DataFrame.merge``.
    """
    keys = ["ncodpers", "int_date"]
    with Timer("join %s" % how):
        left_cols = set(df.columns.values.tolist())
        right_cols = set(prev_df.columns.values.tolist())
        assert left_cols & right_cols == set(keys)

        print("before join", len(df))
        merged = df.merge(prev_df, on=keys, how=how)
        for lag_col in right_cols - set(keys):
            merged[lag_col] = merged[lag_col].astype(np.float16)
        print("after join", len(merged))
        return merged


def make_data():
    """Build the full feature table: transformed columns, lag columns and lag aggregates.

    Steps: load and transform the data, join the five lag tables, add the
    per-product standard deviation over lag windows (1-3, 1-5, 2-5) and the
    per-product min/max over lag windows (2-3, 2-5), then keep only the key,
    product and feature columns.

    Returns:
        ``(train_df, features, prod_features)`` where
        - train_df: the column-subsetted DataFrame covering train and test rows,
        - features: transform features + lag-1/lag-2 columns + the aggregates,
        - prod_features: the lag-1 product column names.
    """
    train_df, prev_dfs, features, prod_features = load_data()

    for i, prev_df in enumerate(prev_dfs):
        with Timer("join train with prev%s" % (i+1)):
            # The inner join on lag 1 drops every row that has no lag-1 record;
            # the left joins for lags 2..5 keep all remaining rows and leave NaN
            # where older history is missing.
            how = "inner" if i == 0 else "left"
            train_df = join_with_prev(train_df, prev_df, how=how)

    # Various aggregates to try
    # for prod in products:
    #     print()
    #     print(prod)
    #     #prev1_bin = (train_df[prod + "_prev1"] != 1).astype(np.int8)
    #     for begin, end in [(2,5),(1,4)]:
    #         prods = ["%s_prev%s" % (prod, i) for i in range(begin,end+1)]
    #         mp_df = train_df.as_matrix(columns=prods)
    #         print(prods)
    #
    #         stdf = "%s_std_%s_%s" % (prod,begin,end)
    #         train_df[stdf] = np.nanstd(mp_df, axis=1) #  * prev1_bin
    #
    #         maxf = "%s_max_%s_%s"%(prod,begin,end)
    #         train_df[maxf] = np.nanmax(mp_df, axis=1).astype(np.int8)
    #
    #         # minf = "%s_min_%s_%s"%(prod,begin,end)
    #         # train_df[minf] = np.nanmin(mp_df, axis=1).astype(np.int8)
    #
    #         chf = "%s_ch_%s_%s"%(prod,begin,end)
    #         train_df[chf] = np.sum(np.invert(np.isclose(mp_df[:,1:], mp_df[:,:-1], equal_nan=True)), axis=1, dtype=np.int8)
    #
    #         sumf = "%s_sum_%s_%s"%(prod,begin,end)
    #         train_df[sumf] = np.nansum(mp_df, axis=1, dtype=np.int8)
    #
    #         skewf = "%s_skew_%s_%s"%(prod,begin,end)
    #         train_df[skewf] = scipy.stats.skew(mp_df, axis=1)
    #
    #         features += (stdf,maxf,chf,sumf,skewf)

    # Standard deviation of each product over several lag windows.
    for prod in products:
        print()
        print(prod)
        for begin, end in [(1,3),(1,5),(2,5)]:
            prods = ["%s_prev%s" % (prod, i) for i in range(begin,end+1)]
            # DataFrame.as_matrix was removed in pandas 1.0; selecting the columns
            # and taking .values yields the same NumPy array on old and new pandas.
            mp_df = train_df[prods].values
            print(prods)

            stdf = "%s_std_%s_%s" % (prod,begin,end)
            train_df[stdf] = np.nanstd(mp_df, axis=1) #  * prev1_bin

            features += (stdf,)

    # Min and max of each product over lag windows that exclude lag 1.
    for prod in products:
        print()
        print(prod)
        for begin, end in [(2,3),(2,5)]:
            prods = ["%s_prev%s" % (prod, i) for i in range(begin,end+1)]
            mp_df = train_df[prods].values
            print(prods)

            # NOTE(review): lags 2..5 come from left joins, so a row can be all-NaN
            # here; nanmin/nanmax then return NaN and the int8 cast of NaN is not
            # well defined -- confirm this is acceptable for the model.
            minf = "%s_min_%s_%s"%(prod,begin,end)
            train_df[minf] = np.nanmin(mp_df, axis=1).astype(np.int8)

            maxf = "%s_max_%s_%s"%(prod,begin,end)
            train_df[maxf] = np.nanmax(mp_df, axis=1).astype(np.int8)

            features += (minf,maxf,)

    with Timer("Remove unused columns"):
        leave_columns = ["ncodpers", "int_date", "fecha_dato"] + list(products) + list(features)
        # guard against a feature name being registered twice
        assert len(leave_columns) == len(set(leave_columns))
        train_df = train_df[leave_columns]

    return train_df, features, prod_features

# Run the whole feature pipeline and persist its outputs: the feature table as
# one pickle, and the (features, prod_features) name tuples as another.
all_df, features, prod_features = make_data()
with Timer("save data"):
    all_df.to_pickle("data/8th.feature_engineer.all.pkl")
    # Use a context manager so the metadata file is flushed and closed right away.
    with open("data/8th.feature_engineer.cv_meta.pkl", "wb") as meta_file:
        pickle.dump((features, prod_features), meta_file)

load train csv...


  exec(code_obj, self.user_global_ns, self.user_ns)


load train csv: cpu 63.94, time 63.94

fill products NA...
fill products NA: cpu 37.97, time 37.98

apply transforms...
assert_uniq renta_top (array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100], dtype=int8), array([11487211,  3022340,     5936,     1854,     1584,     1495,
           1444,     1416,     1256,     1218,     1114,     1076,
           1017,      952,      941,      899,      855,      844,
            823,      813,      756,      723,      718,      712,
            69

  keepdims=keepdims)



ind_aval_fin_ult1
['ind_aval_fin_ult1_prev1', 'ind_aval_fin_ult1_prev2', 'ind_aval_fin_ult1_prev3']
['ind_aval_fin_ult1_prev1', 'ind_aval_fin_ult1_prev2', 'ind_aval_fin_ult1_prev3', 'ind_aval_fin_ult1_prev4', 'ind_aval_fin_ult1_prev5']
['ind_aval_fin_ult1_prev2', 'ind_aval_fin_ult1_prev3', 'ind_aval_fin_ult1_prev4', 'ind_aval_fin_ult1_prev5']

ind_cco_fin_ult1
['ind_cco_fin_ult1_prev1', 'ind_cco_fin_ult1_prev2', 'ind_cco_fin_ult1_prev3']
['ind_cco_fin_ult1_prev1', 'ind_cco_fin_ult1_prev2', 'ind_cco_fin_ult1_prev3', 'ind_cco_fin_ult1_prev4', 'ind_cco_fin_ult1_prev5']
['ind_cco_fin_ult1_prev2', 'ind_cco_fin_ult1_prev3', 'ind_cco_fin_ult1_prev4', 'ind_cco_fin_ult1_prev5']

ind_cder_fin_ult1
['ind_cder_fin_ult1_prev1', 'ind_cder_fin_ult1_prev2', 'ind_cder_fin_ult1_prev3']
['ind_cder_fin_ult1_prev1', 'ind_cder_fin_ult1_prev2', 'ind_cder_fin_ult1_prev3', 'ind_cder_fin_ult1_prev4', 'ind_cder_fin_ult1_prev5']
['ind_cder_fin_ult1_prev2', 'ind_cder_fin_ult1_prev3', 'ind_cder_fin_ult1_prev4', 'i



['ind_ahor_fin_ult1_prev2', 'ind_ahor_fin_ult1_prev3', 'ind_ahor_fin_ult1_prev4', 'ind_ahor_fin_ult1_prev5']

ind_aval_fin_ult1
['ind_aval_fin_ult1_prev2', 'ind_aval_fin_ult1_prev3']
['ind_aval_fin_ult1_prev2', 'ind_aval_fin_ult1_prev3', 'ind_aval_fin_ult1_prev4', 'ind_aval_fin_ult1_prev5']

ind_cco_fin_ult1
['ind_cco_fin_ult1_prev2', 'ind_cco_fin_ult1_prev3']
['ind_cco_fin_ult1_prev2', 'ind_cco_fin_ult1_prev3', 'ind_cco_fin_ult1_prev4', 'ind_cco_fin_ult1_prev5']

ind_cder_fin_ult1
['ind_cder_fin_ult1_prev2', 'ind_cder_fin_ult1_prev3']
['ind_cder_fin_ult1_prev2', 'ind_cder_fin_ult1_prev3', 'ind_cder_fin_ult1_prev4', 'ind_cder_fin_ult1_prev5']

ind_cno_fin_ult1
['ind_cno_fin_ult1_prev2', 'ind_cno_fin_ult1_prev3']
['ind_cno_fin_ult1_prev2', 'ind_cno_fin_ult1_prev3', 'ind_cno_fin_ult1_prev4', 'ind_cno_fin_ult1_prev5']

ind_ctju_fin_ult1
['ind_ctju_fin_ult1_prev2', 'ind_ctju_fin_ult1_prev3']
['ind_ctju_fin_ult1_prev2', 'ind_ctju_fin_ult1_prev3', 'ind_ctju_fin_ult1_prev4', 'ind_ctju_fin_ult