In [61]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import kaggle_util 
import matplotlib.pyplot as plt
from lightgbm_avito import calcImgAtt

# Russian region names (the values found in the `region` column) mapped to
# English names. merge_regional lower-cases the English name and uses it as
# the key into the `regional` table.
region_map = {
    "Свердловская область": "Sverdlovsk oblast",
    "Самарская область": "Samara oblast",
    "Ростовская область": "Rostov oblast",
    "Татарстан": "Tatarstan",
    "Волгоградская область": "Volgograd oblast",
    "Нижегородская область": "Nizhny Novgorod oblast",
    "Пермский край": "Perm Krai",
    "Оренбургская область": "Orenburg oblast",
    "Ханты-Мансийский АО": "Khanty-Mansi Autonomous Okrug",
    "Тюменская область": "Tyumen oblast",
    "Башкортостан": "Bashkortostan",
    "Краснодарский край": "Krasnodar Krai",
    "Новосибирская область": "Novosibirsk oblast",
    "Омская область": "Omsk oblast",
    "Белгородская область": "Belgorod oblast",
    "Челябинская область": "Chelyabinsk oblast",
    "Воронежская область": "Voronezh oblast",
    "Кемеровская область": "Kemerovo oblast",
    "Саратовская область": "Saratov oblast",
    "Владимирская область": "Vladimir oblast",
    "Калининградская область": "Kaliningrad oblast",
    "Красноярский край": "Krasnoyarsk Krai",
    "Ярославская область": "Yaroslavl oblast",
    "Удмуртия": "Udmurtia",
    "Алтайский край": "Altai Krai",
    "Иркутская область": "Irkutsk oblast",
    "Ставропольский край": "Stavropol Krai",
    "Тульская область": "Tula oblast",
}

In [69]:
def log_nozero(values, eps=0.001):
    """Natural logarithm of ``values`` shifted by a small positive offset.

    The offset keeps zero inputs finite: ``log(0 + eps)`` instead of ``-inf``.

    Parameters
    ----------
    values : scalar, numpy array, pandas Series/DataFrame, list or tuple
        Values to transform. Lists and tuples are converted to a float
        array first (adding a float to a plain list would raise TypeError).
    eps : float, default 0.001
        Offset added before taking the log. The default reproduces the
        constant that was previously hard-coded.

    Returns
    -------
    ``np.log(values + eps)``; pandas inputs keep their pandas type,
    lists/tuples come back as a numpy array.
    """
    if isinstance(values, (list, tuple)):
        values = np.asarray(values, dtype=float)
    return np.log(values + eps)

In [82]:
# Load the raw train and test tables from the shared input directory.
training = pd.read_csv('../input/train.csv')
testing = pd.read_csv('../input/test.csv')

In [83]:
# Per-user aggregates; merge_att joins them onto the ads by `user_id`.
train_features = pd.read_csv('../input/aggregated_features.csv')

# Region-level table keyed by its first CSV column. The index is lower-cased
# because merge_regional looks regions up by lower-cased name.
regional = pd.read_csv('../input/regional.csv', index_col=0)
regional.index = regional.index.str.lower()

# Per-image attributes; the listed columns are replaced by their
# log_nozero transform in one multi-column assignment.
imgatt = pd.read_csv('../input/df_imgatt.csv')
log_scaled_cols = ['whratio', 'area', 'laplacian', 'colorfull',
                   'brightness', 'median', 'rms', 'stddev']
imgatt[log_scaled_cols] = log_nozero(imgatt[log_scaled_cols])

In [84]:
def merge_att(df):
    """Left-join the per-user aggregates onto ``df`` and log-scale them.

    ``train_features`` is joined on ``user_id``. For each of
    ``avg_days_up_user``, ``avg_times_up_user`` and ``n_user_items`` the
    missing values are filled with 0 and the column is passed through
    ``log_nozero``.

    Returns the merged frame; ``df`` itself is not modified because
    ``DataFrame.merge`` builds a new frame.
    """
    merged = df.merge(train_features, how='left', on=['user_id'])
    for name in ('avg_days_up_user', 'avg_times_up_user', 'n_user_items'):
        # Rows without a match in train_features come out of the left join
        # as NaN; they are treated as 0 before the log transform.
        merged[name] = log_nozero(merged[name].fillna(0))
    return merged

In [85]:
def merge_regional(df):
    """Attach region-level statistics to ``df`` and return the enriched frame.

    The Russian names in ``df['region']`` are translated through
    ``region_map``, lower-cased, and used as keys into the ``regional``
    table (whose index is lower-cased when it is loaded).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``region`` column whose values are keys of
        ``region_map``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``region`` holds the lower-cased English
        name, plus the columns ``reg_dense`` (log of density), ``rural``,
        ``reg_Time_zone`` (integer code), ``reg_Population`` (log of total
        population) and ``reg_Urban``.

    Raises
    ------
    KeyError
        If a region is absent from ``region_map`` or from ``regional``.

    Notes
    -----
    * The input frame is no longer modified. Previously ``region`` was
      overwritten in place, so calling the function twice on the same frame
      failed with KeyError on the already-translated names.
    * Lookups use ``Series.map`` instead of one Python-level ``.loc`` call
      per row and per column.
    * The time-zone encoder is fitted on the whole ``regional`` table rather
      than on the values present in ``df``, so separate calls (e.g. train
      and test) always produce the same code for the same time zone.
    """
    df = df.copy()

    # Russian name -> lower-cased English key used by the `regional` index.
    lowered_names = {ru: en.lower() for ru, en in region_map.items()}
    region_key = df['region'].map(lowered_names)

    # Series.map yields NaN for unknown keys; keep the fail-fast behaviour
    # that direct dictionary indexing used to give.
    unmapped = region_key.isna()
    if unmapped.any():
        bad = df.loc[unmapped, 'region'].unique().tolist()
        raise KeyError("regions missing from region_map: {}".format(bad))

    unknown = sorted(set(region_key.unique()) - set(regional.index))
    if unknown:
        raise KeyError("regions missing from regional table: {}".format(unknown))

    df['region'] = region_key

    # One consistent encoding for every dataset passed through this function.
    le = LabelEncoder()
    le.fit(regional["Time_zone"].dropna())

    # Columns are created in the same order as before so the output layout
    # is unchanged.
    df["reg_dense"] = log_nozero(region_key.map(regional["Density_of_region(km2)"]))
    df["rural"] = region_key.map(regional["Rural_%"])
    df["reg_Time_zone"] = le.transform(region_key.map(regional["Time_zone"]))
    df["reg_Population"] = log_nozero(region_key.map(regional["Total_population"]))
    df["reg_Urban"] = region_key.map(regional["Urban%"])
    return df

In [86]:
def merge_imgatt(df):
    """Left-join the image attribute table onto ``df`` by ``image``.

    Every column contributed by ``imgatt`` (all of its columns except the
    ``image`` join key) has its missing values replaced with 0, so rows
    without a matching image carry 0 for each attribute.

    Returns the merged frame.
    """
    merged = df.merge(imgatt, how='left', on=['image'])
    attribute_cols = [name for name in imgatt.columns if name != 'image']
    for name in attribute_cols:
        merged[name] = merged[name].fillna(0)
    return merged

In [87]:
# Enrich the test set: user aggregates -> regional statistics -> image attributes.
att_testing = (
    testing
    .pipe(merge_att)
    .pipe(merge_regional)
    .pipe(merge_imgatt)
)

In [91]:
# Enrich the training set with the same three steps used for the test set.
att_training = (
    training
    .pipe(merge_att)
    .pipe(merge_regional)
    .pipe(merge_imgatt)
)

In [90]:
# Shrink the test frame's memory footprint (the recorded run below reports
# 143.53 MB -> 80.98 MB). NOTE(review): presumably done by downcasting
# dtypes; confirm in kaggle_util.reduce_mem_usage.
att_testing = att_testing.pipe(kaggle_util.reduce_mem_usage)

 58%|█████▊    | 21/36 [00:00<00:00, 207.79it/s]

Memory usage of dataframe is 143.53 MB


100%|██████████| 36/36 [00:00<00:00, 151.36it/s]

Memory usage after optimization is: 80.98 MB
Decreased by 43.6%





In [92]:
# Shrink the training frame's memory footprint (the recorded run below
# reports 435.87 MB -> 242.31 MB). NOTE(review): presumably done by
# downcasting dtypes; confirm in kaggle_util.reduce_mem_usage.
att_training = att_training.pipe(kaggle_util.reduce_mem_usage)

 57%|█████▋    | 21/37 [00:00<00:00, 188.06it/s]

Memory usage of dataframe is 435.87 MB


100%|██████████| 37/37 [00:00<00:00, 79.86it/s] 

Memory usage after optimization is: 242.31 MB
Decreased by 44.4%





In [93]:
# Persist both enriched frames next to the raw inputs; index=False keeps the
# row index out of the files.
att_testing.to_csv(path_or_buf='../input/att_test.csv', index=False)
att_training.to_csv(path_or_buf='../input/att_train.csv', index=False)