In [1]:
import glob
import os
import pandas as pd
import numpy as np
import gc
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

In [2]:
# 指定したファイルパスのcsvファイルを全部読み込む関数
def read_csv_all(target_path = '../input/'):
    for file in glob.glob(target_path+'*.csv'):
        raw_name = 'raw_' + file.replace(target_path,'').replace('.csv','')
        exec('{} = pd.read_csv(\'{}\')'.format(raw_name, file), globals())
        print('read {} as {}'.format(file, raw_name))

In [3]:
def reduce_mem_usage(df, verbose=False):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_columns = df.select_dtypes(include=["int"]).columns
    float_columns = df.select_dtypes(include=["float"]).columns

    for col in int_columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")

    for col in float_columns:
        df[col] = pd.to_numeric(df[col], downcast="float")

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [4]:
read_csv_all()

read ../input/calendar.csv as raw_calendar
read ../input/sell_prices.csv as raw_sell_prices
read ../input/sales_train_validation.csv as raw_sales_train_validation
read ../input/sample_submission.csv as raw_sample_submission


In [5]:
NUM_ITEMS = raw_sales_train_validation.shape[0]  # 30490
DAYS_PRED = raw_sample_submission.shape[1] - 1  # 28

In [6]:
def encode_categorical(df, cols):
    for col in cols:
        # Leave NaN as it is.
        le = LabelEncoder()
        not_null = df[col][df[col].notnull()]
        df[col] = pd.Series(le.fit_transform(not_null), index=not_null.index)

    return df

In [7]:
calendar = encode_categorical(
    raw_calendar, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]
).pipe(reduce_mem_usage)

sales = encode_categorical(
    raw_sales_train_validation, ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
).pipe(reduce_mem_usage)
del raw_sales_train_validation

prices = encode_categorical(raw_sell_prices, ["item_id", "store_id"]).pipe(reduce_mem_usage)

In [9]:
submission = raw_sample_submission
del raw_sample_submission

In [10]:
sales.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [11]:
submission.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# salesをDMの形に変換
id_columns = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
sales = sales.melt(id_vars=id_columns, var_name="d", value_name="demand",)
sales = reduce_mem_usage(sales)

# get product table.
product = sales[id_columns]

In [15]:
# submissionをDMの形に変換

# separate test dataframes.
vals = submission[submission["id"].str.endswith("validation")]
evals = submission[submission["id"].str.endswith("evaluation")]

# change column names.
vals.columns = ["id"] + [f"d_{d}" for d in range(1914, 1914 + DAYS_PRED)]
evals.columns = ["id"] + [f"d_{d}" for d in range(1942, 1942 + DAYS_PRED)]

In [None]:
# merge with product table
evals["id"] = evals["id"].str.replace("_evaluation", "_validation")
vals = vals.merge(product, how="left", on="id")
evals = vals.merge(product, how="left", on="id")
evals["id"] = evals["id"].str.replace("_validation", "_evaluation")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
if verbose:
    print("validation")
    display(vals)

    print("evaluation")
    display(evals)

In [None]:
vals = vals.melt(id_vars=id_columns, var_name="d", value_name="demand")
evals = evals.melt(id_vars=id_columns, var_name="d", value_name="demand")