In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from pathlib import Path

# Notebook-wide display settings: never hide columns of wide frames, and
# render every float with two decimal places.
pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:.2f}'.format)

# Locations of the raw CSV inputs, relative to the notebook's working directory.
# Despite the `_dir` suffix, each name is bound to the path of a single file.
(
    train_data_dir,
    stores_data_dir,
    transactions_data_dir,
    holidays_events_data_dir,
    oil_data_dir,
) = (
    Path(csv_location)
    for csv_location in (
        '../data/original/train.csv',
        '../data/original/stores.csv',
        '../data/original/transactions.csv',
        '../data/original/holidays_events.csv',
        '../data/original/oil.csv',
    )
)


# Column -> dtype overrides handed to pd.read_csv for the training file.
training_datatype_map = dict(
    store_nbr='uint8',
    family='category',
    sales='float32',
    onpromotion='uint64',
)

# Column -> dtype overrides handed to pd.read_csv for the stores file.
stores_datatype_map = dict(cluster='int8')

# Read each raw CSV into its own frame. Wherever a `date` column is read it is
# parsed to datetimes; the train and stores files also get explicit dtypes.
train_df = pd.read_csv(train_data_dir, dtype=training_datatype_map, parse_dates=['date'])
stores_df = pd.read_csv(stores_data_dir, dtype=stores_datatype_map)

# Transactions are ordered by date, then by store number within each date.
transactions_df = (
    pd.read_csv(transactions_data_dir, parse_dates=['date'])
    .sort_values(by=['date', 'store_nbr'])
)

holidays_events_df = pd.read_csv(holidays_events_data_dir, parse_dates=['date'])
oil_df = pd.read_csv(oil_data_dir, parse_dates=['date'])

In [3]:
# Day index: whole days elapsed between each row's date and the earliest date
# in the training data (so rows on the first date get 0).
# Series.min() is vectorised and skips missing dates, whereas the builtin
# min() walks the column element by element in Python and yields NaT when the
# first value is missing.
first_date = train_df['date'].min()
train_df['time'] = (train_df['date'] - first_date).dt.days

# Preview of the first 6000 rows; the recorded output shows it abbreviated
# with an ellipsis row rather than printed in full.
train_df.head(6000)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,time
0,0,2013-01-01,1,AUTOMOTIVE,0.00,0,0
1,1,2013-01-01,1,BABY CARE,0.00,0,0
2,2,2013-01-01,1,BEAUTY,0.00,0,0
3,3,2013-01-01,1,BEVERAGES,0.00,0,0
4,4,2013-01-01,1,BOOKS,0.00,0,0
...,...,...,...,...,...,...,...
5995,5995,2013-01-04,27,"LIQUOR,WINE,BEER",58.00,0,3
5996,5996,2013-01-04,27,MAGAZINES,0.00,0,3
5997,5997,2013-01-04,27,MEATS,149.36,0,3
5998,5998,2013-01-04,27,PERSONAL CARE,114.00,0,3


In [6]:
# One-hot encode the product family: one boolean indicator column per category.
train_df['family'].pipe(pd.get_dummies)

Unnamed: 0,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,EGGS,FROZEN FOODS,GROCERY I,GROCERY II,HARDWARE,HOME AND KITCHEN I,HOME AND KITCHEN II,HOME APPLIANCES,HOME CARE,LADIESWEAR,LAWN AND GARDEN,LINGERIE,"LIQUOR,WINE,BEER",MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000883,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
3000884,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3000885,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
3000886,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False


In [8]:
# Widen the training frame by appending the one-hot family indicator columns
# side by side (column-wise concatenation on the shared row index).
df = pd.concat(
    objs=[train_df, train_df['family'].pipe(pd.get_dummies)],
    axis='columns',
)