In [6]:
import math
import datetime
import pandas as pd
import numpy as np
import time
from statistics import median
from scipy.stats import iqr

import timeit
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
data_path = '../data/'

In [9]:
items = pd.read_csv('{}items.csv'.format(data_path), sep='|')
infos = pd.read_csv('{}infos.csv'.format(data_path), sep='|')
orders = pd.read_csv('{}orders0206_train.csv'.format(data_path), sep='|')


# Extract additional features on the time-dimension

In [4]:
orderss=orders.head(200)



orderss

Unnamed: 0,time,transactID,itemID,order,salesPrice
0,2018-01-01 00:01:56,2278968,450,1,17.42
1,2018-01-01 00:01:56,2278968,83,1,5.19
2,2018-01-01 00:07:11,2255797,7851,2,20.47
3,2018-01-01 00:09:24,2278968,450,1,17.42
4,2018-01-01 00:09:24,2278968,83,1,5.19
5,2018-01-01 00:39:26,2257125,9375,1,31.02
6,2018-01-01 00:51:59,2278968,450,1,17.42
7,2018-01-01 00:51:59,2278968,83,1,5.19
8,2018-01-01 00:51:59,2278968,19,1,77.64
9,2018-01-01 00:51:59,2278968,297,1,43.53


In [10]:
start = time.clock()

orders['daytime'] = pd.to_datetime(orders["time"])

# Month of the year
orders['month'] = pd.DatetimeIndex(orders['daytime']).month

# Day of the month
orders['day'] = pd.DatetimeIndex(orders['daytime']).day

# Weekday: Starts with 0 = Monday, 6 = Sunday
orders['weekday'] = pd.DatetimeIndex(orders['daytime']).weekday

# Just the date, no time
orders['time'] = pd.DatetimeIndex(orders['daytime']).time

# Hour of the day
orders['hour'] = pd.DatetimeIndex(orders['daytime']).hour

# Calendar week
orders['calendar_week'] = pd.DatetimeIndex(orders['daytime']).week


end = time.clock()
print(end - start)


1.4493960000000001


In [11]:
ex_items = items.copy()
ex_infos = infos.copy()
ex_orders = orders.copy()

# Feature generation: Price deviations

- Diff Sales vs Recommended Price

Item A: 60x für 30€, 30x für 10€

Features:
- Highest Price
- Units Sold with highest price
- Lowest Price
- Units Sold with highest price

In [12]:
joined = pd.merge(left=pd.merge(left=items, right=infos, on='itemID'), right=orders, on='itemID')

In [13]:
joined.head(5)

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,simulationPrice,promotion,time,transactID,order,salesPrice,daytime,month,day,weekday,hour,calendar_week
0,1,0,1,4.38,1,1,1,8.84,3.43,,16:46:03,2261889,1,3.11,2018-01-23 16:46:03,1,23,1,16,4
1,1,0,1,4.38,1,1,1,8.84,3.43,,13:32:34,2263523,1,3.11,2018-01-25 13:32:34,1,25,3,13,4
2,1,0,1,4.38,1,1,1,8.84,3.43,,11:06:00,162329,1,3.11,2018-01-29 11:06:00,1,29,0,11,5
3,1,0,1,4.38,1,1,1,8.84,3.43,,11:06:01,162330,1,3.11,2018-01-29 11:06:01,1,29,0,11,5
4,1,0,1,4.38,1,1,1,8.84,3.43,,11:06:01,162331,1,3.11,2018-01-29 11:06:01,1,29,0,11,5


In [16]:
joined['orders_priceDifference'] = joined['salesPrice'] - joined['recommendedRetailPrice']
joined['orders_relPriceDifference'] = joined['orders_priceDifference'] / joined['recommendedRetailPrice']

In [17]:
ex_orders['priceDifference'] = joined['orders_priceDifference']
ex_orders['relPriceDifference'] = joined['orders_relPriceDifference']

In [19]:
grouped_orders = orders.groupby("itemID")

In [34]:
#Minimal Price of an Item
if 'minSalesPrices' not in list(ex_items.columns):
    minprices = grouped_orders.min()["salesPrice"].rename('minSalesPrice')
    ex_items = pd.merge(left=ex_items, right=minprices, on='itemID')

In [36]:
#Maximal Price of an Item
if 'maxSalesPrices' not in list(ex_items.columns):
    maxprices = grouped_orders.max()["salesPrice"].rename('maxSalesPrice')
    ex_items = pd.merge(left=ex_items, right=maxprices, on='itemID')

In [37]:
#Mean Price of an Item
if 'meanSalesPrices' not in list(ex_items.columns):
    meanprices = grouped_orders.mean()["salesPrice"].rename('meanSalesPrice')
    ex_items = pd.merge(left=ex_items, right=meanprices, on='itemID')

In [38]:
ex_items.head()

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,minSalesPrice,maxSalesPrice,meanSalesPrice
0,1,0,1,4.38,1,1,1,8.84,3.11,3.11,3.11
1,2,0,2,3.0,1,2,1,16.92,9.15,9.15,9.15
2,3,0,3,5.0,1,3,1,15.89,9.89,14.04,11.918371
3,4,0,2,4.44,1,2,1,40.17,13.01,13.01,13.01
4,5,0,2,2.33,1,1,1,17.04,7.48,7.84,7.740377


In [39]:
counts = grouped_orders["salesPrice"].value_counts()

In [40]:
df = pd.merge(left=ex_items, right=counts, left_on=['itemID', 'minSalesPrice'], right_on=['itemID', 'salesPrice'])

ValueError: 'salesPrice' is both an index level and a column label, which is ambiguous.

In [77]:
counts = grouped_orders["salesPrice"].value_counts()
print(counts)

ex_items["items_minPriceCount"]= ex_items.apply(lambda row: counts[row["itemID"], row["items_minPrice"]])
ex_items["items_maxPriceCount"]= ex_items.apply(lambda row: counts[row["itemID"], row["items_maxPrice"]])



itemID  salesPrice
1       3.11           575
2       9.15             5
3       9.89            91
        14.04           87
4       13.01           50
5       7.84           115
        7.48            44
6       17.25          224
        13.81            1
7       34.39            1
8       5.33            91
        4.98            82
        4.26            24
9       199.84          10
10      25.69          312
        24.17           64
11      2.13           201
12      2.11           123
13      11.60          276
        10.83           24
14      29.67            4
15      716.60          20
        708.78           8
        706.85           7
        712.69           7
        714.65           7
        722.48           7
16      159.61          27
17      26.38          170
18      7.21           248
19      79.68          131
        77.64           81
20      605.04           4
21      5.44             6
22      37.38           46
23      31.58           63
24      1

KeyError: ('itemID', 'occurred at index itemID')

In [75]:
ex_orders.head(20)

Unnamed: 0,time,transactID,itemID,order,salesPrice,month,day,weekday,date,hour,calendar_week
0,2018-01-01 00:01:56,2278968,450,1,17.42,1,1,0,00:01:56,0,1
1,2018-01-01 00:01:56,2278968,83,1,5.19,1,1,0,00:01:56,0,1
2,2018-01-01 00:07:11,2255797,7851,2,20.47,1,1,0,00:07:11,0,1
3,2018-01-01 00:09:24,2278968,450,1,17.42,1,1,0,00:09:24,0,1
4,2018-01-01 00:09:24,2278968,83,1,5.19,1,1,0,00:09:24,0,1
5,2018-01-01 00:39:26,2257125,9375,1,31.02,1,1,0,00:39:26,0,1
6,2018-01-01 00:51:59,2278968,450,1,17.42,1,1,0,00:51:59,0,1
7,2018-01-01 00:51:59,2278968,83,1,5.19,1,1,0,00:51:59,0,1
8,2018-01-01 00:51:59,2278968,19,1,77.64,1,1,0,00:51:59,0,1
9,2018-01-01 00:51:59,2278968,297,1,43.53,1,1,0,00:51:59,0,1


In [78]:
ex_items.head(20)

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,salesPrice
0,1,0,1,4.38,1,1,1,8.84,3.11
1,2,0,2,3.0,1,2,1,16.92,9.15
2,3,0,3,5.0,1,3,1,15.89,9.89
3,4,0,2,4.44,1,2,1,40.17,13.01
4,5,0,2,2.33,1,1,1,17.04,7.48
5,6,0,2,4.2,1,2,1,20.9,13.81
6,7,0,3,4.0,1,3,1,26.4,34.39
7,8,0,4,5.0,1,1,1,14.13,4.26
8,9,0,5,5.0,1,1,1,165.06,199.84
9,10,0,4,5.0,1,1,1,19.43,24.17


# Sales


- number of unites sold
- % of Overall revenue

In [79]:
ex_orders.to_csv('{}ex_orders.csv'.format(data_path), index=False, sep='|')
ex_items.to_csv('{}ex_info.csv'.format(data_path), index=False, sep='|')
ex_infos.to_csv('{}ex_infos.csv'.format(data_path), index=False, sep='|')