# Test creation notebook

In [1]:
import sys
sys.path.append("../../main/datasets/")
import numpy as np
import pandas as pd
from utils import read_data, process_time, merge_data, promo_detector, promotionAggregation
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import xgboost as xgb
from datetime import datetime

# Lag count; not referenced by the cells visible in this part of the notebook
# — presumably consumed by lag-feature code further down (TODO confirm).
NUMBER_OF_LAGS = 4


# List the dataset directory to see which files are available.
!ls  ../../main/datasets/

1.0v.zip


## Preparing our dataset
These steps were already covered in the `../pre-processing-features` notebooks.

In [2]:
# Read the three raw tables from the dataset directory, then print their
# shapes as a quick sanity check.
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", *(frame.shape for frame in (infos, items, orders)))

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [3]:
# Pipeline from the raw orders to the weekly table `df`, in this order:
#   1. process_time is called only for its effect on `orders` (its return
#      value is discarded) — presumably it rewrites the time columns in
#      place; TODO confirm against utils.
#   2. promo_detector returns the frame that replaces `orders` — presumably
#      with a promotion feature added; verify against utils.
#   3. promotionAggregation combines `orders` with `items` into `df` —
#      presumably one row per item per week; verify against utils.
# NOTE(review): `orders` is rebound here, so re-running this cell feeds the
# already-processed frame back into process_time / promo_detector.
process_time(orders)
orders = promo_detector(orders)
df = promotionAggregation(orders, items)

In [4]:
def prepareOrders(orders, items, weeks=None):
    """Return a copy of ``orders`` padded with zero-sales rows.

    Two kinds of rows are added:

    * For every item that appears in ``orders``: one row for each week found
      in ``orders['group_backwards']`` in which that item has no row. The
      item attributes (``customerRating``, ``category1``-``category3``,
      ``recommendedRetailPrice``, ``manufacturer``, ``brand``) are copied
      from the item's first row in ``orders``.
    * For every item of ``items`` that never appears in ``orders``: one row
      for each value in ``weeks``, carrying the columns of ``items``.

    Every added row has ``orderSum``, ``salesPrice_mean`` and
    ``promotion_mean`` set to 0.

    THIS IS NOT MODULARIZED: the column names are hard-coded, so adapt the
    code if your dataset has different features.

    Parameters
    ----------
    orders : pandas.DataFrame
        Aggregated orders. Must contain ``itemID``, ``group_backwards`` and
        the attribute columns listed above. It is not modified.
    items : pandas.DataFrame
        Item catalogue; must contain ``itemID``.
    weeks : iterable, optional
        ``group_backwards`` values to create for never-sold items.
        Defaults to ``range(1, 14)`` (weeks 1 through 13).

    Returns
    -------
    pandas.DataFrame
        ``orders`` plus the added rows, sorted by ``group_backwards``
        (descending) then ``itemID`` (ascending), with a fresh 0..n-1 index.
    """
    if weeks is None:
        weeks = range(1, 14)

    df = orders.copy()

    # isin() is a membership test, so the ids do not need to be sorted first.
    not_sold_items = items[~items.itemID.isin(orders['itemID'].unique())]

    weeks_database = orders['group_backwards'].unique()
    new_rows = []

    # groupby walks the frame once; filtering `df` per itemID would rescan
    # the whole frame for every item.
    for idd, orders_id in df.groupby('itemID', sort=False):
        # The item's first row supplies its static attributes.
        example = orders_id.iloc[0]

        # Weeks present in the data but missing for this item.
        weeks_id = orders_id['group_backwards'].unique()
        weeks_without_id = np.setdiff1d(weeks_database, weeks_id)

        for w in weeks_without_id:
            new_rows.append({'itemID': idd,
                             'group_backwards': w,
                             'salesPrice_mean': 0,
                             'customerRating': example['customerRating'],
                             'category1': example['category1'],
                             'category2': example['category2'],
                             'category3': example['category3'],
                             'recommendedRetailPrice': example['recommendedRetailPrice'],
                             'orderSum': 0,
                             'manufacturer': example['manufacturer'],
                             'brand': example['brand'],
                             'promotion_mean': 0
                             })

    # DataFrame.append was removed in pandas 2.0; concat is the supported
    # replacement. Nothing to add when no item has a missing week.
    if new_rows:
        df = pd.concat([df, pd.DataFrame(new_rows)], axis=0)

    # One zero-sales copy of the unsold items per requested week, collected
    # first and concatenated once (growing a frame inside the loop re-copies
    # it on every iteration).
    not_sold_orders = [
        not_sold_items.assign(group_backwards=week,
                              salesPrice_mean=0,
                              promotion_mean=0,
                              orderSum=0)
        for week in weeks
    ]

    return pd.concat([df] + not_sold_orders, axis=0).sort_values(
        ['group_backwards', 'itemID'], ascending=[False, True], ignore_index=True)

In [7]:
# Pad the weekly table with zero-sales rows using the helper defined above.
# NOTE(review): the name `test` is reassigned by a later cell that builds the
# unsold-item rows by hand, so this result does not survive a full run.
test = prepareOrders(df, items)

In [4]:
# Catalogue items with no order at all. isin() is a membership test, so the
# ids are passed as-is instead of being sorted first.
not_sold_items = items[~items['itemID'].isin(orders['itemID'].unique())]

In [6]:
# Zero-fill the weeks in which an already-sold item has no row in `df`.
new_rows = []
weeks_database = orders['group_backwards'].unique()

# groupby walks the frame once; filtering `df` per itemID would rescan the
# whole frame for every item.
for idd, orders_id in df.groupby('itemID', sort=False):
    # The item's first row supplies its static attributes.
    example = orders_id.iloc[0]

    # Weeks present in the data but missing for this item.
    weeks_id = orders_id['group_backwards'].unique()
    weeks_without_id = np.setdiff1d(weeks_database, weeks_id)

    # One zero-sales row per missing week.
    for w in weeks_without_id:
        new_rows.append({'itemID': idd,
                         'group_backwards': w,
                         'salesPrice_mean': 0,
                         'customerRating': example['customerRating'],
                         'category1': example['category1'],
                         'category2': example['category2'],
                         'category3': example['category3'],
                         'recommendedRetailPrice': example['recommendedRetailPrice'],
                         'orderSum': 0,
                         'manufacturer': example['manufacturer'],
                         'brand': example['brand'],
                         'promotion_mean': 0
                         })

# DataFrame.append was removed in pandas 2.0; concat is the supported
# replacement. Skipping it when nothing is missing keeps the cell re-runnable.
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], axis=0)

In [7]:
# df.sort_values(['group_backwards', 'itemID'], ascending=[False, True],ignore_index=True)

In [8]:
# Zero-sales rows for the never-sold items: one copy of `not_sold_items` per
# week 1..13. The copies are collected first and concatenated once, instead
# of re-copying a growing frame on every iteration.
test = pd.concat(
    [
        not_sold_items.assign(
            group_backwards=i,
            salesPrice_mean=0,
            promotion_mean=0,
            orderSum=0,
        )
        for i in range(1, 14)
    ],
    axis=0,
)
test

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,group_backwards,salesPrice_mean,promotion_mean,orderSum
57,58,0,17,0.0,1,1,1,326.30,1,0,0,0
106,107,0,8,5.0,1,4,1,18.90,1,0,0,0
124,125,0,23,0.0,1,1,1,9.60,1,0,0,0
125,126,0,23,0.0,1,1,1,13.08,1,0,0,0
724,725,0,29,0.0,1,5,1,81.78,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10454,10455,241,251,0.0,8,44,8,1037.50,13,0,0,0
10455,10456,274,251,0.0,8,44,8,1400.10,13,0,0,0
10456,10457,274,251,0.0,8,44,8,1000.05,13,0,0,0
10457,10458,0,253,0.0,8,44,8,77.40,13,0,0,0


In [9]:
# Stack the unsold-item rows under the sold-item rows, then order the table
# by week (descending) and item id (ascending) with a fresh 0..n-1 index.
df = (
    pd.concat([df, test], axis=0)
    .sort_values(
        by=['group_backwards', 'itemID'],
        ascending=[False, True],
        ignore_index=True,
    )
)

In [10]:
# Spot-check: all weekly rows of item 8 after the padding steps.
df[df['itemID'] == 8]

Unnamed: 0,group_backwards,itemID,orderSum,promotion_mean,salesPrice_mean,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
7,13,8,0,0.0,0.0,0.0,4.0,5.0,1.0,1.0,1.0,14.13
10470,12,8,2,0.0,5.33,0.0,4.0,5.0,1.0,1.0,1.0,14.13
20933,11,8,0,0.0,0.0,0.0,4.0,5.0,1.0,1.0,1.0,14.13
31396,10,8,1,0.0,5.33,0.0,4.0,5.0,1.0,1.0,1.0,14.13
41859,9,8,0,0.0,0.0,0.0,4.0,5.0,1.0,1.0,1.0,14.13
52322,8,8,0,0.0,0.0,0.0,4.0,5.0,1.0,1.0,1.0,14.13
62785,7,8,100,0.011236,5.326067,0.0,4.0,5.0,1.0,1.0,1.0,14.13
73248,6,8,26,1.0,4.2888,0.0,4.0,5.0,1.0,1.0,1.0,14.13
83711,5,8,0,0.0,0.0,0.0,4.0,5.0,1.0,1.0,1.0,14.13
94174,4,8,94,1.0,4.98,0.0,4.0,5.0,1.0,1.0,1.0,14.13
