In [75]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [76]:
from markdown_predictions.parse_data import LoadSalesData

In [77]:
load_data_sales = LoadSalesData.load_in_files("raw_data")

In [78]:
df = load_data_sales.sales_data

### Dropping all lines without a product reference

In [79]:
df = df[df.reference_PRE.notnull()]

In [80]:
df.tail(1)

Unnamed: 0,delivery_PRE,window_display_PRE,target_PRE,gender_PRE,product_category_PRE,family_PRE,sub_family_PRE,macro_category_PRE,reference_name_PRE,reference_PRE,...,cum_quantity_sold_POST,num_sizes_POST,num_stores_POST,first_week_sale_POST,rate_of_sale_POST,cum_sellthrough_POST,warehouse_stock_POST,zero_stock_POST,avail_warehouse_stock_POST,%_POST
12929,17 SELECTIF VET HIVER,JOUR FEMME P0,3-ADULTE,FE,JOUR,FA MAILLOTS DE BAIN VET,MAILLOTS DE BAIN FILLE VET,Adulte Jour,LUDO,2638017.0,...,1,2,2,2018-S16,0.02,100 %,0,0,0,0%


In [81]:
df.columns[df.dtypes==object]

Index(['delivery_PRE', 'window_display_PRE', 'target_PRE', 'gender_PRE',
       'product_category_PRE', 'family_PRE', 'sub_family_PRE',
       'macro_category_PRE', 'reference_name_PRE', 'reference_PRE',
       'description_PRE', 'color_PRE', 'material_PRE', 'seasonality_PRE',
       'price_PRE', 'turnover_PRE', 'sub_target_PRE', 'turnover_w_sub1_PRE',
       'turnover_w_sub2_PRE', 'turnover_w_sub3_PRE', 'quantity_sold_PRE',
       'quantity_sold_sub1_PRE', 'quantity_sold_sub2_PRE',
       'quantity_sold_sub3_PRE', 'discount_rate_PRE', 'discount_rate_sub1_PRE',
       'store_stock_PRE', 'stock_transit_PRE', 'total_store_stock_PRE',
       'weekly_cover_PRE', 'cum_turnover_PRE', 'cum_discount_rate_PRE',
       'cum_quantity_sold_PRE', 'num_sizes_PRE', 'num_stores_PRE',
       'first_week_sale_PRE', 'rate_of_sale_PRE', 'cum_sellthrough_PRE',
       'warehouse_stock_PRE', 'zero_stock_PRE', 'avail_warehouse_stock_PRE',
       '%_PRE', 'season_PRE', 'reference_POST', 'seasonality_POST',
   

In [82]:
df.turnover_w_sub1_PRE

0             0  €
1        16,998  €
2        15,372  €
3        15,834  €
4        13,387  €
           ...    
12925       287  €
12926         0  €
12927     1,215  €
12928       253  €
12929         0  €
Name: turnover_w_sub1_PRE, Length: 12924, dtype: object

In [83]:
df.isnull().sum()

delivery_PRE                  0
window_display_PRE            0
target_PRE                    0
gender_PRE                    0
product_category_PRE          0
                             ..
cum_sellthrough_POST          0
warehouse_stock_POST          0
zero_stock_POST               0
avail_warehouse_stock_POST    0
%_POST                        0
Length: 74, dtype: int64

In [84]:
df.dtypes

delivery_PRE                  object
window_display_PRE            object
target_PRE                    object
gender_PRE                    object
product_category_PRE          object
                               ...  
cum_sellthrough_POST          object
warehouse_stock_POST          object
zero_stock_POST                int64
avail_warehouse_stock_POST    object
%_POST                        object
Length: 74, dtype: object

In [85]:
df_new = df.apply(lambda x: str(x).replace("%", ""))

### Getting rid of euro sing and space

In [86]:
euro_col = [col for col, data in df.items() if "€" in str(data[0])]

In [87]:
df[euro_col] = df[euro_col].replace({r' +\€':''}, regex = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### Getting rid of commas in the DataFrame

In [88]:
df[euro_col] = df[euro_col].replace(',','', regex=True)

### Getting rid of %

In [89]:
#pct_col = [col for col, data in df.items() if "%" in str(data[0])]

In [90]:
#pct_col

In [91]:
#df[pct_col] = df[pct_col].replace({r'([ +]?)%':''}, regex = True)

In [92]:
df.replace({r'([ +]?)%':''}, regex = True)

Unnamed: 0,delivery_PRE,window_display_PRE,target_PRE,gender_PRE,product_category_PRE,family_PRE,sub_family_PRE,macro_category_PRE,reference_name_PRE,reference_PRE,...,cum_quantity_sold_POST,num_sizes_POST,num_stores_POST,first_week_sale_POST,rate_of_sale_POST,cum_sellthrough_POST,warehouse_stock_POST,zero_stock_POST,avail_warehouse_stock_POST,%_POST
0,PERM SELECTIF EARLY SEASON,PERM S VET EARLY ADF_PERMANENT,3-ADULTE,FE,JOUR,FA GROSSES PIECES VET,"CIRES, COUPE VENT VET",Adulte Jour,BISTRAL,4942401.0,...,566,7,125,2018-S50,0.15,99,0,0,0,0
1,AH2018 SELECTIF EARLY SEASON,18 S VET EARLY AD_JOUR_MARIN D,3-ADULTE,FE,JOUR,FA GROSSES PIECES VET,"CIRES, COUPE VENT VET",Adulte Jour,TYLORETTE,4417801.0,...,3054,6,137,2018-S30,0.74,100,0,0,0,0
2,AH2018 SELECTIF HIGH SEASON,18 S VET HIGH AD_MAILLE ENCHAN,3-ADULTE,FE,JOUR,FA ROBES VET,ROBES ML VET,Adulte Jour,TULLETOILE,4663401.0,...,1066,5,136,2018-S45,0.49,60,1,0,0,30
3,AH2018 SELECTIF HIGH SEASON,18 S VET HIGH AD_JOUR HIVER CH,3-ADULTE,FE,JOUR,FA CACHCOEUR+CARD.+GILET J VET,VESTES LEGERES,Adulte Jour,TUBICOUR,4450301.0,...,2065,5,136,2018-S40,0.89,96,-1,0,0,30
4,AH2018 SELECTIF HIGH SEASON,18 S VET HIGH AD_JOUR HIVER CH,3-ADULTE,FE,JOUR,FA ROBES VET,ROBES ML VET,Adulte Jour,TWIGGY,4550401.0,...,1676,5,136,2018-S40,0.56,89,1,0,0,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12925,18 SELECTIF VET ETE,BB COUCHE P1,1-BEBE,MI,JOUR,FA GROSSES PIECES VET,VESTES EPAISSES VET,Bébé couché,MAIBLE,4330484.0,...,434,4,35,2018-S01,0.46,86,0,0,0,50
12926,18 SELECTIF ETE SPECIFIQUE,CCS02 CADEAUX BEBE COUCHE,1-BEBE,MI,NUIT,FA COLIS ET LOTS VET,LOT NUIT VET,Bébé couché,-,4382799.0,...,12,5,1,2018-S04,0.44,100,0,0,0,0
12927,18 SELECTIF ETE SPECIFIQUE,CCS34 ICONICOLOR CIRES ADULTE,3-ADULTE,FE,JOUR,FA GROSSES PIECES VET,"CIRES, COUPE VENT VET",Adulte Jour,MONTH,2775017.0,...,135,6,10,2018-S03,0.50,99,0,0,0,0
12928,18 SELECTIF ETE SPECIFIQUE,CCS05 ROBE POLORAMA,3-ADULTE,FE,JOUR,FA ROBES VET,ROBES MC VET,Adulte Jour,-,4412277.0,...,97,5,1,2018-S13,6.06,17,149,0,149,40


In [93]:
df[df.discount_rate_PRE == '-']

Unnamed: 0,delivery_PRE,window_display_PRE,target_PRE,gender_PRE,product_category_PRE,family_PRE,sub_family_PRE,macro_category_PRE,reference_name_PRE,reference_PRE,...,cum_quantity_sold_POST,num_sizes_POST,num_stores_POST,first_week_sale_POST,rate_of_sale_POST,cum_sellthrough_POST,warehouse_stock_POST,zero_stock_POST,avail_warehouse_stock_POST,%_POST
1810,AH2018 SELECTIF EARLY SEASON,18 S VET EARLY JRBB-RDC GRAPHI,1-BEBE,MI,AISV,FA TEE SHIRTS S/V,TEE SHIRTS ML S/V,Bébé Debout,LOT 2 TSML,4646991.0,...,1,0,0,2018-S36,-,50 %,0,0,0,0%
1811,PERM SELECTIF EARLY SEASON,PERM S SVET EARLY PERM SV,2-ENFANT,FE,SS VET,FA CULOTTES S/V,CULOTTES S/V,Enfant Sous-Vêt,3 CULOTTES,4901000.0,...,0,0,0,-,-,-,0,0,0,0%
1812,17 SELECTIF VET ETE,BB F P4,1-BEBE,FE,JOUR,FA MAILLOTS DE BAIN VET,MAILLOTS DE BAIN FILLE VET,Bébé Debout,FLAVIO,2204760.0,...,0,0,0,-,-,-,0,0,0,0%
1813,17 SELECTIF SVT ETE,G PT LIGNE ICONIQUE P1,2-ENFANT,MA,SS VET,FA DEBARDEURS S/V,DEBARDEURS S/V,Enfant Sous-Vêt,-,2373400.0,...,0,0,0,-,-,-,0,0,0,0%
1814,PERM SELECTIF EARLY SEASON,PERM S SVET EARLY PERM SV,1-BEBE,MI,SS VET,FA BODIES SM,SF BODIES SM,Bébé Sous-Vêt,LOT X3 BOD,4879800.0,...,0,0,0,-,-,-,0,0,0,0%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12909,17 SELECTIF VET ETE,BB G P1,1-BEBE,MA,JOUR,FA SALOPETTES VET,SALOPETTES COURTES VET,Bébé Debout,FRACAS,2281282.0,...,3,1,3,2018-S08,0.04,100 %,0,0,0,0%
12910,17 SELECTIF SVT ETE,SOUS VETEMENT FEMME P0,3-ADULTE,FE,SS VET,FA PANTALON S/V,PANTALON S/V,Adulte Sous-Vêt,-,2229717.0,...,1,1,1,2018-S04,0.04,100 %,0,0,0,0%
12911,17 SELECTIF VET ETE,NUIT PTE FILLE P2,2-ENFANT,FE,NUIT,FA PYJAMAS VET,PYJACOURTS FILLE VET,Enfant Nuit,FIABLE,2257563.0,...,1,1,1,2018-S24,0.04,7 %,260,0,260,20%
12912,17 SELECTIF VET HIVER,BB G P0,1-BEBE,MA,JOUR,FA PANTALONS JOUR VET,PANTALONS JOUR VET,Bébé Debout,LIEMO,2663928.0,...,0,2,2,2018-S17,0.00,0 %,529,0,529,50%


### Looking for ..

In [94]:
df.first_week_sale_POST

0        2018-S50
1        2018-S30
2        2018-S45
3        2018-S40
4        2018-S40
           ...   
12925    2018-S01
12926    2018-S04
12927    2018-S03
12928    2018-S13
12929    2018-S16
Name: first_week_sale_POST, Length: 12924, dtype: object

In [95]:
for col in df.columns:
    try:
        df[col] = df[col].apply(lambda x: x.replace(',', ''))
        df[col] = df[col].astype(float)
    except ValueError:
        print (f'leaving this {col} column as string')

leaving this delivery_PRE column as string
leaving this window_display_PRE column as string
leaving this target_PRE column as string
leaving this gender_PRE column as string
leaving this product_category_PRE column as string
leaving this family_PRE column as string
leaving this sub_family_PRE column as string
leaving this macro_category_PRE column as string
leaving this reference_name_PRE column as string


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(lambda x: x.replace(',', ''))


AttributeError: 'float' object has no attribute 'replace'

if number in range 50-26 count from number until 26 (there are 52 weeks)
if number in range 27-49 count from number until 49

In [96]:
df['first_week_sale_POST']

0        2018-S50
1        2018-S30
2        2018-S45
3        2018-S40
4        2018-S40
           ...   
12925    2018-S01
12926    2018-S04
12927    2018-S03
12928    2018-S13
12929    2018-S16
Name: first_week_sale_POST, Length: 12924, dtype: object

In [97]:
df['week']=df.first_week_sale_POST.str[-2:]
#df['week']=pd.to_numeric(df.week, errors='coerce', downcast='integer')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['week']=df.first_week_sale_POST.str[-2:]


In [101]:
winter=list(range(27,50))
summer=list(range(50,54))+list(range(1,27))

In [112]:
winter

[27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49]

In [102]:
def calculate_weeks(start_week):
    if start_week == '-':
        return 0
    if int(start_week) in summer:
        return len(summer)- summer.index(int(start_week))
        
    return len(winter)- winter.index(int(start_week))

In [103]:
df['weeks_sold_full_price']=df.week.map(calculate_weeks)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['weeks_sold_full_price']=df.week.map(calculate_weeks)


In [106]:
df.columns

Index(['delivery_PRE', 'window_display_PRE', 'target_PRE', 'gender_PRE',
       'product_category_PRE', 'family_PRE', 'sub_family_PRE',
       'macro_category_PRE', 'reference_name_PRE', 'reference_PRE',
       'description_PRE', 'color_PRE', 'material_PRE', 'seasonality_PRE',
       'price_PRE', 'weekly_rank_PRE', 'turnover_PRE', 'sub_target_PRE',
       'turnover_w_sub1_PRE', 'turnover_w_sub2_PRE', 'turnover_w_sub3_PRE',
       'quantity_sold_PRE', 'quantity_sold_sub1_PRE', 'quantity_sold_sub2_PRE',
       'quantity_sold_sub3_PRE', 'discount_rate_PRE', 'discount_rate_sub1_PRE',
       'store_stock_PRE', 'stock_transit_PRE', 'total_store_stock_PRE',
       'weekly_cover_PRE', 'cum_turnover_PRE', 'cum_discount_rate_PRE',
       'cum_quantity_sold_PRE', 'num_sizes_PRE', 'num_stores_PRE',
       'first_week_sale_PRE', 'rate_of_sale_PRE', 'cum_sellthrough_PRE',
       'warehouse_stock_PRE', 'zero_stock_PRE', 'avail_warehouse_stock_PRE',
       '%_PRE', 'season_PRE', 'reference_POST', 'sea

In [104]:
df.weeks_sold_full_price

0        30
1        20
2         5
3        10
4        10
         ..
12925    26
12926    23
12927    24
12928    14
12929    11
Name: weeks_sold_full_price, Length: 12924, dtype: int64

In [110]:
df['avg_sales_full_price'] = df.cum_quantity_sold_POST - df.quantity_sold_POST - df.quantity_sold_sub1_POST / df.weeks_sold_full_price

TypeError: unsupported operand type(s) for -: 'str' and 'str'