In [71]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [72]:
from markdown_predictions.parse_data import LoadSalesData

In [73]:
load_data_sales = LoadSalesData.load_in_files("raw_data")

In [74]:
df = load_data_sales.sales_data

### Dropping all lines without a product reference

In [75]:
df = df[df.reference_PRE.notnull()]

In [76]:
df.tail(1)

Unnamed: 0,delivery_PRE,window_display_PRE,target_PRE,gender_PRE,product_category_PRE,family_PRE,sub_family_PRE,macro_category_PRE,reference_name_PRE,reference_PRE,...,cum_quantity_sold_POST,num_sizes_POST,num_stores_POST,first_week_sale_POST,rate_of_sale_POST,cum_sellthrough_POST,warehouse_stock_POST,zero_stock_POST,avail_warehouse_stock_POST,%_POST
12929,17 SELECTIF VET HIVER,JOUR FEMME P0,3-ADULTE,FE,JOUR,FA MAILLOTS DE BAIN VET,MAILLOTS DE BAIN FILLE VET,Adulte Jour,LUDO,2638017.0,...,1,2,2,2018-S16,0.02,100 %,0,0,0,0%


In [77]:
df.columns[df.dtypes==object]

Index(['delivery_PRE', 'window_display_PRE', 'target_PRE', 'gender_PRE',
       'product_category_PRE', 'family_PRE', 'sub_family_PRE',
       'macro_category_PRE', 'reference_name_PRE', 'reference_PRE',
       'description_PRE', 'color_PRE', 'material_PRE', 'seasonality_PRE',
       'price_PRE', 'turnover_PRE', 'sub_target_PRE', 'turnover_w_sub1_PRE',
       'turnover_w_sub2_PRE', 'turnover_w_sub3_PRE', 'quantity_sold_PRE',
       'quantity_sold_sub1_PRE', 'quantity_sold_sub2_PRE',
       'quantity_sold_sub3_PRE', 'discount_rate_PRE', 'discount_rate_sub1_PRE',
       'store_stock_PRE', 'stock_transit_PRE', 'total_store_stock_PRE',
       'weekly_cover_PRE', 'cum_turnover_PRE', 'cum_discount_rate_PRE',
       'cum_quantity_sold_PRE', 'num_sizes_PRE', 'num_stores_PRE',
       'first_week_sale_PRE', 'rate_of_sale_PRE', 'cum_sellthrough_PRE',
       'warehouse_stock_PRE', 'zero_stock_PRE', 'avail_warehouse_stock_PRE',
       '%_PRE', 'season_PRE', 'reference_POST', 'seasonality_POST',
   

In [78]:
df.turnover_w_sub1_PRE

0             0  €
1        16,998  €
2        15,372  €
3        15,834  €
4        13,387  €
           ...    
12925       287  €
12926         0  €
12927     1,215  €
12928       253  €
12929         0  €
Name: turnover_w_sub1_PRE, Length: 12924, dtype: object

In [79]:
df.isnull().sum()

delivery_PRE                  0
window_display_PRE            0
target_PRE                    0
gender_PRE                    0
product_category_PRE          0
                             ..
cum_sellthrough_POST          0
warehouse_stock_POST          0
zero_stock_POST               0
avail_warehouse_stock_POST    0
%_POST                        0
Length: 74, dtype: int64

### Getting rid of euro sing and space

In [80]:
euro_col = [col for col, data in df.items() if "€" in str(data[0])]

In [81]:
df[euro_col] = df[euro_col].replace({r' +\€':''}, regex = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### Getting rid of commas in the DataFrame

In [82]:
df[euro_col] = df[euro_col].replace(',','', regex=True)

### Getting rid of %

In [85]:
pct_col = [col for col, data in df.items() if "%" in str(data[0])]

In [89]:
pct_col

['discount_rate_PRE',
 'discount_rate_sub1_PRE',
 'cum_discount_rate_PRE',
 'cum_sellthrough_PRE',
 '%_PRE',
 'cum_discount_rate_POST',
 'cum_sellthrough_POST',
 '%_POST']

In [90]:
df[pct_col] = df[pct_col].replace({r'([ +]?)%':''}, regex = True)

In [98]:
df[df.discount_rate_PRE == '-']

Unnamed: 0,delivery_PRE,window_display_PRE,target_PRE,gender_PRE,product_category_PRE,family_PRE,sub_family_PRE,macro_category_PRE,reference_name_PRE,reference_PRE,...,cum_quantity_sold_POST,num_sizes_POST,num_stores_POST,first_week_sale_POST,rate_of_sale_POST,cum_sellthrough_POST,warehouse_stock_POST,zero_stock_POST,avail_warehouse_stock_POST,%_POST
1810,AH2018 SELECTIF EARLY SEASON,18 S VET EARLY JRBB-RDC GRAPHI,1-BEBE,MI,AISV,FA TEE SHIRTS S/V,TEE SHIRTS ML S/V,Bébé Debout,LOT 2 TSML,4646991.0,...,1,0.0,0.0,2018-S36,-,50,0,0.0,0,0.0
1811,PERM SELECTIF EARLY SEASON,PERM S SVET EARLY PERM SV,2-ENFANT,FE,SS VET,FA CULOTTES S/V,CULOTTES S/V,Enfant Sous-Vêt,3 CULOTTES,4901000.0,...,0,0.0,0.0,-,-,-,0,0.0,0,0.0
1812,17 SELECTIF VET ETE,BB F P4,1-BEBE,FE,JOUR,FA MAILLOTS DE BAIN VET,MAILLOTS DE BAIN FILLE VET,Bébé Debout,FLAVIO,2204760.0,...,0,0.0,0.0,-,-,-,0,0.0,0,0.0
1813,17 SELECTIF SVT ETE,G PT LIGNE ICONIQUE P1,2-ENFANT,MA,SS VET,FA DEBARDEURS S/V,DEBARDEURS S/V,Enfant Sous-Vêt,-,2373400.0,...,0,0.0,0.0,-,-,-,0,0.0,0,0.0
1814,PERM SELECTIF EARLY SEASON,PERM S SVET EARLY PERM SV,1-BEBE,MI,SS VET,FA BODIES SM,SF BODIES SM,Bébé Sous-Vêt,LOT X3 BOD,4879800.0,...,0,0.0,0.0,-,-,-,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12909,17 SELECTIF VET ETE,BB G P1,1-BEBE,MA,JOUR,FA SALOPETTES VET,SALOPETTES COURTES VET,Bébé Debout,FRACAS,2281282.0,...,3,1.0,3.0,2018-S08,0.04,100,0,0.0,0,0.0
12910,17 SELECTIF SVT ETE,SOUS VETEMENT FEMME P0,3-ADULTE,FE,SS VET,FA PANTALON S/V,PANTALON S/V,Adulte Sous-Vêt,-,2229717.0,...,1,1.0,1.0,2018-S04,0.04,100,0,0.0,0,0.0
12911,17 SELECTIF VET ETE,NUIT PTE FILLE P2,2-ENFANT,FE,NUIT,FA PYJAMAS VET,PYJACOURTS FILLE VET,Enfant Nuit,FIABLE,2257563.0,...,1,1.0,1.0,2018-S24,0.04,7,260,0.0,260,20.0
12912,17 SELECTIF VET HIVER,BB G P0,1-BEBE,MA,JOUR,FA PANTALONS JOUR VET,PANTALONS JOUR VET,Bébé Debout,LIEMO,2663928.0,...,0,2.0,2.0,2018-S17,0.00,0,529,0.0,529,50.0


In [91]:
for col in df.columns:
    try:
        df[col] = df[col].astype(float)
    except ValueError:
        print (f'leaving this {col} column as string')

leaving this delivery_PRE column as string
leaving this window_display_PRE column as string
leaving this target_PRE column as string
leaving this gender_PRE column as string
leaving this product_category_PRE column as string
leaving this family_PRE column as string
leaving this sub_family_PRE column as string
leaving this macro_category_PRE column as string
leaving this reference_name_PRE column as string
leaving this reference_PRE column as string
leaving this description_PRE column as string
leaving this color_PRE column as string
leaving this material_PRE column as string
leaving this seasonality_PRE column as string
leaving this price_PRE column as string
leaving this sub_target_PRE column as string
leaving this discount_rate_PRE column as string
leaving this discount_rate_sub1_PRE column as string
leaving this store_stock_PRE column as string
leaving this stock_transit_PRE column as string
leaving this total_store_stock_PRE column as string
leaving this weekly_cover_PRE column as 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(float)
