In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tailor

In [2]:
# Load the dataset through the project-local `tailor` package; the cells below inspect this object.
data_raw = tailor.load_data()

In [3]:
# Preview the first rows to see which columns exist and what their values look like.
data_raw.head()

Unnamed: 0,article_id,time_on_sale,original_price,discount,markdown,sells_price,stock_total,avq,article_count,revenue,brand,color,Abteilung,WHG,WUG,month,season
0,900001,0,59.95,0.64,0.0,59.31,1499,0.72,22,1304.9,Fimmilena,mittelbraun,Abteilung005,WHG021,WUG073,March,Spring
1,900001,1,59.95,0.14,0.0,59.81,1499,3.1,39,2332.61,Fimmilena,mittelbraun,Abteilung005,WHG021,WUG073,March,Spring
2,900001,2,59.95,0.84,0.0,59.11,1499,5.3,32,1891.4,Fimmilena,mittelbraun,Abteilung005,WHG021,WUG073,March,Spring
3,900001,3,59.95,0.81,0.0,59.14,1499,7.2,27,1596.69,Fimmilena,mittelbraun,Abteilung005,WHG021,WUG073,March,Spring
4,900001,4,59.95,0.35,0.0,59.6,1499,9.34,34,2026.3,Fimmilena,mittelbraun,Abteilung005,WHG021,WUG073,March,Spring


In [4]:
# Count missing values per column (isna is the same check as isnull); all zeros means no gaps.
data_raw.isna().sum()

article_id        0
time_on_sale      0
original_price    0
discount          0
markdown          0
sells_price       0
stock_total       0
avq               0
article_count     0
revenue           0
brand             0
color             0
Abteilung         0
WHG               0
WUG               0
month             0
season            0
dtype: int64

In [5]:
# Column-wise minimum; for the text columns the recorded output shows the alphabetically first value.
# The recorded output has negative minima for discount (-30) and markdown (-10) and a minimum
# of 0 for original_price and sells_price -- worth checking whether those rows are valid.
data_raw.min()

article_id              900001
time_on_sale                 0
original_price               0
discount                   -30
markdown                   -10
sells_price                  0
stock_total                  0
avq                          0
article_count                0
revenue                      0
brand               Abiamarcae
color                    beige
Abteilung         Abteilung001
WHG                     WHG001
WUG                     WUG001
month                    April
season                    Fall
dtype: object

In [6]:
# Column-wise maximum; for the text columns the recorded output shows the alphabetically last value.
# The recorded output shows avq peaking slightly above 100 (100.12) and article_id ending at 908708.
data_raw.max()

article_id                     908708
time_on_sale                       25
original_price                 179.95
discount                        81.98
markdown                           90
sells_price                    179.95
stock_total                     36396
avq                            100.12
article_count                    1093
revenue                       38577.9
brand             Þorgerðr Holgabrúðr
color                 weiss / schwarz
Abteilung                Abteilung007
WHG                            WHG043
WUG                            WUG152
month                       September
season                         Winter
dtype: object

In [7]:
# Number of distinct article ids. If the ids are contiguous this equals max - min + 1
# (908708 - 900001 + 1 = 8708 for the min/max values shown above).
# nunique() counts distinct values directly; counting the non-zero entries of unique()
# would silently miss an id equal to 0.
data_raw.article_id.nunique()

8708

In [8]:
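# Note: describe() is not very informative for date columns (transaction_date, markdown_start_date, markdown_end_date).
# None of these appear among the columns of data_raw shown above, so this note may be left over from an earlier version of the data -- TODO confirm.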

In [9]:
# Show floats with two decimals; without this describe() prints x.xxxxxxxxxe+xx.
# This is a global pandas display option, so it also applies to the cells that follow.
pd.set_option("display.float_format", "{:.2f}".format)
# In the describe() output the "50%" row is the median.
data_raw["original_price"].describe()

count   226408.00
mean        50.52
std         24.63
min          0.00
25%         34.95
50%         49.95
75%         64.95
max        179.95
Name: original_price, dtype: float64

In [10]:
# Summary statistics of the selling price (its recorded maximum, 179.95, equals that of original_price).
data_raw["sells_price"].describe()

count   226408.00
mean        43.05
std         22.37
min          0.00
25%         28.48
50%         39.95
75%         58.45
max        179.95
Name: sells_price, dtype: float64

In [11]:
# Summary statistics of discount; the recorded output shows a negative minimum (-30).
data_raw["discount"].describe()

count   226408.00
mean         2.75
std          5.18
min        -30.00
25%          0.15
50%          0.80
75%          3.00
max         81.98
Name: discount, dtype: float64

In [12]:
# Summary statistics of markdown; in the recorded output the median is 0 (at least half of
# the rows have a markdown of 0 or less) and the minimum is negative (-10).
data_raw["markdown"].describe()

count   226408.00
mean         4.73
std          9.33
min        -10.00
25%          0.00
50%          0.00
75%          9.00
max         90.00
Name: markdown, dtype: float64

In [13]:
# Summary statistics of article_count; strongly right-skewed in the recorded output (median 23, max 1093).
data_raw["article_count"].describe()

count   226408.00
mean        42.57
std         55.23
min          0.00
25%          9.00
50%         23.00
75%         55.00
max       1093.00
Name: article_count, dtype: float64

In [14]:
# Summary statistics of revenue; right-skewed in the recorded output (median ~939, max ~38578).
data_raw["revenue"].describe()

count   226408.00
mean      1585.38
std       1948.76
min          0.00
25%        391.48
50%        938.83
75%       2033.73
max      38577.86
Name: revenue, dtype: float64

In [15]:
# stock_total contains extreme outliers: in the recorded output the maximum (36396) is
# more than 13 times the 75th percentile (2688).
data_raw["stock_total"].describe()

count   226408.00
mean      1893.77
std       1893.05
min          0.00
25%        684.00
50%       1412.00
75%       2688.00
max      36396.00
Name: stock_total, dtype: float64

In [16]:
# The maximum of avq is above 100% (100.12 in the recorded output). This is probably an
# accumulated rounding error, but the values are present in the data nonetheless.
data_raw["avq"].describe()

count   226408.00
mean        28.78
std         23.85
min          0.00
25%          7.56
50%         24.24
75%         45.44
max        100.12
Name: avq, dtype: float64

In [17]:
# Summary statistics of time_on_sale; the recorded output spans 0..25 with mean and median both 12.5.
# 226408 rows / 8708 article ids = 26, which would fit one row per article and period -- TODO confirm.
data_raw["time_on_sale"].describe()

count   226408.00
mean        12.50
std          7.50
min          0.00
25%          6.00
50%         12.50
75%         19.00
max         25.00
Name: time_on_sale, dtype: float64

In [18]:
# Distinct season labels (a categorical column according to the recorded output).
data_raw["season"].unique()

[Spring, Winter, Summer, Fall]
Categories (4, object): [Spring, Winter, Summer, Fall]

In [19]:
# Rows per season, most frequent first; Fall has noticeably fewer rows in the recorded output.
data_raw["season"].value_counts()

Winter    61750
Spring    61620
Summer    58500
Fall      44538
Name: season, dtype: int64

In [20]:
# Distinct brand labels (75 categories in the recorded output).
data_raw["brand"].unique()

[Fimmilena, Gersimi, Loki, Turstuahenae, Börr, ..., Tamfana, Grusduahenae, Hludana, Axsinginehae, Surt]
Length: 75
Categories (75, object): [Fimmilena, Gersimi, Loki, Turstuahenae, ..., Grusduahenae, Hludana, Axsinginehae, Surt]

In [21]:
# Rows per brand, most frequent first.
data_raw["brand"].value_counts()

Friagabis                15392
Fimmilena                15340
Odin                     11934
Mercurius Arvernus       11674
Gautr                     9906
Freyr                     9880
Turstuahenae              8684
Mani                      8034
Gna                       7904
Travalaha                 7852
Heimdall                  6682
Alaisiagae                6682
Gersimi                   6578
Snotra                    6448
Tyr                       6006
Hercules Deusoniensis     5720
Lodur                     5278
Nersihenae                5148
Baudihillia               4914
Burorina                  4602
Tuisto                    4446
Beda                      4446
Loki                      4212
Hymir                     3640
Almaviahenae              3328
Kolga                     3328
Mercurius Hranno          3094
Alaferhviae               2600
Siofna                    2574
Uller                     2314
                         ...  
Abiamarcae                 780
Hermodr 

In [22]:
# Distinct color labels (40 categories in the recorded output; the labels are German).
data_raw["color"].unique()

[mittelbraun, weiss / blau, mittelgrau, mittelblau, pink, ..., multicolor, gelb, bordeauxrot, rost, ockergelb]
Length: 40
Categories (40, object): [mittelbraun, weiss / blau, mittelgrau, mittelblau, ..., gelb, bordeauxrot, rost, ockergelb]

In [23]:
# Rows per color, most frequent first.
data_raw["color"].value_counts()

schwarz                 48308
dunkelblau              24310
mittelbraun             17238
schwarz / kombiniert    17004
mittelgrau              13936
dunkelbraun             13156
dunkelgrau              11648
mittelblau               9542
beige                    8918
hellgrau                 6786
rosa                     5148
pink                     4992
weiss / kombiniert       4472
weiss                    4212
bordeauxrot              3224
hellbraun                3016
camel                    2730
rot                      2392
beige / kombiniert       2366
silber                   2054
lila                     1976
hellblau                 1872
offwhite                 1872
türkis                   1820
multicolor               1742
gold                     1612
khaki                    1222
graublau                 1196
olivegrün                1170
gelb                      988
mintgrün                  884
orange                    858
hellgrün                  702
bronze/kup

In [24]:
# Sorted list of the distinct Abteilung (German: "department") codes.
sorted(data_raw["Abteilung"].unique())

['Abteilung001',
 'Abteilung002',
 'Abteilung003',
 'Abteilung004',
 'Abteilung005',
 'Abteilung006',
 'Abteilung007']

In [25]:
# Rows per Abteilung code, most frequent first; Abteilung002 dominates in the recorded output.
data_raw["Abteilung"].value_counts()

Abteilung002    100672
Abteilung006     38766
Abteilung005     37960
Abteilung007     36296
Abteilung004      6864
Abteilung001      5148
Abteilung003       702
Name: Abteilung, dtype: int64

In [26]:
# Sorted list of the distinct WHG codes (WHG001..WHG043 in the recorded output).
sorted(data_raw["WHG"].unique())

['WHG001',
 'WHG002',
 'WHG003',
 'WHG004',
 'WHG005',
 'WHG006',
 'WHG007',
 'WHG008',
 'WHG009',
 'WHG010',
 'WHG011',
 'WHG012',
 'WHG013',
 'WHG014',
 'WHG015',
 'WHG016',
 'WHG017',
 'WHG018',
 'WHG019',
 'WHG020',
 'WHG021',
 'WHG022',
 'WHG023',
 'WHG024',
 'WHG025',
 'WHG026',
 'WHG027',
 'WHG028',
 'WHG029',
 'WHG030',
 'WHG031',
 'WHG032',
 'WHG033',
 'WHG034',
 'WHG035',
 'WHG036',
 'WHG037',
 'WHG038',
 'WHG039',
 'WHG040',
 'WHG041',
 'WHG042',
 'WHG043']

In [27]:
# Rows per WHG code, most frequent first.
data_raw["WHG"].value_counts()

WHG015    24752
WHG021    20072
WHG012    15990
WHG007    15704
WHG041    13676
WHG034    13416
WHG010    12714
WHG009    12454
WHG042    12376
WHG043     8268
WHG006     8008
WHG022     7150
WHG038     6968
WHG023     5980
WHG032     5304
WHG035     5122
WHG005     3718
WHG008     3666
WHG036     3458
WHG028     3276
WHG001     2652
WHG026     2392
WHG014     1950
WHG037     1768
WHG025     1352
WHG019     1352
WHG020     1352
WHG002     1326
WHG039     1248
WHG033     1248
WHG003     1170
WHG027     1092
WHG013     1014
WHG031      832
WHG040      728
WHG030      650
WHG018      572
WHG017      494
WHG011      442
WHG004      260
WHG016      208
WHG024      130
WHG029      104
Name: WHG, dtype: int64

In [28]:
# Sorted list of the distinct WUG codes.
sorted(data_raw["WUG"].unique())

['WUG001',
 'WUG002',
 'WUG003',
 'WUG004',
 'WUG005',
 'WUG006',
 'WUG007',
 'WUG008',
 'WUG009',
 'WUG010',
 'WUG011',
 'WUG012',
 'WUG013',
 'WUG014',
 'WUG015',
 'WUG016',
 'WUG017',
 'WUG018',
 'WUG019',
 'WUG020',
 'WUG021',
 'WUG022',
 'WUG023',
 'WUG024',
 'WUG025',
 'WUG026',
 'WUG027',
 'WUG028',
 'WUG029',
 'WUG030',
 'WUG031',
 'WUG032',
 'WUG033',
 'WUG034',
 'WUG035',
 'WUG036',
 'WUG037',
 'WUG038',
 'WUG039',
 'WUG040',
 'WUG041',
 'WUG042',
 'WUG043',
 'WUG044',
 'WUG045',
 'WUG046',
 'WUG047',
 'WUG048',
 'WUG049',
 'WUG050',
 'WUG051',
 'WUG052',
 'WUG053',
 'WUG054',
 'WUG055',
 'WUG056',
 'WUG057',
 'WUG058',
 'WUG059',
 'WUG060',
 'WUG061',
 'WUG062',
 'WUG063',
 'WUG064',
 'WUG065',
 'WUG066',
 'WUG067',
 'WUG068',
 'WUG069',
 'WUG070',
 'WUG071',
 'WUG072',
 'WUG073',
 'WUG074',
 'WUG075',
 'WUG076',
 'WUG077',
 'WUG078',
 'WUG079',
 'WUG080',
 'WUG081',
 'WUG082',
 'WUG083',
 'WUG084',
 'WUG085',
 'WUG086',
 'WUG087',
 'WUG088',
 'WUG089',
 'WUG090',
 'WUG091',

In [29]:
# Rows per WUG code, most frequent first (152 distinct codes in the recorded output).
data_raw["WUG"].value_counts()

WUG020    9958
WUG053    9932
WUG051    8632
WUG033    7176
WUG029    7176
WUG015    6942
WUG139    6630
WUG073    6058
WUG136    5980
WUG138    5746
WUG040    5668
WUG045    4992
WUG137    4732
WUG069    4706
WUG146    4628
WUG072    4108
WUG018    3744
WUG115    3588
WUG105    3510
WUG140    2964
WUG077    2808
WUG107    2756
WUG046    2626
WUG117    2626
WUG035    2548
WUG055    2522
WUG001    2496
WUG114    2392
WUG118    2236
WUG070    2210
          ... 
WUG006     208
WUG057     208
WUG060     182
WUG103     182
WUG002     156
WUG124     156
WUG087     130
WUG122     104
WUG008     104
WUG009     104
WUG101     104
WUG061     104
WUG091      78
WUG142      78
WUG065      78
WUG032      78
WUG044      78
WUG047      52
WUG007      52
WUG096      52
WUG024      52
WUG039      26
WUG098      26
WUG012      26
WUG123      26
WUG095      26
WUG028      26
WUG075      26
WUG133      26
WUG004      26
Name: WUG, Length: 152, dtype: int64