In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tailor

In [2]:
# Load the raw dataset via the project-local `tailor` helper.
# NOTE(review): presumably returns a pandas DataFrame (later cells call
# DataFrame methods such as .head() and .isnull() on it) — confirm in tailor.load_data.
data_raw = tailor.load_data()

In [3]:
# Preview the first five rows to see which columns exist and how values are formatted.
data_raw.head(n=5)

Unnamed: 0,article_id,season,brand,color,Abteilung,WHG,WUG,time_on_sale,original_price,sells_price,discount,markdown,article_count,stock_total,avq,revenue
0,900001,Sommer,Fimmilena,mittelbraun,Abteilung005,WHG021,WUG073,0,59.95,58.883333,1.066667,0.0,3.666667,1499.0,0.722704,217.483333
1,900001,Sommer,Fimmilena,mittelbraun,Abteilung005,WHG021,WUG073,1,59.95,59.755714,0.194286,0.0,5.571429,1499.0,3.097303,333.23
2,900001,Sommer,Fimmilena,mittelbraun,Abteilung005,WHG021,WUG073,2,59.95,58.95,1.0,0.0,5.333333,1499.0,5.303536,315.233333
3,900001,Sommer,Fimmilena,mittelbraun,Abteilung005,WHG021,WUG073,3,59.95,58.785,1.165,0.0,6.75,1499.0,7.204803,399.1725
4,900001,Sommer,Fimmilena,mittelbraun,Abteilung005,WHG021,WUG073,4,59.95,59.75,0.2,0.0,5.666667,1499.0,9.33956,337.716667


In [4]:
# Count missing values per column; all zeros means the frame has no NaN cells.
data_raw.isna().sum()

article_id        0
season            0
brand             0
color             0
Abteilung         0
WHG               0
WUG               0
time_on_sale      0
original_price    0
sells_price       0
discount          0
markdown          0
article_count     0
stock_total       0
avq               0
revenue           0
dtype: int64

In [5]:
# Column-wise minimum. In the recorded output sells_price, discount, markdown
# and revenue all have negative minima, which deserves a closer look.
data_raw.min(axis=0)

article_id              900001
season                  Sommer
brand               Abiamarcae
color                    beige
Abteilung         Abteilung001
WHG                     WHG001
WUG                     WUG001
time_on_sale                 0
original_price            9.95
sells_price             -32.01
discount                   -30
markdown                   -10
article_count                1
stock_total                106
avq                          0
revenue                 -32.01
dtype: object

In [6]:
# Column-wise maximum. In the recorded output avq tops out slightly above 100
# and stock_total reaches 36396.
data_raw.max(axis=0)

article_id                     908708
season                         Winter
brand             Þorgerðr Holgabrúðr
color                 weiss / schwarz
Abteilung                Abteilung007
WHG                            WHG043
WUG                            WUG152
time_on_sale                       25
original_price                 179.95
sells_price                    179.95
discount                        88.95
markdown                           90
article_count                 156.143
stock_total                     36396
avq                           100.121
revenue                       5511.12
dtype: object

In [7]:
# Sanity check: if the article ids are contiguous, the number of distinct ids
# equals max - min + 1 (908708 - 900001 + 1 = 8708 for the min/max shown above).
# nunique() counts distinct values directly. The previous
# np.count_nonzero(unique()) counted *non-zero* unique values and would have
# silently under-counted had an id of 0 existed.
data_raw.article_id.nunique()

8708

In [8]:
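# describe() is of little use for transaction_date, markdown_start_date and markdown_end_date.
# NOTE(review): none of these columns appear in the data_raw.head() output above;
# this remark may be left over from an earlier version of the dataset — confirm or remove.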

In [9]:
# Render floats with two decimals; otherwise describe() falls back to
# scientific notation (x.xxxxxxxxxe+xx), which is hard to read.
pd.set_option("display.float_format", "{:.2f}".format)
# Reminder: the "50%" row of describe() is the median.
data_raw["original_price"].describe()

count   216580.00
mean        52.81
std         22.65
min          9.95
25%         39.95
50%         49.95
75%         69.95
max        179.95
Name: original_price, dtype: float64

In [10]:
# Summary statistics of the selling price. The recorded output has a negative
# minimum (-32.01) — TODO: find out whether these rows are returns or data errors.
data_raw["sells_price"].describe()

count   216580.00
mean        45.17
std         20.89
min        -32.01
25%         29.48
50%         42.29
75%         59.03
max        179.95
Name: sells_price, dtype: float64

In [11]:
# Summary statistics of the discount. The recorded output ranges from -30.00 to
# 88.95 while the median is only 0.84, so the distribution has long tails.
data_raw["discount"].describe()

count   216580.00
mean         2.70
std          5.08
min        -30.00
25%          0.19
50%          0.84
75%          3.04
max         88.95
Name: discount, dtype: float64

In [12]:
# Summary statistics of the markdown. In the recorded output the 25% and 50%
# quantiles are both 0.00 (at least half of the rows carry no markdown) and the
# minimum is negative (-10.00).
data_raw["markdown"].describe()

count   216580.00
mean         4.94
std          9.49
min        -10.00
25%          0.00
50%          0.00
75%         10.00
max         90.00
Name: markdown, dtype: float64

In [13]:
# Summary statistics of article_count. Values are fractional (e.g. max 156.14),
# so this is presumably an averaged count rather than a raw tally — TODO confirm.
data_raw["article_count"].describe()

count   216580.00
mean         7.49
std          8.58
min          1.00
25%          2.20
50%          4.40
75%          9.50
max        156.14
Name: article_count, dtype: float64

In [14]:
# Summary statistics of revenue. The recorded minimum (-32.01) equals the
# minimum of sells_price, so the negative values likely share one cause — TODO confirm.
data_raw["revenue"].describe()

count   216580.00
mean       282.86
std        299.06
min        -32.01
25%         99.92
50%        182.05
75%        350.95
max       5511.12
Name: revenue, dtype: float64

In [15]:
# stock_total has extreme outliers: in the recorded output the maximum (36396)
# is more than ten times the 75% quantile (2749).
data_raw["stock_total"].describe()

count   216580.00
mean      1979.82
std       1891.05
min        106.00
25%        773.00
50%       1504.00
75%       2749.00
max      36396.00
Name: stock_total, dtype: float64

In [16]:
# The maximum of avq is slightly above 100% (100.12 in the recorded output).
# This is probably an accumulated rounding error, but it is present in the data.
data_raw["avq"].describe()

count   216580.00
mean        30.09
std         23.56
min          0.00
25%          9.41
50%         25.91
75%         46.54
max        100.12
Name: avq, dtype: float64

In [17]:
# Summary statistics of time_on_sale; the recorded output spans 0 to 25 with a median of 12.
data_raw["time_on_sale"].describe()

count   216580.00
mean        12.17
std          7.38
min          0.00
25%          6.00
50%         12.00
75%         18.00
max         25.00
Name: time_on_sale, dtype: float64

In [18]:
# Distinct season labels present in the data.
data_raw["season"].unique()

[Sommer, Winter]
Categories (2, object): [Sommer, Winter]

In [19]:
# Number of rows per season, to see how balanced the two seasons are.
data_raw["season"].value_counts()

Sommer    116633
Winter     99947
Name: season, dtype: int64

In [20]:
# Distinct brand labels (75 categories in the recorded output).
data_raw["brand"].unique()

[Fimmilena, Gersimi, Loki, Turstuahenae, Börr, ..., Tamfana, Grusduahenae, Hludana, Axsinginehae, Surt]
Length: 75
Categories (75, object): [Fimmilena, Gersimi, Loki, Turstuahenae, ..., Grusduahenae, Hludana, Axsinginehae, Surt]

In [21]:
# Number of rows per brand, sorted from most to least frequent.
data_raw["brand"].value_counts()

Fimmilena                14764
Friagabis                14496
Mercurius Arvernus       11150
Odin                     11045
Freyr                     9232
Gautr                     9171
Turstuahenae              8504
Mani                      7758
Gna                       7600
Travalaha                 7443
Alaisiagae                6597
Heimdall                  6463
Gersimi                   6393
Snotra                    6008
Tyr                       5897
Hercules Deusoniensis     5567
Lodur                     5160
Nersihenae                5130
Baudihillia               4780
Tuisto                    4396
Burorina                  4388
Beda                      4223
Loki                      4013
Hymir                     3547
Almaviahenae              3251
Kolga                     3188
Mercurius Hranno          2940
Alaferhviae               2533
Siofna                    2459
Uller                     2239
                         ...  
Verdandi                   728
Gebriniu

In [22]:
# Distinct colour labels (40 categories in the recorded output).
data_raw["color"].unique()

[mittelbraun, weiss / blau, mittelgrau, mittelblau, pink, ..., multicolor, gelb, bordeauxrot, rost, ockergelb]
Length: 40
Categories (40, object): [mittelbraun, weiss / blau, mittelgrau, mittelblau, ..., gelb, bordeauxrot, rost, ockergelb]

In [23]:
# Number of rows per colour; "schwarz" is by far the most common in the recorded output.
data_raw["color"].value_counts()

schwarz                 45986
dunkelblau              23546
schwarz / kombiniert    16375
mittelbraun             16294
mittelgrau              13459
dunkelbraun             12431
dunkelgrau              11221
mittelblau               9240
beige                    8629
hellgrau                 6597
rosa                     4877
pink                     4838
weiss / kombiniert       4280
weiss                    3908
bordeauxrot              3109
hellbraun                2803
camel                    2540
rot                      2317
beige / kombiniert       2198
lila                     1936
silber                   1888
hellblau                 1787
offwhite                 1783
türkis                   1753
multicolor               1653
gold                     1476
graublau                 1161
khaki                    1133
olivegrün                1110
gelb                      941
mintgrün                  866
orange                    824
hellgrün                  694
bronze/kup

In [24]:
# Distinct Abteilung codes in sorted order, to make gaps in the numbering easy to spot.
sorted(data_raw["Abteilung"].unique())

['Abteilung001',
 'Abteilung002',
 'Abteilung003',
 'Abteilung004',
 'Abteilung005',
 'Abteilung006',
 'Abteilung007']

In [25]:
# Number of rows per Abteilung. The recorded output is heavily skewed:
# Abteilung002 has 95349 rows while Abteilung003 has only 699.
data_raw["Abteilung"].value_counts()

Abteilung002    95349
Abteilung006    37016
Abteilung005    36328
Abteilung007    35498
Abteilung004     6704
Abteilung001     4986
Abteilung003      699
Name: Abteilung, dtype: int64

In [26]:
# Distinct WHG codes in sorted order; the recorded output runs WHG001..WHG043 without gaps.
sorted(data_raw["WHG"].unique())

['WHG001',
 'WHG002',
 'WHG003',
 'WHG004',
 'WHG005',
 'WHG006',
 'WHG007',
 'WHG008',
 'WHG009',
 'WHG010',
 'WHG011',
 'WHG012',
 'WHG013',
 'WHG014',
 'WHG015',
 'WHG016',
 'WHG017',
 'WHG018',
 'WHG019',
 'WHG020',
 'WHG021',
 'WHG022',
 'WHG023',
 'WHG024',
 'WHG025',
 'WHG026',
 'WHG027',
 'WHG028',
 'WHG029',
 'WHG030',
 'WHG031',
 'WHG032',
 'WHG033',
 'WHG034',
 'WHG035',
 'WHG036',
 'WHG037',
 'WHG038',
 'WHG039',
 'WHG040',
 'WHG041',
 'WHG042',
 'WHG043']

In [27]:
# Number of rows per WHG code, sorted from most to least frequent.
data_raw["WHG"].value_counts()

WHG015    23772
WHG021    19769
WHG012    15565
WHG007    13798
WHG041    13505
WHG034    13249
WHG009    12197
WHG010    12136
WHG042    11901
WHG043     8208
WHG006     7904
WHG038     6489
WHG022     6430
WHG023     5878
WHG035     5073
WHG032     4729
WHG008     3401
WHG005     3238
WHG028     3212
WHG036     3029
WHG001     2581
WHG026     2296
WHG014     1925
WHG037     1750
WHG002     1277
WHG033     1217
WHG020     1213
WHG025     1210
WHG019     1191
WHG039     1167
WHG003     1128
WHG027     1092
WHG031      830
WHG013      741
WHG040      717
WHG030      650
WHG018      548
WHG017      494
WHG011      435
WHG004      237
WHG016      205
WHG029      104
WHG024       89
Name: WHG, dtype: int64

In [28]:
# Distinct WUG codes in sorted order, to make gaps in the numbering easy to spot.
sorted(data_raw["WUG"].unique())

['WUG001',
 'WUG002',
 'WUG003',
 'WUG004',
 'WUG005',
 'WUG006',
 'WUG007',
 'WUG008',
 'WUG009',
 'WUG010',
 'WUG011',
 'WUG012',
 'WUG013',
 'WUG014',
 'WUG015',
 'WUG016',
 'WUG017',
 'WUG018',
 'WUG019',
 'WUG020',
 'WUG021',
 'WUG022',
 'WUG023',
 'WUG024',
 'WUG025',
 'WUG026',
 'WUG027',
 'WUG028',
 'WUG029',
 'WUG030',
 'WUG031',
 'WUG032',
 'WUG033',
 'WUG034',
 'WUG035',
 'WUG036',
 'WUG037',
 'WUG038',
 'WUG039',
 'WUG040',
 'WUG041',
 'WUG042',
 'WUG043',
 'WUG044',
 'WUG045',
 'WUG046',
 'WUG047',
 'WUG048',
 'WUG049',
 'WUG050',
 'WUG051',
 'WUG052',
 'WUG053',
 'WUG054',
 'WUG055',
 'WUG056',
 'WUG057',
 'WUG058',
 'WUG059',
 'WUG060',
 'WUG061',
 'WUG062',
 'WUG063',
 'WUG064',
 'WUG065',
 'WUG066',
 'WUG067',
 'WUG068',
 'WUG069',
 'WUG070',
 'WUG071',
 'WUG072',
 'WUG073',
 'WUG074',
 'WUG075',
 'WUG076',
 'WUG077',
 'WUG078',
 'WUG079',
 'WUG080',
 'WUG081',
 'WUG082',
 'WUG083',
 'WUG084',
 'WUG085',
 'WUG086',
 'WUG087',
 'WUG088',
 'WUG089',
 'WUG090',
 'WUG091',

In [29]:
# Number of rows per WUG code (152 distinct codes in the recorded output),
# sorted from most to least frequent.
data_raw["WUG"].value_counts()

WUG053    9678
WUG020    8557
WUG051    8428
WUG033    7023
WUG029    7018
WUG015    6856
WUG139    6359
WUG073    5958
WUG136    5901
WUG040    5542
WUG138    5542
WUG045    4834
WUG137    4678
WUG069    4668
WUG146    4596
WUG072    4053
WUG115    3570
WUG105    3456
WUG018    3424
WUG140    2926
WUG107    2674
WUG117    2597
WUG046    2554
WUG077    2457
WUG001    2427
WUG114    2379
WUG055    2275
WUG035    2251
WUG118    2221
WUG070    2189
          ... 
WUG057     205
WUG011     189
WUG103     182
WUG060     158
WUG002     154
WUG124     145
WUG061     104
WUG009     104
WUG101     104
WUG087      89
WUG122      88
WUG008      81
WUG032      78
WUG142      78
WUG091      77
WUG065      62
WUG044      59
WUG007      52
WUG096      52
WUG024      48
WUG047      42
WUG095      26
WUG028      26
WUG098      26
WUG133      26
WUG004      26
WUG039      25
WUG012      23
WUG075      21
WUG123      13
Name: WUG, Length: 152, dtype: int64