In [1]:
import pandas as pd
import os

from src.settings import RAW_PATH

In [2]:
raw_files = os.listdir(RAW_PATH)
raw_files

['sales_train.csv',
 'shops.csv',
 'test.csv',
 'item_categories.csv',
 'items.csv',
 'sample_submission.csv']

Clearly, not all of the files provided are useful for training. In particular, "test", "sample_submission" are admin, so will remove from the dict.

In [3]:
for item in ['test.csv', 'sample_submission.csv']:
    try:
        raw_files.remove(raw_files.index(item))
    except ValueError:
        print(f"{item} not found in list")

test.csv not found in list
sample_submission.csv not found in list


For convinience let's add all raw files to one dict

In [6]:
raw_files_di = {}
for f in raw_files:
    path = RAW_PATH + f
    raw_files_di[f.split('.')[0]] = pd.read_csv(path)
    

In [7]:
# reading files and looking at headers
for el in raw_files_di:
    print(el,':\n', raw_files_di[el].head())
    print('='*30, '\n\n')

sales_train :
          date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0  02.01.2013               0       59    22154      999.00           1.0
1  03.01.2013               0       25     2552      899.00           1.0
2  05.01.2013               0       25     2552      899.00          -1.0
3  06.01.2013               0       25     2554     1709.05           1.0
4  15.01.2013               0       25     2555     1099.00           1.0


shops :
                         shop_name  shop_id
0   !Якутск Орджоникидзе, 56 фран        0
1   !Якутск ТЦ "Центральный" фран        1
2                Адыгея ТЦ "Мега"        2
3  Балашиха ТРК "Октябрь-Киномир"        3
4        Волжский ТЦ "Волга Молл"        4


test :
    ID  shop_id  item_id
0   0        5     5037
1   1        5     5320
2   2        5     5233
3   3        5     5232
4   4        5     5268


item_categories :
         item_category_name  item_category_id
0  PC - Гарнитуры/Наушники                 0
1      

All the columns are clear except for `date_block_num` in the `sales_train` file, let's look at it closer.

In [65]:
raw_files_di['sales_train']['date_block_num'].value_counts()

date_block_num
11    143246
23    130786
2     121347
0     115690
1     108613
7     104772
6     100548
5     100403
12     99349
10     96736
8      96137
9      94202
3      94109
14     92733
4      91759
13     89830
24     88522
19     86614
22     86428
17     82408
21     79361
18     78760
16     78529
15     77906
20     73157
25     71808
26     69977
31     57029
27     56274
30     55549
29     54617
28     54548
33     53514
32     50588
Name: count, dtype: int64

In [75]:
raw_files_di['sales_train'][raw_files_di['sales_train']['date_block_num'] == 10]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
1027580,06.11.2013,10,43,4420,299.0,1.0
1027581,16.11.2013,10,28,10033,199.0,1.0
1027582,14.11.2013,10,28,10051,149.0,1.0
1027583,22.11.2013,10,28,10051,149.0,1.0
1027584,30.11.2013,10,28,10051,149.0,1.0
...,...,...,...,...,...,...
1124311,27.11.2013,10,50,3734,2599.0,1.0
1124312,29.11.2013,10,50,3734,2599.0,1.0
1124313,09.11.2013,10,50,3743,799.0,1.0
1124314,16.11.2013,10,50,3743,799.0,1.0


Some observations on data structure:

- `shops` -- a mapping of shops names to an ordinal categorical feature shop_id
- `item_categories` -- list of items' categories
- `items` -- unique list of items in stock, already merged with item_categories
- `sales_train` -- main table for training with IDs of shops, items and sales date. Additional columns are item price and date_block_num. 
    - `date_block_num` is grouping by data, i.e. each value marks observations recorded within 1 month only. 
    - `price` column is self explanatory.

So it makes sense to add the items grouping ti the sales_train tale by joining the items table and explore the data in the merged table afterwards. Let's do that.
