In [1]:
from pathlib import Path

import pandas as pd
from sklearn.linear_model import LinearRegression

%config IPCompleter.greedy=True

# Data

In [2]:
# Read the five competition CSV files from the working directory, in the same
# order as the names on the left-hand side.
item_categories, items, sales_train, shops, test = map(
    pd.read_csv,
    (
        "item_categories.csv",
        "items.csv",
        "sales_train.csv",
        "shops.csv",
        "test.csv",
    ),
)


### Descriptive statistics (items, item categories, shops, sales)

In [35]:
def get_descriptive_stats(df):
    """Print a plain-text overview of a DataFrame.

    Five sections are written to stdout, in this order: the shape, the first
    five rows, the number of unique values per column, the number of missing
    values per column, and the output of ``describe()``. Consecutive sections
    are separated by a run of blank lines.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    # Each entry is (heading, thunk). The thunks keep every statistic lazy so
    # it is computed only when its section is about to be printed.
    sections = (
        ("SHAPE:", lambda: df.shape),
        ("SAMPLE:", lambda: df.head(5)),
        ("#UNIQUE:", lambda: df.nunique()),
        ("MISSING VALUES:", lambda: df.isnull().sum()),
        ("DESCRIPTIVES:", lambda: df.describe()),
    )
    for position, (heading, compute) in enumerate(sections):
        if position:
            # Blank-line separator between sections (not after the last one).
            print("\n\n")
        print(heading)
        print(compute())

In [36]:
# Overview of the `items` table: shape, first rows, unique counts, missing values, describe().
get_descriptive_stats(items)

SHAPE:
(22170, 3)



SAMPLE:
                                           item_name  item_id  \
0          ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.)         D        0   
1  !ABBYY FineReader 12 Professional Edition Full...        1   
2      ***В ЛУЧАХ СЛАВЫ   (UNV)                    D        2   
3    ***ГОЛУБАЯ ВОЛНА  (Univ)                      D        3   
4        ***КОРОБКА (СТЕКЛО)                       D        4   

   item_category_id  
0                40  
1                76  
2                40  
3                40  
4                40  



#UNIQUE:
item_name           22170
item_id             22170
item_category_id       84
dtype: int64



MISSING VALUES:
item_name           0
item_id             0
item_category_id    0
dtype: int64



DESCRIPTIVES:
           item_id  item_category_id
count  22170.00000      22170.000000
mean   11084.50000         46.290753
std     6400.07207         15.941486
min        0.00000          0.000000
25%     5542.25000         37.000000
50%    1

In [37]:
# Overview of the `item_categories` table: shape, first rows, unique counts, missing values, describe().
get_descriptive_stats(item_categories)

SHAPE:
(84, 2)



SAMPLE:
        item_category_name  item_category_id
0  PC - Гарнитуры/Наушники                 0
1         Аксессуары - PS2                 1
2         Аксессуары - PS3                 2
3         Аксессуары - PS4                 3
4         Аксессуары - PSP                 4



#UNIQUE:
item_category_name    84
item_category_id      84
dtype: int64



MISSING VALUES:
item_category_name    0
item_category_id      0
dtype: int64



DESCRIPTIVES:
       item_category_id
count         84.000000
mean          41.500000
std           24.392622
min            0.000000
25%           20.750000
50%           41.500000
75%           62.250000
max           83.000000


In [38]:
# Overview of the `shops` table: shape, first rows, unique counts, missing values, describe().
get_descriptive_stats(shops)

SHAPE:
(60, 2)



SAMPLE:
                        shop_name  shop_id
0   !Якутск Орджоникидзе, 56 фран        0
1   !Якутск ТЦ "Центральный" фран        1
2                Адыгея ТЦ "Мега"        2
3  Балашиха ТРК "Октябрь-Киномир"        3
4        Волжский ТЦ "Волга Молл"        4



#UNIQUE:
shop_name    60
shop_id      60
dtype: int64



MISSING VALUES:
shop_name    0
shop_id      0
dtype: int64



DESCRIPTIVES:
         shop_id
count  60.000000
mean   29.500000
std    17.464249
min     0.000000
25%    14.750000
50%    29.500000
75%    44.250000
max    59.000000


In [39]:
# Overview of the `sales_train` table: shape, first rows, unique counts, missing values, describe().
# NOTE(review): the captured output below lists `month` and `year` columns (shape (..., 8)).
# Those columns are only created in the "Simple baseline 1" cell further down, so this
# output reflects out-of-order execution; a fresh top-to-bottom run would presumably
# show only the columns read from sales_train.csv -- confirm after Restart & Run All.
get_descriptive_stats(sales_train)

SHAPE:
(2935849, 8)



SAMPLE:
         date  date_block_num  shop_id  item_id  item_price  item_cnt_day  \
0  02.01.2013               0       59    22154      999.00           1.0   
1  03.01.2013               0       25     2552      899.00           1.0   
2  05.01.2013               0       25     2552      899.00          -1.0   
3  06.01.2013               0       25     2554     1709.05           1.0   
4  15.01.2013               0       25     2555     1099.00           1.0   

  month  year  
0    01  2013  
1    01  2013  
2    01  2013  
3    01  2013  
4    01  2013  



#UNIQUE:
date               1034
date_block_num       34
shop_id              60
item_id           21807
item_price        19993
item_cnt_day        198
month                12
year                  3
dtype: int64



MISSING VALUES:
date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
month             0
year              0
dtype: int64



# Simple baselines

In [23]:
# Simple baseline 1: for every shop+item pair, sum the November sales in the
# training set (2013 and 2014) and divide by the number of Novembers, i.e.
# predict the average November sales.

# `date` holds 'DD.MM.YYYY' strings: split once and keep month/year as strings.
# NOTE: this adds `month` and `year` columns to `sales_train` in place
# (re-running the cell simply overwrites them with the same values).
date_parts = sales_train['date'].str.split('.')
sales_train['month'] = date_parts.str[-2]
sales_train['year'] = date_parts.str[-1]
november_only = sales_train[sales_train['month'] == '11']

# Number of Novembers in the training set, derived from the data instead of hard-coded.
n_novembers = november_only['year'].nunique()

avg_grouped = november_only.groupby(['shop_id', 'item_id']).agg({'item_cnt_day': 'sum'})
avg_grouped['item_cnt_day'] /= n_novembers

# Left join keeps every test row; pairs with no November sales get a prediction of 0.
merged_with_test = test.merge(avg_grouped, on=['shop_id', 'item_id'], how='left').fillna(0)

submission_file = merged_with_test[['ID', 'item_cnt_day']].rename(
    columns={'item_cnt_day': 'item_cnt_month'}
)

# Make sure the output directory exists so to_csv does not fail on a fresh checkout.
submission_path = Path("submissions/november_average.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission_file.to_csv(submission_path, index=False)

In [24]:
# Simple baseline 2: for every shop+item pair, sum the sales over the whole
# training period and divide by the number of months, i.e. predict the
# average monthly sales.

# Independent copy, so later changes to `all_sales` cannot leak into `sales_train`.
all_sales = sales_train.copy()

# Number of months in the training set, derived from the data instead of hard-coded
# (`date_block_num` is the month index; it has 34 distinct values in this data set).
n_months = all_sales['date_block_num'].nunique()

avg_grouped = all_sales.groupby(['shop_id', 'item_id']).agg({'item_cnt_day': 'sum'})
avg_grouped['item_cnt_day'] /= n_months

# Left join keeps every test row; pairs with no sales at all get a prediction of 0.
merged_with_test = test.merge(avg_grouped, on=['shop_id', 'item_id'], how='left').fillna(0)

submission_file = merged_with_test[['ID', 'item_cnt_day']].rename(
    columns={'item_cnt_day': 'item_cnt_month'}
)

# Make sure the output directory exists so to_csv does not fail on a fresh checkout.
submission_path = Path("submissions/monthly_average.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission_file.to_csv(submission_path, index=False)