In [1]:
# Libraries for data exploration.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from helpful_functions import MLFunctions
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix

In [2]:
# Load the sales training data from its CSV file into a DataFrame.
sales_csv_path = "../../../Data/future_sales/sales_train.csv"
sales = pd.read_csv(sales_csv_path)

In [3]:
# Preview the first five records of the raw frame.
sales.head(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [4]:
# Preview the last five records of the raw frame.
sales.tail(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2935844,10.10.2015,33,25,7409,299.0,1.0
2935845,09.10.2015,33,25,7460,299.0,1.0
2935846,14.10.2015,33,25,7459,349.0,1.0
2935847,22.10.2015,33,25,7440,299.0,1.0
2935848,03.10.2015,33,25,7460,299.0,1.0


In [5]:
# Print the column dtypes and memory footprint of the raw frame. Note that `date` is
# loaded as a generic object column rather than a datetime.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   date_block_num  int64  
 2   shop_id         int64  
 3   item_id         int64  
 4   item_price      float64
 5   item_cnt_day    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB


In [6]:
# Summary statistics for the numeric columns. Worth noticing in the result: item_price has
# a minimum of -1.0 and item_cnt_day a minimum of -22.0, while their maxima (307980.0 and
# 2169.0) sit far above the 75th percentile.
# NOTE(review): negative counts presumably represent returns - confirm before cleaning.
sales.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641
std,9.422988,16.22697,6324.297,1729.8,2.618834
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


In [7]:
# Use the MLFunctions helper class to convert each date to an equivalent numeric value.
# NOTE(review): MLFunctions comes from the local helpful_functions module and is not
# visible here; presumably dateNum() returns one value per row of `sales` - confirm.
user_function = MLFunctions(sales)
date_num = user_function.dateNum()

In [8]:
# Store the numeric date representation as a new `date_num` column on `sales`.
# This assignment modifies `sales` in place, so the frames displayed by earlier cells
# no longer show every column.
# NOTE(review): assumes date_num lines up row-for-row with `sales` - confirm against dateNum().
sales["date_num"] = date_num

In [9]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
# Rows are shuffled before splitting (shuffle=True is the library default, spelled out here).
train_set, test_set = train_test_split(
    sales,
    test_size=0.2,
    shuffle=True,
    random_state=44,
)

In [10]:
# Work on an independent copy of the training rows so exploration never alters train_set,
# and add the revenue of each record (unit price times quantity) as `total_sales`.
exploration_data = train_set.assign(
    total_sales=train_set["item_price"] * train_set["item_cnt_day"]
)

In [None]:
# Pairwise scatter plots (with per-attribute histograms on the diagonal) of the numeric columns.
attributes = ["date_block_num", "shop_id", "item_id", "item_price", "item_cnt_day", "date_num"]

# The training split holds roughly 2.3 million rows (80% of 2,935,849). Drawing all of
# them in a 6x6 grid of panels is extremely slow and memory hungry, so plot a
# reproducible random subset instead. The cap never exceeds the number of available rows.
scatter_sample_size = min(len(exploration_data), 50_000)
scatter_sample = exploration_data[attributes].sample(n=scatter_sample_size, random_state=44)

scatter_matrix(scatter_sample, figsize=(12, 8))
plt.show()

In [None]:
# Revenue per month block: add up total_sales within each date_block_num.
total_sales_month = (
    exploration_data
    .groupby("date_block_num")["total_sales"]
    .sum()
)

In [None]:
# Line chart of the revenue summed per month block.
# The x-axis carries date_block_num, a month index (0 for January 2013 up to 33 for
# October 2015 in the loaded data), so it is labelled in months rather than years.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(total_sales_month)
ax.set_title("Sales over the Years")
ax.set_xlabel("Month block (date_block_num)")
ax.set_ylabel("Total sales")
ax.grid()
plt.show()

In [None]:
# Units sold per month block: add up item_cnt_day within each date_block_num.
total_items_month = (
    exploration_data
    .groupby("date_block_num")["item_cnt_day"]
    .sum()
)

In [None]:
# Line chart of the number of items sold per month block.
# The x-axis carries date_block_num, a month index (0 for January 2013 up to 33 for
# October 2015 in the loaded data), so it is labelled in months rather than years.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(total_items_month)
ax.set_title("Total items sold over Years")
ax.set_xlabel("Month block (date_block_num)")
ax.set_ylabel("Total items")
ax.grid()
plt.show()

In [None]:
# Overlay monthly revenue and monthly units sold on one chart. Both series are passed
# through the natural logarithm so that their very different magnitudes fit on one axis.
# Both series are totals per month block across all shops (not per shop), and the
# plotted values are logarithms rather than raw counts - the labels say so explicitly.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(np.log(total_sales_month), label="Sales made")
ax.plot(np.log(total_items_month), label="Items sold")
ax.set_title("Monthly total Sales and Items sold (log scale)")
ax.set_xlabel("Month block (date_block_num)")
ax.set_ylabel("Natural log of monthly total")
ax.legend()
ax.grid()
plt.show()

In [None]:
# Revenue per shop: add up total_sales within each shop_id.
# Despite the variable name, this is a total per shop over all rows, not a monthly figure.
total_shops_month = (
    exploration_data
    .groupby("shop_id")["total_sales"]
    .sum()
)

In [None]:
# Bar chart with one bar per shop_id showing that shop's summed revenue.
total_shops_month.plot.bar(
    figsize=(15, 10),
    grid=True,
    title="Sales per Shop",
    xlabel="Shops",
    ylabel="Total sales",
)
plt.show()