# Time Series Data Analysis

In [2]:
# Import required libraries.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit

In [3]:
# Load the raw sales records from CSV.
# The "date" column is parsed as a datetime using the day.month.year pattern.
filepath = "../../../Data/future_sales/sales_train.csv"
stores_data = pd.read_csv(
    filepath,
    parse_dates=["date"],
    date_format="%d.%m.%Y",
)

In [4]:
# Hold out a test set before any exploration so that later decisions are not
# influenced by the evaluation data (data snooping).
# The split is stratified on shop_id so both parts keep the same shop mix.
# NOTE(review): this is a random split across the whole date range; for a
# forecasting task a chronological hold-out may be more appropriate - confirm.
RANDOM_STATE = 42  # fixed seed so Restart & Run All reproduces the same split
split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.1,  # the library default, spelled out so the split size is visible
    random_state=RANDOM_STATE,
)

# n_splits=1, so take the single (train, test) pair directly instead of looping.
train_index, test_index = next(split.split(stores_data, stores_data["shop_id"]))

# split() yields positional row numbers, so select with .iloc (position-based)
# rather than .loc (label-based); the two only coincide for a default RangeIndex.
# Each part is then ordered chronologically by the "date" column.
stores_train_data = stores_data.iloc[train_index, :].sort_values(by="date")
stores_test_data = stores_data.iloc[test_index, :].sort_values(by="date")

## Quick data exploration
- Inspect the columns and their data types
- Review basic summary statistics of the data

In [6]:
# Print the index range, column names, dtypes and memory usage of the training split.
stores_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2642264 entries, 92624 to 2930887
Data columns (total 6 columns):
 #   Column          Dtype         
---  ------          -----         
 0   date            datetime64[ns]
 1   date_block_num  int64         
 2   shop_id         int64         
 3   item_id         int64         
 4   item_price      float64       
 5   item_cnt_day    float64       
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 141.1 MB


In [7]:
# Preview the first rows of the training split (head() shows five rows by default).
stores_train_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
92624,2013-01-01,0,42,10526,149.0,1.0
18524,2013-01-01,0,28,2415,299.0,1.0
111960,2013-01-01,0,42,3323,1989.0,2.0
105843,2013-01-01,0,37,9601,849.0,-1.0
108544,2013-01-01,0,37,15538,58.0,1.0


In [14]:
# Summary statistics (count, mean, min, quartiles, max, std) for each column of the training split.
stores_train_data.describe()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2642264,2642264.0,2642264.0,2642264.0,2642264.0,2642264.0
mean,2014-04-03 06:35:47.750868224,14.57117,33.00172,10197.34,890.3925,1.24264
min,2013-01-01 00:00:00,0.0,0.0,0.0,-1.0,-22.0
25%,2013-08-01 00:00:00,7.0,22.0,4476.0,249.0,1.0
50%,2014-03-04 00:00:00,14.0,31.0,9343.0,399.0,1.0
75%,2014-12-05 00:00:00,23.0,47.0,15684.0,999.0,1.0
max,2015-10-31 00:00:00,33.0,59.0,22169.0,59200.0,2169.0
std,,9.423155,16.22698,6324.402,1719.623,2.632299


In [16]:
# Check the smallest value of item_cnt_day in the training split.
# NOTE(review): negative counts presumably represent returns/refunds - TODO confirm
# against the dataset description before deciding how to handle them.
stores_train_data["item_cnt_day"].min()

-22.0