### Import useful modules

In [1]:
import pandas as pd # dataframes
import numpy as np # numerical library
import seaborn as sns # advanced plotting library
from matplotlib import pyplot as plt # plotting library
%matplotlib inline
# need this in order to display plots in notebook

### Load data

In [2]:
# Directory that holds the input CSV files. Keep the trailing slash: file
# names are appended by plain string concatenation (DATA_PATH + 'stores.csv').
# NOTE(review): this is an absolute path — adjust it when running the notebook
# in a different environment.
DATA_PATH = '/root/src/data/'

In [3]:
# Load the stores table; the two integer columns are read as unsigned 32-bit.
df_stores = pd.read_csv(
    DATA_PATH + 'stores.csv',
    dtype={'store_nbr': np.uint32, 'cluster': np.uint32},
)

### What does the data look like?

In [4]:
# Peek at the first five rows to check that the columns parsed as expected.
df_stores.head(n=5)

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


### Nulls in data

In [5]:
# Print the column names next to a per-column flag telling whether that
# column contains at least one null value.
print(
    "Nulls in columns: {yolo1} => {yolo2}".format(
        yolo1=df_stores.columns.values,
        yolo2=df_stores.isnull().any().values,
    )
)

Nulls in columns: ['store_nbr' 'city' 'state' 'type' 'cluster'] => [False False False False False]


In [6]:
def assess_missing_values(df):
    """Summarise missing values per column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns:
        'Total'   - number of null entries in that column;
        'Percent' - null entries divided by the number of rows. Despite the
                    name this is a fraction between 0 and 1, it is not
                    multiplied by 100.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    # count() is taken on the boolean mask, which itself holds no nulls,
    # so the denominator is the number of rows for every column.
    null_share = null_counts / null_mask.count()
    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['Total', 'Percent'],
    )
    return summary

In [7]:
# Per-column null count and null fraction for the stores table.
assess_missing_values(df=df_stores)

Unnamed: 0,Total,Percent
store_nbr,0,0.0
city,0,0.0
state,0,0.0
type,0,0.0
cluster,0,0.0


# Summary

In [8]:
# Number of distinct values per column. dropna=False keeps the count equal to
# len(series.unique()), which would count a missing value as its own entry.
n_unique = {
    col: df_stores[col].nunique(dropna=False)
    for col in ['store_nbr', 'city', 'state', 'type', 'cluster']
}

print("There are", n_unique['store_nbr'],
      "stores in", n_unique['city'],
      "cities", "in", n_unique['state'],
      "states.")
print("The stores are categorised into", n_unique['type'],
      "types and", n_unique['cluster'],
      "clusters.")

# List the actual category labels in sorted order.
print("Types:", np.sort(df_stores['type'].unique()))
print("Clusters:", np.sort(df_stores['cluster'].unique()))

There are 54 stores in 22 cities in 16 states.
The stores are categorised into 5 types and 17 clusters.
Types: ['A' 'B' 'C' 'D' 'E']
Clusters: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
