In [14]:
import pandas as pd
#import plotly.plotly as py
from chart_studio.plotly import plot, iplot
import cufflinks as cf
from ipywidgets import interact
import datetime as dt
import calendar
from geopy.geocoders import Nominatim
%matplotlib inline
from geopandas import GeoDataFrame
import folium
from folium.plugins import HeatMap

# Put cufflinks in offline mode so DataFrame.iplot() renders the figures inside
# the notebook instead of going through the Chart Studio cloud service.
cf.go_offline()

ImportError: 
The plotly.plotly module is deprecated,
please install the chart-studio package and use the
chart_studio.plotly module instead. 


This notebook follows these steps:
    1. Extract the sales by store_type
    2. Plot the segmentation to see which sales channels are more effective
    3. Extract the weather in each city
    4. Combine both dataframes
    5. Evaluate correlations & plot different graphs to get the insights

# 1. Sales extraction

This extract contains a retailer's sales for March & May 2019, broken down by sales channel

In [10]:
# NOTE(review): machine-specific absolute path; the notebook only runs where this file exists.
SALES_CSV_PATH = '/Users/juanfernandez-dazadeflorez/Desktop/Ironhack/labs/data-labs/module-2/visualizing-real-world-data-project/sales_by_type.csv'

# The export is semicolon-delimited rather than comma-delimited.
sales = pd.read_csv(SALES_CSV_PATH, sep=';')
sales.head()

Unnamed: 0,Store's Types,City,DATE,UnitsSold
0,ECI stores,LA CORUÑA,01/03/2019,2
1,ECI stores,LA CORUÑA,02/03/2019,81
2,ECI stores,LA CORUÑA,04/03/2019,42
3,ECI stores,LA CORUÑA,07/03/2019,57
4,ECI stores,LA CORUÑA,08/03/2019,33


Let's make the column names consistent, each one starting with a capital letter

In [11]:
# Positional rename: the four names replace, in order, the CSV headers
# "Store's Types", "City", "DATE" and "UnitsSold" shown by sales.head().
# The list length must equal the number of columns in the frame.
sales.columns = ['StoreType','City','Date','UnitsSold']

# 2. Plotting the segmentation to get the first conclusions

In [12]:
# Total units sold per sales channel: one row per StoreType.
units_by_channel = sales.pivot_table(
    values='UnitsSold',
    index='StoreType',
    aggfunc='sum',
).reset_index()

# Bar chart comparing the channels.
units_by_channel.iplot(
    kind='bar',
    x='StoreType',
    y='UnitsSold',
    xTitle='Channel',
    yTitle='Units Sold',
    title='Sales by channel',
    color='blue',
    opacity=0.4,
)

AttributeError: 'DataFrame' object has no attribute 'iplot'

We can clearly see that during these two months the best sales channel, by far, has been the stores located in shopping centers. Note, however, that the online channel consists of a single store, whilst the other channels aggregate several stores

Let's evaluate the sales trend of each channel over the whole period

In [13]:
# Parse the dates on a copy before pivoting: the raw 'dd/mm/YYYY' strings sort
# lexicographically (01/03, 01/04, 02/03, ...), which would scramble the time
# axis of the trend line. `sales` itself is left untouched by this cell, and the
# conversion is a no-op if the column has already been parsed to datetimes.
sales_by_date = sales.assign(Date=pd.to_datetime(sales['Date'], format='%d/%m/%Y'))

# Units sold per day (rows) and per channel (columns).
daily_units_by_channel = sales_by_date.pivot_table(
    index='Date',
    columns='StoreType',
    values='UnitsSold',
    aggfunc='sum',
).reset_index()

# The x axis carries the date, so it is labelled 'Date' (it previously said 'Channel').
daily_units_by_channel.iplot(
    kind='line',
    x='Date',
    xTitle='Date',
    yTitle='Units Sold',
    title='Sales Trend by channel',
)

AttributeError: 'DataFrame' object has no attribute 'iplot'

Now, let's add a column with the day of the week, to understand which weekday has the most sales

In [None]:
# The raw dates are day-first strings such as '01/03/2019' (see sales.head()), so
# the matching format is '%d/%m/%Y'; '%Y-%m-%d' does not match them and raises
# ValueError. pd.to_datetime converts the whole column at once, so no per-row
# lambda is needed.
sales['Date'] = pd.to_datetime(sales['Date'], format='%d/%m/%Y')

In [None]:
# Check the column dtypes; 'Date' should now be datetime64[ns] rather than object.
sales.dtypes

In [None]:
# Series.dt.weekday_name was removed in pandas 1.0; dt.day_name() is its
# replacement and returns English day names ('Monday', ...) by default.
sales['DayOfWeek'] = sales['Date'].dt.day_name()
sales.head()

Let's create a pivot table of the units sold per weekday and channel. Since we want to compare percentages rather than absolute units, we then normalize each channel by its own total

In [None]:
# Units sold per weekday (rows) and sales channel (columns); reset_index turns
# DayOfWeek back into a regular column.
pivot_days = (
    sales
    .pivot_table(values='UnitsSold', index='DayOfWeek', columns='StoreType', aggfunc='sum')
    .reset_index()
)
pivot_days

In [None]:
# Check the dtypes of the pivoted columns before computing the percentages.
pivot_days.dtypes

In [None]:
# Convert each channel's weekday totals into a share of that channel's overall
# units, so channels of very different size can be compared on one scale.
# One loop replaces four copy-pasted stanzas; the column order of the original
# (ECI, Street, Online, Shopping Center) is kept.
for store_type in ['ECI stores', 'Street stores', 'Online stores', 'Shopping Center stores']:
    # Vectorised division of the column by its own total.
    pivot_days['% ' + store_type] = pivot_days[store_type] / pivot_days[store_type].sum()

In [None]:
# Display the pivot with the '% ...' share columns appended.
pivot_days

In [None]:
# The pivot sorts the weekday names alphabetically; put them back in calendar order.
# Reordering by name (instead of by hard-coded row position) stays correct even if
# a weekday is missing from the data, and errors='ignore' on the drop lets this
# cell be re-run without failing on the already-removed columns.
WEEKDAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

pivot_days = (
    pivot_days
    # Keep only DayOfWeek and the percentage columns.
    .drop(columns=['ECI stores', 'Online stores', 'Shopping Center stores', 'Street stores'],
          errors='ignore')
    .set_index('DayOfWeek')
    .reindex(pd.Index(WEEKDAY_ORDER, name='DayOfWeek'))
    .reset_index()
)

In [None]:
# x is the weekday and the plotted columns are each channel's share of its own
# total units (fractions of 1), so the axes are labelled accordingly; they
# previously read 'Channel' and 'Units Sold'.
pivot_days.iplot(
    kind='line',
    x='DayOfWeek',
    xTitle='Day of week',
    yTitle='Share of units sold',
    title='Sales Trend by channel by day of week',
)

It's quite interesting to see that on Sundays, contrary to all the physical stores, the online store increases its units sold. Bear in mind that this data could be somewhat biased, since it only covers one and a half months

Now, let's add the coordinates of each city so that we can create a heatmap

In [None]:
# First, collect the distinct city names, in order of first appearance.
cities = sales['City'].unique().tolist()

The geocoder's `geocode` method returns the latitude and longitude of each city

In [None]:
# Build the geocoder once: it does not change between iterations, and Nominatim
# needs an explicit user_agent identifying the application (geopy 2.x raises a
# configuration error without one).
geolocator = Nominatim(user_agent='sales-weather-analysis')

for city in cities:
    location = geolocator.geocode(city)
    if location is None:
        # geocode() returns None when nothing matches; leave the coordinates
        # missing for this city instead of crashing on `None.latitude`.
        print(f'No geocoding result for {city!r}; coordinates left empty')
        continue
    # Write the coordinates onto every sales row of this city.
    city_rows = sales['City'] == city
    sales.loc[city_rows, 'Latitude'] = location.latitude
    sales.loc[city_rows, 'Longitude'] = location.longitude

In [None]:
# Preview the rows to confirm the Latitude / Longitude columns were added.
sales.head()

The next step is to aggregate the units sold for each latitude/longitude pair, to use them in a heatmap

In [None]:
# Sum the units sold at each coordinate pair; after reset_index the columns are
# Latitude, Longitude, UnitsSold.
pivot_map = (
    sales
    .pivot_table(values='UnitsSold', index=['Latitude', 'Longitude'], aggfunc='sum')
    .reset_index()
)
pivot_map.head()

A function is created to have a default map centered in Spain

In [None]:
def generateBaseMap(default_location=[40.41, -3.7], default_zoom_start=6.45):
    """Create a new folium map with the scale control enabled.

    Parameters
    ----------
    default_location : list of float
        [latitude, longitude] passed to folium.Map as the map location.
        The default is the point the notebook uses to centre the map on Spain.
    default_zoom_start : float
        Initial zoom level passed to folium.Map.

    Returns
    -------
    The object returned by folium.Map.
    """
    return folium.Map(
        location=default_location,
        zoom_start=default_zoom_start,
        control_scale=True,
    )

In [None]:
# Start from the default base map.
base_map = generateBaseMap()

# One [Latitude, Longitude, UnitsSold] row per location, as a plain list of lists.
heat_points = pivot_map.values.tolist()

heat_layer = HeatMap(heat_points, radius=12, max_zoom=12)
heat_layer.add_to(base_map)

# Last expression of the cell: renders the map in the notebook.
base_map

The area with the highest sales is Madrid, followed by the Valencian Community thanks to the sales in Alicante, Valencia & Castellón.
Finally, Asturias also shows strong performance

# 3. Extract the weather in each city included in the analysis, for each of the dates