In [None]:
# Last amended: 10th May, 2020
# objectives:
#           i)  Learning to draw various types of graphs
#          ii)  Conditional plots using catplot
#         iii)  Relationship plots using relplot
#          iv)  Learning to use plotly express
# Good references:
#            API: https://plot.ly/python-api-reference/plotly.express.html
#                  https://plotly.com/python-api-reference/plotly.express.html
#        Gallery: https://nbviewer.jupyter.org/github/plotly/plotly_express/blob/gh-pages/gallery.ipynb
#    Walkthrough: https://plot.ly/python/plotly-express/#plotly-express

In [None]:
# 1.0 Which python is your jupyter notebook pointing to?
#     If it is not the correct python, then install: conda install nb_conda_kernels
#     This gives you the option to select the correct kernel (under Kernel-->Change Kernel)
import sys
sys.executable

In [None]:
# 1.1 Call libraries
%reset -f
# 1.2 For data manipulations
import numpy as np
import pandas as pd
# 1.3 For plotting
import matplotlib.pyplot as plt
import matplotlib
# Install as: conda install -c plotly plotly 
import plotly.express as px
# 1.4 For data processing
from sklearn.preprocessing import StandardScaler
# 1.5 OS related
import os


In [None]:
# 1.6 Display the output of every top-level expression in a cell,
#     not only that of the last one. Many cells below rely on this to
#     show several results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# 1.7 Raise the pandas display limits so that long/wide results
#     are printed without truncation
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)

In [None]:
# 2.0 Go to folder containing data file.
#     The folder can be overridden without editing the notebook by setting
#     the ADVERTISING_DATA_DIR environment variable; when it is not set the
#     original default location is used.
#     Linux example: /home/ashok/datasets/advertising
data_dir = os.environ.get(
                          "ADVERTISING_DATA_DIR",
                          "D:\\data\\OneDrive\\Documents\\advertising"
                          )
os.chdir(data_dir)
os.listdir()            # List all files in the folder

# 2.1 Read file and, while reading it,
#      convert 'Timestamp' to datetime type
ad = pd.read_csv("advertising.zip",
                  parse_dates = ['Timestamp']    # parse this column as dates
                  )
ad.head()

In [None]:
# 2.2 Data type of every attribute
ad.dtypes

# 2.3 Overall dataset summary: memory taken by the dataset
#     and status of nulls
ad.info()

# 2.4 Memory used, column by column
ad.memory_usage()

# 2.4.1 Dimensions and column names
ad.shape                # (1000, 10)
ad.columns.values
ad.shape[1]             # 10 attributes

# 2.5 Categorical data value counts,
#     i.e. number of levels per category
len(ad['City'].unique())                # 969 cities out of 1000
ad['City'].value_counts()

# 2.6 How many countries
len(ad['Country'].unique())             # 237 countries
ad['Country'].value_counts()            # Mostly 2 per country

# 2.7 Distribution of gender
ad['Male'].value_counts()               # 519:481

# 2.8 Distribution of clicks
ad['Clicked on Ad'].value_counts()      # 1 and 0 in the ratio of 500:500
                                        # This is highly optimistic. Generally clicks may be 1%


In [None]:
#############################
# 3.0 Create features
#############################
# 3.1 Discretise continuous columns into three equal-width bins
#     (as against equal data-points (quantile) bins or kmeans clusters).
#     sklearn's KBinsDiscretizer is an alternative.
ad["age_cat"] = pd.cut(ad['Age'], bins=3, labels=["y", "m", "s"])
ad["area_income_cat"] = pd.cut(ad['Area Income'], bins=3, labels=["l", "m", "h"])


In [None]:
# 3.2 New column: number of characters in each ad-line
ad['AdTopicLineLength'] = ad['Ad Topic Line'].apply(len)

# 3.3 New column: number of words in each ad-line
#     Try "good boy".split(" ")  and  len("good boy".split(" "))
"good boy".split(" ")             # ['good', 'boy']
len("good boy".split(" "))        # 2

# 3.3.1 Note the use of apply(): here it transforms the complete Series
#       element by element, rather than summarising it as in groupby.
ad['AdTopicNoOfWords'] = ad['Ad Topic Line'].apply(
                                                   lambda line : len(line.split(" "))
                                                  )



In [None]:
# 3.4 For City and then for Country, add a '<name>_count' column holding
#     how many rows share that row's value.
#     Note the use of the transform method: the result has one value per
#     row of ad ('count' is a groupby method).
for col in ('City', 'Country'):
    grouped = ad.groupby([col])
    ad[col + '_count'] = grouped[col].transform('count')


# 3.5 Date components are extracted through the Series.dt accessor
#     https://pandas.pydata.org/pandas-docs/stable/reference/series.html#api-series-dt
#     https://pandas.pydata.org/pandas-docs/stable/reference/series.html#datetime-properties
stamp = ad['Timestamp']

# 3.6 What is the type of 'dt'?
type(stamp.dt)              # pandas.core.indexes.accessors.DatetimeProperties

# 3.7 Extract hour, weekday and month
ad['hourOfDay'] = stamp.dt.hour
ad['weekday']   = stamp.dt.weekday
ad['quarter']   = stamp.dt.month      # Month number for now; it is mapped
                                      #   to a quarter label in a later step

# 3.8 Bucket the hour of day into named parts of the day.
#     Bins are right-inclusive, so hours 0..6 fall in "earlymorning",
#     7..12 in "morning", and so on.
hour_edges  = [-1, 6, 12, 17, 20, 22, 24]
hour_labels = ["earlymorning", "morning", "afternoon", "evening", "night", "latenight"]
ad["hour"] = pd.cut(ad['hourOfDay'], bins=hour_edges, labels=hour_labels)


# 4.0 Similarly for weekdays:
#     translate weekday numbers (0 = Monday ... 6 = Sunday) into names
#     by passing a {number: name} dictionary to Series.map()
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
ad['weekday'] = ad['weekday'].map(dict(enumerate(weekday_names)))

ad['weekday'].head()


In [None]:
# 5.0 We use Series.map() method again but this time instead of supplying
#      a dictionary to dictate transformation, we use a function for
#        transformation
def month(x):
    """Map a calendar month number to its quarter label.

    Parameters
    ----------
    x : number
        Month of the year, 1 (January) to 12 (December).

    Returns
    -------
    str or None
        "Q1" for months 1-3, "Q2" for 4-6, "Q3" for 7-9, "Q4" for 10-12.
        None for any value outside 1..12 (including NaN), so that
        Series.map() leaves such entries missing.
    """
    if 1 <= x <= 3:
        return "Q1"            # Quarter 1: Jan-Mar
    if 4 <= x <= 6:
        return "Q2"            # Quarter 2: Apr-Jun
    if 7 <= x <= 9:
        return "Q3"            # Quarter 3: Jul-Sep
    if 10 <= x <= 12:
        return "Q4"            # Quarter 4: Oct-Dec
    return None                # Not a valid month number

# Derive the quarter from the timestamp's month rather than from the
# 'quarter' column itself, so that this cell can be re-run safely:
# mapping an already-mapped column would compare strings such as "Q1"
# with integers inside month() and raise a TypeError.
# month can be passed to map() directly; no lambda wrapper is needed.
ad['quarter'] = ad['Timestamp'].dt.month.map(month)   # Which quarter clicked

ad['quarter'].head()

# 5.1 So finally what are col names?
ad.columns.values
ad.shape               # (1000, 20)  Earlier shape was (1000, 10);
                       #   10 derived columns have been added since

# 5.2 Let us rename some columns; remove spaces
new_col_names = {
    'Daily Time Spent on Site' : 'DailyTimeSpentonSite',
    'Area Income'              : 'AreaIncome',
    'Daily Internet Usage'     : 'DailyInternetUsage',
    'Clicked on Ad'            : 'Clicked_on_ad',
    'Male'                     : 'Gender',
}

# 5.2.1 columns= applies the mapping to column labels
#       (same as passing the mapping positionally with axis = 1)
ad.rename(columns=new_col_names, inplace=True)

ad.head()
ad.columns.values


In [None]:
##################
# 5 Plotting
##################

# Question 1: How is Age distributed?
# Question 2: How is DailyTimeSpentonSite distributed
# Question 3: How is AreaIncome distributed

# 5.1 Distribution of each continuous value using px.histogram()

# 5.1.1 Age is slightly skewed to the right. Naturally density of younger
#       persons is high
#       Ref: Examples: https://plotly.com/python/histograms/
#            API:   https://plotly.github.io/plotly.py-docs/generated/plotly.express.histogram.html
#       Note: plotly express does not use matplotlib as backend, so
#             plt.figure() / plt.subplot() are not applicable here.
px.histogram(ad, x='Age')


In [None]:
# 5.1.1 Use a style-template
#       Available themes: ["plotly", "plotly_white", "plotly_dark",
#                          "ggplot2", "seaborn", "simple_white", "none"]
#       https://plotly.com/python/templates/
#       Optional: width = ..., height = ...  (figure size in pixels, >= 10)
px.histogram(
    ad,
    x='Age',
    nbins=20,
    template="plotly_dark",
)


In [None]:
# 5.1.2 Show boxplot in the margins
#       marginal can be one of 'rug', 'box', 'violin', or 'histogram'
px.histogram(ad, x='Age', marginal='box')

In [None]:
# 5.1.3 Height of each bar depends upon another feature:
#       histfunc is applied to the y values falling in each x bin.
#       histfunc is one of 'count', 'sum', 'avg', 'min', or 'max'
px.histogram(
    ad,
    x="Age",
    y="DailyInternetUsage",
    histfunc="avg",
)

In [None]:
# 5.1.4 Conditional histogram--Conditioned upon target column
#       (color is the conditional column; in seaborn it is hue)
px.histogram(
    ad,
    x='Age',
    color='Clicked_on_ad',
    marginal='violin',
    opacity=0.2,
)

In [None]:
# 5.1.5 Conditional histogram--Conditioned upon target column
#       and further subdivided by gender
px.histogram(
    ad,
    x='Age',
    color='Clicked_on_ad',
    facet_row='Gender',       # In seaborn it is simply rows and cols
    opacity=0.2,              # In matplotlib it is alpha
)

In [None]:
# 5.1.6 Conditional histogram--Conditioned upon target column
#       and further subdivided by gender (rows) and by quarter (columns)
px.histogram(
    ad,
    x='Age',
    color='Clicked_on_ad',
    facet_row='Gender',
    facet_col='quarter',
    opacity=0.2,
    labels={'Age' : "age of persons"},    # Replace X-axis label
)

In [None]:
# 5.1.7 Conditional histogram--Conditioned upon target column
#       and further subdivided by gender and also by quarter;
#       this time the number of bins is changed as well
px.histogram(
    ad,
    x='Age',
    nbins=20,
    color='Clicked_on_ad',
    facet_row='Gender',
    facet_col='quarter',
    opacity=0.2,
    labels={'Age' : "age of persons"},
)

In [None]:
# Question 4: Show joint distribution of DailyTimeSpentonSite and AreaIncome
# Question 5: Show joint distribution of DailyInternetUsage and DailyTimeSpentonSite
# Question 6: Show these plots as kernel density as also 'hex' as also
#             draw regression line
#
# A jointplot = Scatterplot + Density plots


In [None]:
# 6.0 Density contours: the innermost contour shows the most counts
#     Examples: https://plotly.com/python/2d-histogram-contour/
#     API:      https://plotly.com/python-api-reference/generated/plotly.express.density_contour.html
px.density_contour(ad, x='DailyTimeSpentonSite', y='AreaIncome')

In [None]:
# 6.1 Filling density contours
#     Every plotting command returns a figure, which can be used
#     subsequently to modify plot properties.
#     This is unlike matplotlib where an axes is returned.
fig = px.density_contour(ad, x='DailyTimeSpentonSite', y='AreaIncome')
fig.update_traces(contours_coloring="fill", contours_showlabels=True)
# fig.show() is needed at times. For matplotlib, the corresponding command is plt.show()

In [None]:
# 6.2 Using an aggregation function other than 'count'.
#     Such a function can only be applied on the z-axis:
#     group by x and y, and show average(z).
fig = px.density_contour(
    ad,
    x='DailyTimeSpentonSite',
    y='AreaIncome',
    z='DailyInternetUsage',
    histfunc='avg',
)
fig.update_traces(contours_coloring="fill", contours_showlabels=True)

In [None]:
# 6.3 Conditional contour plotting (one set of contours per gender).
#     Unfortunately plotly express cannot fill it
px.density_contour(
    ad,
    x='DailyTimeSpentonSite',
    y='AreaIncome',
    color='Gender',
)

In [None]:
# 6.4 Conditional contour plotting together with marginal plots
px.density_contour(
    ad,
    x='DailyTimeSpentonSite',
    y='AreaIncome',
    color='Gender',
    marginal_x='box',
    marginal_y='rug',
)

In [None]:
# 6.5 Changing width/height (both given in pixels) and adding a title
px.density_contour(
    ad,
    x='DailyTimeSpentonSite',
    y='AreaIncome',
    color='Gender',
    marginal_x='box',
    marginal_y='rug',
    title="Contour plots",
    width=600,
    height=600,
)

In [None]:
# 6.6 Faceting by category: one filled contour panel per gender
fig = px.density_contour(
    ad,
    x='DailyTimeSpentonSite',
    y='AreaIncome',
    facet_col='Gender',
)
fig.update_traces(contours_coloring="fill", contours_showlabels=True)


In [None]:
# 6.7 Faceting by category, column-wise (gender) as also row-wise (target)
fig = px.density_contour(
    ad,
    x='DailyTimeSpentonSite',
    y='AreaIncome',
    facet_row='Clicked_on_ad',
    facet_col='Gender',
)

fig.update_traces(contours_coloring="fill", contours_showlabels=True)


In [None]:
# 7.0 Conditional Violin plots
#     Ref:   https://plotly.com/python/violin/
#        A violin plot is a method of plotting numeric data.
#        It is similar to a box plot, with the addition of a
#        rotated kernel density plot on each side. It is particularly
#        useful when the data distribution is multimodal (more than
#        one peak). In this case a violin plot shows the presence of
#        different peaks, their position and relative amplitude.

fig = px.violin(
    ad,
    x="Clicked_on_ad",
    y="DailyInternetUsage",
    color="Gender",
    box=True,                 # Draw a box plot inside each violin
    points="all",             # Show every observation as a point
    hover_data=ad.columns,    # All columns appear in the hover tooltip
)
fig.show()

In [None]:
# 7.1  Heatmap. Intensity of colour indicates counts
#      Note that both X and Y axis are first binned
#      Ref: https://plotly.com/python-api-reference/generated/plotly.express.density_heatmap.html#plotly.express.density_heatmap

px.density_heatmap(ad, x='DailyTimeSpentonSite', y='AreaIncome')

In [None]:
# 7.2  Heatmap with an explicit number of bins along each axis.
#      Intensity of colour indicates counts
#      Ref: https://plotly.com/python-api-reference/generated/plotly.express.density_heatmap.html#plotly.express.density_heatmap

px.density_heatmap(
    ad,
    x='DailyTimeSpentonSite',
    y='AreaIncome',
    nbinsx=20,               # Bins along X axis
    nbinsy=30,               # Bins along Y axis
)

In [None]:
# 7.3  Heatmap. Intensity of colour indicates average of z
#      Ref: https://plotly.com/python-api-reference/generated/plotly.express.density_heatmap.html#plotly.express.density_heatmap
#      Inbuilt color_continuous_scale: https://plotly.com/python/builtin-colorscales/

px.density_heatmap(
    ad,
    x='DailyTimeSpentonSite',
    y='AreaIncome',
    z='DailyInternetUsage',             # histfunc of this is intensity of colour
    histfunc='avg',
    color_continuous_scale='icefire',   # A named built-in colour scale
)

In [None]:
# 8.0 Bar charts
# 8.1 Small toy frame: 60 rows, two names, a numeric value and a 0/1 flag
df = pd.DataFrame(
    {
        'name'   : ['a', 'b', 'b'] * 20,
        'value1' : [1.5, 2.5, 3.5] *20,
        'yesno'  : [0, 1, 1, 0] * 15,
    }
)
df.head()
# 8.2 Total of value1 per name, with 'name' back as an ordinary column
hx = df.groupby('name')['value1'].sum().reset_index()
hx

In [None]:
# 8.3 Simplest bar chart from grouped data
px.bar(hx, x='name', y='value1')

# 8.4 Barchart from the ungrouped dataframe
px.bar(df, x='name', y='value1')

# 8.5 Stacked bar chart (or sort of conditional bar chart).
#     Note use of 'histogram'
px.histogram(
    df,
    x='name',
    y='value1',
    color='yesno',
    histfunc='count',
)

In [None]:
##################### More To be done ####################
# 9.0 Bar chart--Univariate
#     https://plotly.com/python/bar-charts/
#     Ref: https://plotly.com/python-api-reference/generated/plotly.express.bar.html
px.bar(
    ad,
    x='age_cat',
    y='DailyInternetUsage',
    template='plotly_white',
)

In [None]:
# Stacked bar chart: number of rows per age category,
# stacked by whether the ad was clicked.
# The x-axis label is set to describe age_cat; the earlier label
# 'Characters' did not describe the column being plotted.
px.histogram(data_frame=ad
             ,x="age_cat"
             ,y="area_income_cat"
             ,color="Clicked_on_ad"
             ,histfunc="count"
             ,labels={'age_cat':'Age category'}
             ,template='plotly_white'
            )

In [None]:
###########################################################