## Data Visualization using Plotly

## 1. Introduction to Plotly

<p>

    1. Until now we did visualisations using Matplotlib, Seaborn and Pandas. All of them produce
    static image files.<br><br>
    2. Plotly is company based out in Canada famous for it's products like Plotly and Dash<br><br>
    3. Plotly creates interactive visualisations in the form of HTML files<br><br>
    4. Drawback- can't work with a live data source<br><br>
</p>

#### Datasets
<p>

    1. Using IPL data (matches) and (deliveries)

    2. Importing the libraries
</p>

In [4]:
import numpy as np
import pandas as pd
import plotly.offline as pyo
import plotly.graph_objs as go
import plotly.express as px

Load the both datasets

In [5]:
matches = pd.read_csv('/content/matches (1).csv')
delivery = pd.read_csv('/content/deliveries (2).csv')

In [6]:
matches.head(2)

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,2017,Pune,2017-04-06,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,


In [7]:
delivery.head(2)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


Merging the both datasets

In [8]:
ipl = delivery.merge(matches, left_on='match_id', right_on='id')
ipl.head(2)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,...,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,...,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,


In [9]:
ipl.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs',
       'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
       'batsman_runs', 'extra_runs', 'total_runs', 'player_dismissed',
       'dismissal_kind', 'fielder', 'id', 'season', 'city', 'date', 'team1',
       'team2', 'toss_winner', 'toss_decision', 'result', 'dl_applied',
       'winner', 'win_by_runs', 'win_by_wickets', 'player_of_match', 'venue',
       'umpire1', 'umpire2', 'umpire3'],
      dtype='object')

In [10]:
ipl.shape

(20736, 39)

## Scatter Plot with Plotly

<p>

    Scatter plots are drawn between to continous variables
    Problem :- We are going to draw a scatter plot between Batsman Avg(X axis) and
    Batsman Strike Rate(Y axis) of the top 50 batsman in IPL(All time)
</p>

In [11]:
# Avg vs SR graph of top 50 bastman

top50 = ipl.groupby('batsman')['batsman_runs'].sum().sort_values(ascending=False).head(50).index.tolist()

new_ipl = ipl[ipl['batsman'].isin(top50)]
new_ipl.shape

(13543, 39)

In [12]:
# Calculating the Strike_rate (SR = (num of runs / num of ball played)*100)

runs = new_ipl.groupby('batsman')['batsman_runs'].sum()
balls = new_ipl.groupby('batsman')['batsman_runs'].count()

In [13]:
sr = ((runs/balls)*100).reset_index()
sr.rename(columns={'batsman':'batsman', 'batsman_runs':'batsman_sr'}, inplace=True)
sr

Unnamed: 0,batsman,batsman_sr
0,AB de Villiers,124.64455
1,AC Gilchrist,147.368421
2,AJ Finch,157.894737
3,AM Rahane,111.239193
4,AR Patel,132.748538
5,BA Stokes,139.207048
6,BB McCullum,157.275542
7,CA Lynn,168.571429
8,CH Gayle,116.27907
9,DA Warner,138.744589


In [14]:
# Now calculating the average run_rate of the batsman (avg = (total runs/num of outs))
# Calculating the num of outs of top 50 batsman

out = ipl[ipl['player_dismissed'].isin(top50)]
nout = out['player_dismissed'].value_counts()

In [15]:
avg = (runs/nout).reset_index()
avg = avg.rename(columns={'index':'batsman', 0:'average'})
avg

Unnamed: 0,batsman,average
0,AB de Villiers,26.3
1,AC Gilchrist,51.333333
2,AJ Finch,25.0
3,AM Rahane,22.705882
4,AR Patel,28.375
5,BA Stokes,31.6
6,BB McCullum,36.285714
7,CA Lynn,49.166667
8,CH Gayle,22.222222
9,DA Warner,58.272727


In [16]:
avg = avg.merge(sr, on='batsman')
avg

Unnamed: 0,batsman,average,batsman_sr
0,AB de Villiers,26.3,124.64455
1,AC Gilchrist,51.333333,147.368421
2,AJ Finch,25.0,157.894737
3,AM Rahane,22.705882,111.239193
4,AR Patel,28.375,132.748538
5,BA Stokes,31.6,139.207048
6,BB McCullum,36.285714,157.275542
7,CA Lynn,49.166667,168.571429
8,CH Gayle,22.222222,116.27907
9,DA Warner,58.272727,138.744589


In [17]:
# Plotting the scatter plot
trace = go.Scatter(x=avg['average'], y=avg['batsman_sr'],
                   mode='markers', text=avg['batsman'],
                   marker={'color':'#00a65a', 'size':12})
data = [trace]
layout=go.Layout(title='Batsman Avg vs SR',
                 xaxis={'title':'Batsman Average'},
                 yaxis={'title':'Batsman Strike Rate'})

fig=go.Figure(data=data, layout=layout)

fig.show(filename='myfile.html')

## Line Plot in Plotly

It's an extension of Scatter plot. Usually used to show a time series data

Year by Year batsman performance

In [18]:
single0 = ipl[ipl['batsman']=='JP Duminy']
performance0 = single0.groupby('season')['batsman_runs'].sum().reset_index()

In [19]:
# Line plot

trace0 = go.Scatter(x=performance0['season'], y=performance0['batsman_runs'],
                   mode= 'lines+markers', marker= {'color':'#00a65a'}, name='JP Duminy')

data = [trace0]
layout = go.Layout(title= 'Year by Year Performance',
                   xaxis={'title':'Season'},
                   yaxis={'title':'Total Runs'})

fig = go.Figure(data=data, layout=layout)

fig.show(filename='myfile.html')

In [20]:
# Doul Line Plot

single = ipl[ipl['batsman']=='V Kohli']
performance = single.groupby('season')['batsman_runs'].sum().reset_index()

single1 = ipl[ipl['batsman']=='MS Dhoni']
performance1 = single1.groupby('season')['batsman_runs'].sum().reset_index()

In [21]:
# Line plot

trace = go.Scatter(x=performance['season'], y=performance['batsman_runs'],
                   mode= 'lines+markers', marker= {'color':'#00a65a'}, name='V Kohli')

trace1 = go.Scatter(x=performance1['season'], y=performance1['batsman_runs'],
                    mode= 'lines+markers', name='MS Dhoni')

data = [trace, trace1]
layout = go.Layout(title= 'Year by Year Performance',
                   xaxis={'title':'Season'},
                   yaxis={'title':'Total Runs'})

fig = go.Figure(data=data, layout=layout)

fig.show(filename='myfile.html')

In [22]:
# Multiple Line Charts

def batsman_perf(*name):

    data=[]

    for i in name:
        single= ipl[ipl['batsman']==i]
        performance= single.groupby('season')['batsman_runs'].sum().reset_index()


        trace= go.Scatter(x=performance['season'], y=performance['batsman_runs'],
                   mode= 'lines + markers', name=i)

        data.append(trace)

    layout = go.Layout(title= 'Batsman Performance',
                   xaxis={'title':'Season'},
                   yaxis={'title':'Total Runs'})

    fig=go.Figure(data=data, layout=layout)

    fig.show(filename='year_by_year')

In [23]:
batsman_perf('DA Warner', 'V Kohli', 'SE Marsh', 'MEK Hussey', 'RG Sharma')

## Bar Plot in Plotly

Used to show relation between one categorical and 1 numerical data

In [24]:
top10 = ipl.groupby('batsman')['batsman_runs'].sum().sort_values(ascending=False).head(10).index.tolist()
top10_df = ipl[ipl['batsman'].isin(top10)]

top10_score = top10_df.groupby('batsman')['batsman_runs'].sum().reset_index()
top10_score

Unnamed: 0,batsman,batsman_runs
0,BB McCullum,508.0
1,DA Warner,641.0
2,G Gambhir,803.0
3,MS Dhoni,519.0
4,PA Patel,491.0
5,RG Sharma,551.0
6,RV Uthappa,599.0
7,S Dhawan,685.0
8,SK Raina,618.0
9,SPD Smith,472.0


In [25]:
# Bar Plot Graph

trace= go.Bar(x=top10_score['batsman'], y=top10_score['batsman_runs'])

data=[trace]

layout= go.Layout(title='Top 10 IPL Batsman',
                  xaxis={'title':'Batsman'},
                  yaxis={'title':'Total Runs'})

fig= go.Figure(data=data, layout=layout)

fig.show()

#### There are other types of Bar Graph, Nested Bar Graph and Stacked Bar Graph, Overlayed Bar Graph

In [26]:
iw = top10_df.groupby(['batsman', 'inning'])['batsman_runs'].sum().reset_index()
mask = iw['inning']==1
mask2 = iw['inning']==2
one=iw[mask]
two=iw[mask2]

one.rename(columns={'batsman_runs': '1st Inning'}, inplace=True)
two.rename(columns={'batsman_runs': '2nd Inning'}, inplace=True)

final=one.merge(two, on='batsman')[['batsman', '1st Inning', '2nd Inning']]

final



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,batsman,1st Inning,2nd Inning
0,BB McCullum,339.0,168.0
1,DA Warner,424.0,217.0
2,G Gambhir,301.0,502.0
3,MS Dhoni,313.0,206.0
4,PA Patel,154.0,337.0
5,RG Sharma,331.0,220.0
6,RV Uthappa,232.0,367.0
7,S Dhawan,457.0,228.0
8,SK Raina,370.0,248.0
9,SPD Smith,244.0,228.0


In [27]:
# overlay
trace1= go.Bar(x=final['batsman'], y=final['1st Inning'], name='1st Inning',
               marker={'color':'#754833'})

trace2= go.Bar(x=final['batsman'], y=final['2nd Inning'], name='2nd Inning',
               marker={'color':'#a6a65a'})

data=[trace1, trace2]

layout= go.Layout(title='Inning-wise Scores',
                  xaxis={'title':'Batsman'},
                  yaxis={'title':'Runs'},
                  barmode='overlay')

fig= go.Figure(data=data, layout=layout)

fig.show()

In [28]:
# stacked
trace1= go.Bar(x=final['batsman'], y=final['1st Inning'], name='1st Inning',
               marker={'color':'#754833'})

trace2= go.Bar(x=final['batsman'], y=final['2nd Inning'], name='2nd Inning',
               marker={'color':'#a6a65a'})

data=[trace1, trace2]

layout= go.Layout(title='Inning-wise Scores',
                  xaxis={'title':'Batsman'},
                  yaxis={'title':'Runs'},
                  barmode='stack')

fig= go.Figure(data=data, layout=layout)

fig.show()

In [29]:
# Nested
trace1= go.Bar(x=final['batsman'], y=final['1st Inning'], name='1st Inning',
               marker={'color':'#754833'})

trace2= go.Bar(x=final['batsman'], y=final['2nd Inning'], name='2nd Inning',
               marker={'color':'#a6a65a'})

data=[trace1, trace2]

layout= go.Layout(title='Inning-wise Scores',
                  xaxis={'title':'Batsman'},
                  yaxis={'title':'Runs'})

fig= go.Figure(data=data, layout=layout)

fig.show()

## Bubble Plot

Again an extension of Scatter plot. with some additional informations

In [30]:
new_ipl= new_ipl[new_ipl['batsman_runs']==6]
six= new_ipl.groupby('batsman')['batsman_runs'].count().reset_index()
x=avg.merge(six, on='batsman')
x

Unnamed: 0,batsman,average,batsman_sr,batsman_runs
0,AB de Villiers,26.3,124.64455,17
1,AC Gilchrist,51.333333,147.368421,18
2,AJ Finch,25.0,157.894737,19
3,AM Rahane,22.705882,111.239193,9
4,AR Patel,28.375,132.748538,11
5,BA Stokes,31.6,139.207048,15
6,BB McCullum,36.285714,157.275542,33
7,CA Lynn,49.166667,168.571429,19
8,CH Gayle,22.222222,116.27907,14
9,DA Warner,58.272727,138.744589,26


In [31]:
trace=go.Scatter(x=x['average'], y=x['batsman_sr'], mode='markers',
                 marker=dict(size=x['batsman_runs'],
                             color=x['average']))

data=[trace]

layout= go.Layout(title='Bubble chart',
                  xaxis={'title':'Average'},
                  yaxis={'title':'SR'})

fig= go.Figure(data=data, layout=layout)

fig.show()

## Box plot in plotly

A box and whisker plot—also called a box plot—displays the five-number summary of a set of data.

In [32]:
match_agg = delivery.groupby(['match_id'])['total_runs'].sum().reset_index()
season_wise=match_agg.merge(matches, left_on='match_id', right_on='id')

season_wise[['match_id', 'total_runs', 'season']]

Unnamed: 0,match_id,total_runs,season
0,1,379.0,2017
1,2,371.0,2017
2,3,367.0,2017
3,4,327.0,2017
4,5,299.0,2017
...,...,...,...
84,85,292.0,2008
85,86,207.0,2008
86,87,375.0,2008
87,88,253.0,2008


In [33]:
trace=go.Box(x=season_wise['total_runs'], name="All Seasons", marker={'color':'#00a65a'})

data=[trace]

layout=go.Layout(title='Tital Score Analysis',
                 xaxis={'title':'Total Score'})

fig=go.Figure(data=data, layout=layout)

fig.show()

In [34]:
trace1=go.Box(x=season_wise[season_wise['season']==2017]['total_runs'], name="2017", marker={'color':'#00a65a'})

trace2=go.Box(x=season_wise[season_wise['season']==2008]['total_runs'], name="2008")

data=[trace1, trace2]

layout=go.Layout(title='Tital Score Analysis',
                 xaxis={'title':'Total Score'})

fig=go.Figure(data=data, layout=layout)

fig.show()

## Dist Plot

A histogram is a plot that lets you discover, and show, the underlying frequency distribution (shape) of a set of continuous data.

In [35]:
import plotly.figure_factory as ff

In [36]:
avg

Unnamed: 0,batsman,average,batsman_sr
0,AB de Villiers,26.3,124.64455
1,AC Gilchrist,51.333333,147.368421
2,AJ Finch,25.0,157.894737
3,AM Rahane,22.705882,111.239193
4,AR Patel,28.375,132.748538
5,BA Stokes,31.6,139.207048
6,BB McCullum,36.285714,157.275542
7,CA Lynn,49.166667,168.571429
8,CH Gayle,22.222222,116.27907
9,DA Warner,58.272727,138.744589


In [37]:
hist_data = [avg['average']]
group_labels=['Average']

fig=ff.create_distplot(hist_data, group_labels)

fig.show()

In [38]:
# multiple distplots

hist_data = [avg['average'], avg['batsman_sr']]
group_labels=['Average', 'Strike_rate']

fig=ff.create_distplot(hist_data, group_labels, bin_size=[10, 20])

fig.show()

## Histogram

A Histogram is a plot that discover and show the underlying frequency distribution (shape) of a set of continous data

In [43]:
# Histogram
x = delivery.groupby('batsman')['batsman_runs'].count()>150
x =x[x].index.tolist()

new=delivery[delivery['batsman'].isin(x)]

runs=new.groupby('batsman')['batsman_runs'].sum()
balls=new.groupby('batsman')['batsman_runs'].count()

sr=(runs/balls)*100

sr=sr.reset_index().rename(columns={'batsman':'batsman', 'batsman_runs':'batsman_sr'})
sr

Unnamed: 0,batsman,batsman_sr
0,AB de Villiers,124.64455
1,AC Gilchrist,147.368421
2,AJ Finch,157.894737
3,AM Rahane,111.239193
4,AR Patel,132.748538
5,BA Stokes,139.207048
6,BB McCullum,157.275542
7,CA Lynn,168.571429
8,CH Gayle,116.27907
9,DA Warner,138.744589


In [47]:
trace= go.Histogram(x=sr['batsman_sr'], xbins={'size':5}) # a parameter called ('start':50, 'end':100) start from and end with

data=[trace]

layout=go.Layout(title='Strike Rate Analysis',
                 xaxis={'title':'Strike Rates'})

fig=go.Figure(data=data, layout=layout)

fig.show()

## Heatmaps

A heat map is a graphical representation of data where the individual values contained in a matrix are represented as colors

In [48]:
# Heatmaps

six=delivery[delivery['batsman_runs']==6]
six=six.groupby(['batting_team', 'over'])['batsman_runs'].count().reset_index()
six

Unnamed: 0,batting_team,over,batsman_runs
0,Chennai Super Kings,3,1
1,Chennai Super Kings,6,2
2,Chennai Super Kings,7,2
3,Chennai Super Kings,8,2
4,Chennai Super Kings,9,2
...,...,...,...
198,Sunrisers Hyderabad,16,1
199,Sunrisers Hyderabad,17,3
200,Sunrisers Hyderabad,18,5
201,Sunrisers Hyderabad,19,5


In [49]:
# plotting the heatmap

trace=go.Heatmap(x=six['batting_team'], y=six['over'], z=six['batsman_runs'])

data=[trace]

layout=go.Layout(title='Sixer heatmap')

fig=go.Figure(data=data, layout=layout)

fig.show()

In [50]:
dots=delivery[delivery['batsman_runs']==0]
dots=six.groupby(['batting_team', 'over'])['batsman_runs'].count().reset_index()
dots

Unnamed: 0,batting_team,over,batsman_runs
0,Chennai Super Kings,3,1
1,Chennai Super Kings,6,1
2,Chennai Super Kings,7,1
3,Chennai Super Kings,8,1
4,Chennai Super Kings,9,1
...,...,...,...
198,Sunrisers Hyderabad,16,1
199,Sunrisers Hyderabad,17,1
200,Sunrisers Hyderabad,18,1
201,Sunrisers Hyderabad,19,1


In [53]:
from plotly import tools

trace1=go.Heatmap(x=six['batting_team'], y=six['over'], z=six['batsman_runs'].values.tolist())

trace2=go.Heatmap(x=dots['batting_team'], y=dots['over'], z=dots['batsman_runs'].values.tolist())

fig=tools.make_subplots(rows=1, cols=2, subplot_titles=["6's", "0's"], shared_yaxes=True)

fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)

fig.show()