# Using plotly
## Scatterplots

In [1]:
# Scatterplots let us look for some form of correlation between two variables

In [1]:
import numpy as np
import plotly.offline as pyo
import plotly.graph_objs as go

In [3]:
# Reproducible demo data: seed NumPy's global RNG, then draw two independent
# sets of 100 integers from 1 to 100 (the upper bound 101 is exclusive).
np.random.seed(42)
random_x = np.random.randint(low=1, high=101, size=100)
random_y = np.random.randint(low=1, high=101, size=100)

In [14]:
# Plotly workflow: collect the traces in a list called `data`, describe the
# chart chrome in a Layout, combine both in a Figure, and pass that to plot().
marker_style = dict(
    size=12,
    color="rgb(51,204,153)",
    symbol="pentagon",
    line=dict(width=2),  # outline drawn around each marker
)
data = [go.Scatter(x=random_x, y=random_y, mode="markers", marker=marker_style)]

# Nested properties accept either a dict literal or a dict(...) call.
layout = go.Layout(
    title="Hello First plot",
    xaxis=dict(title="MY X AXIS"),
    yaxis=dict(title="MY Y AXIS"),
    hovermode="closest",
)

# Plot the complete Figure exactly once. Plotting the bare `data` list to the
# same filename would only produce a layout-less file that this call overwrites.
fig = go.Figure(data=data, layout=layout)
pyo.plot(fig, filename="scatter.html")  # saves the plot as scatter.html

'scatter.html'

## Line Charts
- Displays a series of data points (markers) joined by line segments
- x-value is ordered in some way

In [24]:
# Seeded noise series plotted three ways against 100 evenly spaced x values.
np.random.seed(56)
x_values = np.linspace(0, 1, 100)
y_values = np.random.randn(100)

# The same series is shifted by +5 / 0 / -5 so the three modes do not overlap.
trace0 = go.Scatter(x=x_values, y=y_values + 5, mode="markers", name="markers")
trace1 = go.Scatter(x=x_values, y=y_values, mode="lines", name="mylines")
trace2 = go.Scatter(
    x=x_values,
    y=y_values - 5,
    mode="lines+markers",
    name="myfavorite",
)

data = [trace0, trace1, trace2]
layout = go.Layout(title="Line Charts")
fig = go.Figure(data=data, layout=layout)

pyo.plot(fig)

'temp-plot.html'

## Using real data set with pandas

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("nst-est2017-alldata.csv")

# Keep only the rows whose DIVISION equals "1" and use the NAME column as the
# row index. Chaining set_index() returns a fresh frame instead of mutating the
# filtered slice in place, which avoids pandas' SettingWithCopyWarning and
# keeps the cell safe to re-run.
# NOTE(review): the comparison is against the string "1", which presumes the
# DIVISION column was parsed as text - confirm against the CSV.
df2 = df[df["DIVISION"] == "1"].set_index("NAME")
df2

Unnamed: 0_level_0,SUMLEV,REGION,DIVISION,STATE,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,...,RDOMESTICMIG2015,RDOMESTICMIG2016,RDOMESTICMIG2017,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Connecticut,40.0,1,1,9.0,3574097.0,3574114.0,3580171.0,3591927.0,3597705.0,3602470.0,...,-8.376089,-8.106331,-6.206914,0.993851,-0.542726,-0.420268,-2.479362,-3.464252,-3.112421,-1.257548
Maine,40.0,1,1,23.0,1328361.0,1328362.0,1327568.0,1327968.0,1328101.0,1327975.0,...,-0.781424,1.807361,4.032798,0.685361,0.178459,0.392308,1.301528,0.389959,3.000731,5.216532
Massachusetts,40.0,1,1,25.0,6547629.0,6547808.0,6564943.0,6612178.0,6659627.0,6711138.0,...,-3.270088,-4.423353,-3.374712,4.364383,4.266338,5.21137,4.538697,3.276287,2.261905,3.24609
New Hampshire,40.0,1,1,33.0,1316470.0,1316460.0,1316700.0,1318345.0,1320923.0,1322622.0,...,-0.850002,1.333509,3.500622,-0.403029,0.632751,0.061281,3.793602,0.852258,3.037729,5.170643
Rhode Island,40.0,1,1,44.0,1052567.0,1052945.0,1053169.0,1052154.0,1052761.0,1052784.0,...,-4.21851,-4.093718,-3.640649,-2.170688,-0.991964,-1.139847,1.029624,0.368598,0.526146,0.891742
Vermont,40.0,1,1,50.0,625741.0,625741.0,625842.0,626210.0,625606.0,626044.0,...,-3.415672,-3.615938,-1.472321,-0.333852,-1.915617,0.131027,-1.313404,-1.932614,-2.115708,0.024058


In [64]:
# Narrow df2 to the population columns: keep every column whose name starts
# with "POP".
list_of_pop_col = [column for column in df2.columns if column.startswith("POP")]
df2 = df2[list_of_pop_col]

# One line trace per row of df2: x holds the column labels, y holds that row's
# values (df2.loc[state] selects the row by its index label).
data = []
for state in df2.index:
    data.append(go.Scatter(x=df2.columns, y=df2.loc[state], mode="lines", name=state))
pyo.plot(data)

'temp-plot.html'

## Exercise

In [7]:
# Line chart of the hourly average temperature, one line per day.
df = pd.read_csv("2010YumaAZ.csv")
days = list(df["DAY"].unique())
print(df.head())

data = []
for day in days:
    # Each line represents one day, so select that day's rows once and take
    # BOTH axes from them. Using the unfiltered LST_TIME column for x would pair
    # values with times purely by position and misalign any day whose rows are
    # missing or out of order.
    day_rows = df[df["DAY"] == day]
    trace = go.Scatter(
        x=day_rows["LST_TIME"],
        y=day_rows["T_HR_AVG"],
        mode="lines",
        name=day,
    )
    data.append(trace)

layout = go.Layout(title="Daily temp avg")

fig = go.Figure(data=data, layout=layout)
pyo.plot(fig)

   LST_DATE      DAY LST_TIME  T_HR_AVG
0  20100601  TUESDAY     0:00      25.2
1  20100601  TUESDAY     1:00      24.1
2  20100601  TUESDAY     2:00      24.4
3  20100601  TUESDAY     3:00      24.9
4  20100601  TUESDAY     4:00      22.8


'temp-plot.html'

## Exercise

In [79]:
# Same per-day traces as the loop version, written as plain trace dicts.
# x and y are both filtered to the current day so every temperature is paired
# with its own timestamp rather than with the first rows of the whole column.
data = [
    {
        "x": df[df["DAY"] == day]["LST_TIME"],
        "y": df[df["DAY"] == day]["T_HR_AVG"],
        "name": day,
    }
    for day in df["DAY"].unique()
]

# NOTE(review): this cell only builds `data` and `layout`; nothing is plotted here.
layout = go.Layout(title = "Daily temp avgs")

## Bar Charts
- Covers normal, nested (grouped), and stacked bar charts

In [96]:
df = pd.read_csv("2018WinterOlympics.csv")

# Single bar trace: NOC values on the x axis, the Total column as bar heights.
total_bars = go.Bar(x=df["NOC"], y=df["Total"])
data = [total_bars]
layout = go.Layout(title="Medals")
fig = go.Figure(data, layout)
pyo.plot(fig)
df

Unnamed: 0,Rank,NOC,Gold,Silver,Bronze,Total
0,1,Norway,14,14,11,39
1,2,Germany,14,10,7,31
2,3,Canada,11,8,10,29
3,4,United States,9,8,6,23
4,5,Netherlands,8,6,6,20
5,6,Sweden,7,6,1,14
6,7,Republic of Korea,5,8,4,17
7,8,Switzerland,5,6,4,15
8,9,France,5,4,6,15
9,10,Austria,5,3,6,14


In [83]:
# Nested (grouped) bar chart: one trace per medal type, coloured to match.
medal_colors = {"Gold": "#FFD700", "Silver": "#9EA0A1", "Bronze": "#CD7F32"}
trace1, trace2, trace3 = (
    go.Bar(x=df["NOC"], y=df[medal], name=medal, marker={"color": shade})
    for medal, shade in medal_colors.items()
)

data = [trace1, trace2, trace3]
# Add barmode="stack" to the Layout to stack the bars instead.
layout = go.Layout(title="Medals")
fig = go.Figure(data=data, layout=layout)
pyo.plot(fig)

'temp-plot.html'

### Bar chart exercise: Create a stacked bar chart

In [98]:
# Load the mock survey results and print the whole (small) frame.
# The printed output shows the question labels in an "Unnamed: 0" column,
# not in the index.
df = pd.read_csv("mocksurvey.csv")
print(df)

   Unnamed: 0  Strongly Agree  Somewhat Agree  Neutral  Somewhat Disagree  \
0  Question 1            0.45            0.25     0.10               0.12   
1  Question 2            0.12            0.07     0.48               0.18   
2  Question 3            0.05            0.22     0.19               0.23   

   Strongly Disagree  
0               0.08  
1               0.15  
2               0.31  


In [99]:
# The question labels sit in the "Unnamed: 0" column. Move them into the index
# so they become the x categories and are no longer iterated as if they were a
# response column (which would add a meaningless trace of label strings).
survey = df.set_index("Unnamed: 0")

# x holds the categories (one per question), y the values of one response
# column per category. barmode="stack" stacks the response traces per question.
# Passing orientation="h" to go.Bar gives a horizontal bar plot.
data = [
    go.Bar(x=survey.index, y=survey[response], name=response)
    for response in survey.columns
]
layout = go.Layout(title="Survey Results", barmode="stack")

fig = go.Figure(data=data, layout=layout)
pyo.plot(fig)

'temp-plot.html'

## Bubble Charts
- similar to scatter plots except we now convey a third variable's info through size of markers

In [8]:
# Load the mpg dataset and preview its first rows.
# NOTE(review): if the horsepower column contains non-numeric placeholders it
# would load as text and plotly would treat that axis as categorical - confirm
# the column dtype before plotting it.
df = pd.read_csv("mpg.csv")
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [102]:
# Bubble chart trace: marker size is driven by the cylinder count, so cars with
# more cylinders get larger markers.
bubble_sizes = 2 * df["cylinders"]
data = [
    go.Scatter(
        x=df["horsepower"],
        y=df["mpg"],
        text=df["name"],  # hover text: the car's model name
        mode="markers",
        marker=dict(size=bubble_sizes),
    )
]

In [103]:
# Wrap the bubble trace list in a titled figure and render it.
layout = go.Layout(title="bubble chart")
fig = go.Figure(layout=layout, data=data)
pyo.plot(fig)

'temp-plot.html'

In [106]:
# Bubble chart driven by other columns: marker size follows weight and marker
# colour follows the cylinder count.
marker_spec = dict(
    size=df["weight"] / 100,  # scale the weights down to usable marker sizes
    color=df["cylinders"],
    showscale=True,  # display the colour scale next to the plot
)
data = [
    go.Scatter(
        x=df["horsepower"],
        y=df["mpg"],
        text=df["name"],
        mode="markers",
        marker=marker_spec,
    )
]
layout = go.Layout(title="bubble chart")
fig = go.Figure(data=data, layout=layout)
pyo.plot(fig)

'temp-plot.html'

### Task: Create a bubble chart that compares three other features from the mpg.csv dataset.

In [107]:
# Task solution: displacement vs. acceleration, with marker size taken from weight.
weight_sizes = df["weight"] / 400
solution_trace = go.Scatter(
    x=df["displacement"],
    y=df["acceleration"],
    text=df["name"],
    mode="markers",
    marker=dict(size=weight_sizes),
)
data = [solution_trace]
layout = go.Layout(title="Bubble solution", hovermode="closest")
fig = go.Figure(data=data, layout=layout)
pyo.plot(fig)

'temp-plot.html'

## Boxplots

In [120]:
# Small hand-written sample of 20 values used for the box plot below;
# the bare `y` on the last line echoes it as the cell output.
y = [1,14,14,15,16,18,18,19,19,20,20,23,24,26,27,27,28,29,33,54]
y

[1, 14, 14, 15, 16, 18, 18, 19, 19, 20, 20, 23, 24, 26, 27, 27, 28, 29, 33, 54]

In [122]:
# boxpoints="all" shows every data point, jitter spreads the points out so they
# all appear, and pointpos=0 puts them right on top of the box (positive values
# place them to the right of the box).
box_trace = go.Box(y=y, boxpoints="all", jitter=0.3, pointpos=0)
data = [box_trace]
pyo.plot(data)

'temp-plot.html'

In [124]:
# Plot two samples side by side, one box per sample.
snodgrass = [0.209, 0.205, 0.196, 0.210, 0.202, 0.207, 0.224, 0.223, 0.220, 0.201]
twain = [0.225, 0.262, 0.217, 0.240, 0.230, 0.229, 0.235, 0.217]

named_samples = {"Snodgrass": snodgrass, "Twain": twain}
data = [go.Box(y=values, name=label) for label, values in named_samples.items()]
pyo.plot(data)

'temp-plot.html'

### Exercise: Build a DataFrame from the abalone dataset and take two independent random samples of different sizes from the `rings` field. Use box plots to show that the samples derive from the same population.

In [134]:
df = pd.read_csv("abalone.csv")

# Seed the global RNG first so both samples (and the box plots built from them)
# are identical on every Restart & Run All, as in the other seeded cells.
np.random.seed(42)

# Two samples of different sizes drawn from the rings column;
# replace=False means no value position is picked twice within a sample.
samp1 = np.random.choice(df["rings"], 30, replace=False)
samp2 = np.random.choice(df["rings"], 20, replace=False)

In [135]:
# One box per sample so their distributions can be compared side by side.
sample_boxes = (("Sample1", samp1), ("Sample2", samp2))
data = [go.Box(y=values, name=label) for label, values in sample_boxes]
pyo.plot(data)

'temp-plot.html'

## Histograms
- Histogram is good for representing interval data (data with a range)

In [3]:
import pandas as pd

In [5]:
# Reload the mpg dataset; `df` was last bound to a different CSV further up.
df = pd.read_csv("mpg.csv")

In [9]:
# Histogram of mpg. xbins controls the binning: bins start at 0, `size` is the
# width of each bin (10), and end=25 is the requested upper limit.
mpg_bins = dict(start=0, end=25, size=10)
data = [go.Histogram(x=df["mpg"], xbins=mpg_bins)]

layout = go.Layout(title="Histogram")
fig = go.Figure(data=data, layout=layout)
pyo.plot(fig)

'temp-plot.html'

## Histogram Exercise
- Create histogram that plots length field from Abalone dataset
- Set the range from 0 to 1 with a bin size of 0.02

In [13]:
# Load the abalone dataset and print its first rows to see the available columns.
df = pd.read_csv("abalone.csv")
print(df.head())

  sex  length  diameter  height  whole_weight  shucked_weight  viscera_weight  \
0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010   
1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485   
2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415   
3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140   
4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395   

   shell_weight  rings  
0         0.150     15  
1         0.070      7  
2         0.210      9  
3         0.155     10  
4         0.055      7  


In [14]:
# Exercise solution: histogram of the length column, binned from 0 to 1 in
# steps of 0.02.
length_bins = dict(start=0, end=1, size=0.02)
data = [go.Histogram(x=df["length"], xbins=length_bins)]
layout = go.Layout(title="Histogram")
figure = go.Figure(data=data, layout=layout)
pyo.plot(figure)

'temp-plot.html'

## Distribution plots

In [15]:
import plotly.figure_factory as ff

In [16]:
# 200 draws from the standard normal distribution. Seeding first makes the
# sample (and the distplot built from it) identical on every re-run.
np.random.seed(42)
x = np.random.randn(200)

In [17]:
# create_distplot is given a list of samples plus one label per sample.
group_labels = ["distplot"]
hist_data = [x]

fig = ff.create_distplot(hist_data, group_labels)
pyo.plot(fig)

'temp-plot.html'

In [18]:
# Four standard-normal samples of 200 draws each, shifted by -2, 0, +2 and +4.
# Seeding first keeps the samples the same on every re-run.
np.random.seed(56)
x1 = np.random.randn(200) - 2
x2 = np.random.randn(200)
x3 = np.random.randn(200) + 2
x4 = np.random.randn(200) + 4

In [22]:
hist_data = [x1, x2, x3, x4]
group_labels = ["X1", "X2", "X3", "X4"]

# One bin size per dataset, listed in the same order as hist_data.
bin_sizes = [.2, .1, .3, .4]
fig = ff.create_distplot(hist_data, group_labels, bin_size=bin_sizes)

pyo.plot(fig)

'temp-plot.html'

### More realistic data

In [23]:
# Distribution plot of the two hand-entered samples, using the same bin size
# (0.005) for both groups.
snodgrass = [0.209, 0.205, 0.196, 0.210, 0.202, 0.207, 0.224, 0.223, 0.220, 0.201]
twain = [0.225, 0.262, 0.217, 0.240, 0.230, 0.229, 0.235, 0.217]

hist_data = [snodgrass, twain]
group_labels = ["Snodgrass Writings", "Mark Twain Writings"]
bin_widths = [0.005, 0.005]

fig = ff.create_distplot(hist_data, group_labels, bin_size=bin_widths)
pyo.plot(fig)

'temp-plot.html'

## Dist plot exercise
- Dist plot of petal length for each class