In [1]:
import plotly.graph_objs as go
import chart_studio.plotly as py

# Cufflinks wrapper on plotly
import cufflinks

# Data science imports
import pandas as pd
import numpy as np

# Options for pandas: show up to 30 columns when a DataFrame is displayed
pd.options.display.max_columns = 30

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell

# "all" makes every expression statement in a cell display its value,
# not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from plotly.offline import iplot

# Switch cufflinks to offline plotting
cufflinks.go_offline()

# Set global theme
# NOTE(review): world_readable=True looks like a sharing setting for hosted
# plots — confirm it is intended for this notebook
cufflinks.set_config_file(world_readable=True, theme="pearl")

In [3]:
# Location of the parquet file read into `df`
MEDIUM_DATA_URL = (
    "https://github.com/WillKoehrsen/Data-Analysis/blob/master/"
    "plotly/data/medium_data_2019_01_06?raw=true"
)

df = pd.read_parquet(MEDIUM_DATA_URL)
df.head()

Unnamed: 0,claps,days_since_publication,fans,link,num_responses,publication,published_date,read_ratio,read_time,reads,started_date,tags,text,title,title_word_count,type,views,word_count,claps_per_word,editing_days,<tag>Education,<tag>Data Science,<tag>Towards Data Science,<tag>Machine Learning,<tag>Python
119,2,574.858594,2,https://medium.com/p/screw-the-environment-but...,0,,2017-06-10 14:25:00,41.98,7,68,2017-06-10 14:24:00,"[Climate Change, Economics]","Screw the Environment, but Consider Your Walle...","Screw the Environment, but Consider Your Wallet",8,published,162,1859,0.001076,0,0,0,0,0,0
118,18,567.540639,3,https://medium.com/p/the-vanquishing-of-war-pl...,0,,2017-06-17 22:02:00,32.93,14,54,2017-06-17 22:02:00,"[Climate Change, Humanity, Optimism, History]","The Vanquishing of War, Plague and Famine Part...","The Vanquishing of War, Plague and Famine",8,published,164,3891,0.004626,0,0,0,0,0,0
121,50,554.920762,19,https://medium.com/p/capstone-project-mercedes...,0,,2017-06-30 12:55:00,20.19,42,215,2017-06-30 12:00:00,"[Machine Learning, Python, Udacity, Kaggle]",Capstone Project: Mercedes-Benz Greener Manufa...,Capstone Project: Mercedes-Benz Greener Manufa...,7,published,1065,12025,0.004158,0,0,0,0,1,1
122,0,554.07816,0,https://medium.com/p/home-of-the-scared-5af0fe...,0,,2017-07-01 09:08:00,35.85,9,19,2017-06-30 18:21:00,"[Politics, Books, News, Media Criticism]",Home of the Scared A review of A Culture of Fe...,Home of the Scared,4,published,53,2533,0.0,0,0,0,0,0,0
114,0,550.090507,0,https://medium.com/p/the-triumph-of-peace-f485...,0,,2017-07-05 08:51:00,8.77,14,5,2017-07-03 20:18:00,"[Books, Psychology, History, Humanism]",The Triumph of Peace A review of The Better An...,The Triumph of Peace,4,published,57,3892,0.0,1,0,0,0,0,0


In [4]:
# iplot handles the plotting; show box select, lasso select, download
# Histogram of the "claps" column in 30 bins
df["claps"].iplot(
    kind="hist",
    bins=30,
    xTitle="claps",
    linecolor="black",
    yTitle="count",
    title="Claps Distribution",
)

In [5]:
# Histogram of the "reads" column in 30 bins, normalised to percentages
# (histnorm="percent"). The plotted column must be "reads" so the data
# matches the axis label and the "Reads Distribution" title.
df["reads"].iplot(
    kind="hist",
    bins=30,
    xTitle="reads",
    linecolor="black",
    histnorm="percent",
    yTitle="percentage (%)",
    title="Reads Distribution in Percent",
)

In [6]:
def to_time(dt):
    """Convert a datetime-like value to a fractional hour of the day.

    Args:
        dt: Object exposing ``hour`` and ``minute`` attributes.

    Returns:
        ``dt.hour + dt.minute / 60``; seconds are not taken into account.
    """
    minutes_as_hours = dt.minute / 60
    return dt.hour + minutes_as_hours


In [7]:
# Fractional hour of day at which each article was started / published
df["time_published"] = df["published_date"].map(to_time)
df["time_started"] = df["started_date"].map(to_time)

# Side-by-side (barmode="group") percentage histograms of both columns
df[["time_started", "time_published"]].iplot(
    kind="hist",
    bins=24,
    histnorm="percent",
    barmode="group",
    bargap=0.1,
    opacity=0.8,
    linecolor="black",
    title="Time Started and Time Published",
    xTitle="Time of Day",
    yTitle="(%) of Articles",
)

In [8]:
# Overlaid (barmode="overlay") percentage histograms of the two time-of-day
# columns, 24 bins each
df[["time_published", "time_started"]].iplot(
    kind="hist",
    bins=24,
    linecolor="black",
    opacity=0.8,
    histnorm="percent",
    barmode="overlay",
    xTitle="Time of day",
    yTitle="(%) of articles",
    title="Time Started and Time Published Overlaid",
)

In [9]:
# Number of non-null "fans" entries per publication, drawn as bars
fans_per_publication = df.groupby("publication")["fans"].count()
fans_per_publication.iplot(
    kind="bar",
    title="Articles by Publication",
    yTitle="Number of Articles",
    linecolor="black",
)

In [10]:
# Column sums of every column whose name contains "<tag>"
tag_totals = df.filter(like="<tag>").sum()
tag_totals.iplot(
    kind="bar",
    title="Frequency of Tags",
    xTitle="Tag",
    yTitle="Number of Articles with Tag",
    linecolor="black",
    sortbars=True,
)

In [11]:
# Mean views and reads per "M" resampling bin, keyed by publication timestamp
df2 = (
    df.set_index("published_date")[["views", "reads"]]
    .resample("M")
    .mean()
)
df2.head()

Unnamed: 0_level_0,views,reads
published_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-06-30,463.666667,112.333333
2017-07-31,5521.333333,1207.166667
2017-08-31,6242.8,993.7
2017-09-30,2113.0,279.0
2017-10-31,,


In [12]:
# Bar chart of whatever df2 currently holds (the preceding cell binds it to
# monthly mean views and reads)
df2.iplot(
    kind="bar", xTitle="Date", yTitle="Average", title="Monthly Average Views and Reads"
)

In [13]:
# Rebinds df2: mean views and read_time per "M" resampling bin,
# indexed by published_date
df2 = (
    df[["views", "read_time", "published_date"]]
    .set_index("published_date")
    .resample("M")
    .mean()
)

# Bars for both columns; read_time is drawn against a secondary y-axis
df2.iplot(
    kind="bar",
    xTitle="Date",
    secondary_y="read_time",
    secondary_y_title="Average Read Time",
    yTitle="Average Views",
    title="Monthly Averages",
)

In [18]:
# Box plots of claps and fans; fans is assigned to the secondary y-axis
df[["claps", "fans"]].iplot(
    kind="box",
    secondary_y="fans",
    secondary_y_title="Fans",
    yTitle="Claps",
    title="Box Plot of Claps and Fans",
)

In [19]:
# Rebinds df2: one column per publication holding "fans"; the row index is
# left as-is, so a row is NaN in every column except its own publication
df2 = df.pivot(columns="publication", values="fans")
df2.head()

publication,Engineering @ Feature Labs,None,Noteworthy - The Journal Blog,The Reality Project,Towards Data Science
0,,,,34.0,
1,,,,29.0,
2,,,,13.0,
3,,34.0,,,
4,,47.0,,,


In [20]:
# One box per df2 column; figure height, y-axis title, title and bottom margin
# are supplied through an explicit layout dict
df2.iplot(
    kind="box",
    layout=dict(
        height=600,
        yaxis=dict(title="fans"),
        title="Fans by Publication",
        margin=dict(b=140),
    ),
)

In [20]:
# Keep articles with read_time <= 10, then spread "reads" into one column
# per distinct read_time value and draw one box per column
short_articles = df.loc[df["read_time"] <= 10]
reads_by_read_time = short_articles.pivot(columns="read_time", values="reads")
reads_by_read_time.iplot(
    kind="box",
    colorscale="set2",
    title="Box Plot of Reads by Reading Time",
    xTitle="Read Time",
    yTitle="Number of Reads",
)

In [21]:
# Rows published in "Towards Data Science", re-indexed by publication timestamp
is_tds = df["publication"] == "Towards Data Science"
tds = df.loc[is_tds].set_index("published_date")

tds.head()

Unnamed: 0_level_0,claps,days_since_publication,fans,link,num_responses,publication,read_ratio,read_time,reads,started_date,tags,text,title,title_word_count,type,views,word_count,claps_per_word,editing_days,<tag>Education,<tag>Data Science,<tag>Towards Data Science,<tag>Machine Learning,<tag>Python,time_started,time_published
published_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2017-12-27 11:20:00,4800,374.986885,861,https://towardsdatascience.com/random-forest-i...,27,Towards Data Science,17.68,21,28566,2017-12-26 15:11:00,"[Machine Learning, Data Science, Random Forest...",Random Forest in Python A Practical End-to-End...,Random Forest in Python,4,published,161596,4494,1.068091,0,0,1,0,1,1,15.183333,11.333333
2018-01-06 20:15:00,857,364.615092,112,https://towardsdatascience.com/improving-rando...,6,Towards Data Science,22.76,17,7207,2018-01-03 21:38:00,"[Machine Learning, Python, Data Science, Learn...",Improving the Random Forest in Python Part 1 G...,Improving the Random Forest in Python Part 1,8,published,31659,3504,0.244578,2,0,1,0,1,1,21.633333,20.25
2018-01-07 20:37:00,186,363.599979,45,https://towardsdatascience.com/data-science-a-...,1,Towards Data Science,28.64,15,775,2018-01-07 13:18:00,"[Data Science, Data, Weight Loss, Programming]",Data Science: A Personal Application Charting ...,Data Science: A Personal Application,5,published,2706,3569,0.052115,0,0,1,0,0,0,13.3,20.616667
2018-01-08 16:58:00,119,362.752029,43,https://towardsdatascience.com/a-theory-of-pre...,2,Towards Data Science,31.53,11,740,2018-01-02 17:23:00,"[Statistics, Data Science, Book Review, Books]",A Theory of Prediction Review of The Signal an...,A Theory of Prediction,4,published,2347,2817,0.042244,5,0,1,0,0,0,17.383333,16.966667
2018-01-09 21:49:00,2000,361.550093,392,https://towardsdatascience.com/hyperparameter-...,12,Towards Data Science,23.99,12,25505,2018-01-09 12:26:00,"[Machine Learning, Python, Data Science, Data]",Hyperparameter Tuning the Random Forest in Pyt...,Hyperparameter Tuning the Random Forest in Python,7,published,106311,2456,0.814332,0,0,1,0,1,1,12.433333,21.816667


In [27]:
# read_time of the tds rows plotted against the frame's index
# (published_date), as lines with markers
tds["read_time"].iplot(
    mode="lines+markers",
    opacity=0.8,
    size=8,
    symbol=1,
    xTitle="Date",
    yTitle="Read Time (min)",
    title="Read Time Trends",
)

In [28]:
# claps and fans of the tds rows on a shared y-axis, against the
# published_date index
tds[["claps", "fans"]].iplot(
    mode="lines+markers",
    opacity=0.8,
    size=8,
    symbol=1,
    xTitle="Date",
    yTitle="Fans and Claps",
    title="Fans and Claps over Time",
)

In [33]:
# fans on the primary y-axis and word_count on the secondary y-axis, against
# the published_date index; article titles are used as per-point text.
# The primary axis carries the "fans" series, so it is labelled "Fans".
tds[["fans", "word_count", "title", 'claps']].iplot(
    y="fans",
    mode="lines+markers",
    secondary_y="word_count",
    secondary_y_title="Word Count",
    opacity=0.8,
    size=8,
    symbol=1,
    xTitle="Date",
    yTitle="Fans",
    text="title",
    title="Fans and Word Count over Time",
)

In [39]:
# Same column selection without y / secondary_y, so no single series is
# singled out; titles are passed as per-point text.
# NOTE(review): yTitle says "Claps" and the title names only fans and word
# count although no y column is selected — confirm the intended labels.
tds[["fans", "word_count", 'claps', 'title']].iplot(
    mode="lines+markers",
    opacity=0.8,
    size=8,
    symbol=1,
    xTitle="Date",
    yTitle="Claps",
    text = 'title',
    title="Fans and Word Count over Time",
)

In [34]:
# Per-"M"-bin totals of the numeric columns of tds.
# The numeric columns are selected explicitly: tds also holds string and list
# columns (link, title, tags, ...) for which a monthly sum is not meaningful,
# and summing them depends on how the installed pandas treats non-numeric
# columns.
tds_monthly_totals = tds.select_dtypes(include="number").resample("M").sum()
print(tds_monthly_totals.head())
print(tds_monthly_totals.shape)

                claps  days_since_publication  fans  num_responses  \
published_date                                                       
2017-12-31       4800              374.986885   861             27   
2018-01-31      26739             4595.367769  5297            117   
2018-02-28      30346             2299.709918  5533            131   
2018-03-31      22500             1767.877657  4257            102   
2018-04-30      18664             1568.069032  3745             58   

                read_ratio  read_time   reads  title_word_count   views  \
published_date                                                            
2017-12-31           17.68         21   28566                 4  161596   
2018-01-31          462.34        130  161342                75  563646   
2018-02-28          283.11         52   79338                39  203484   
2018-03-31          176.90         62  113304                50  416241   
2018-04-30          217.26         52   68042              

In [35]:
# zip stops at the shorter iterable, so only 10 pairs are printed and
# range(11)'s last value (10) is never reached
for i, j in zip(range(10), range(11)):
    print(i, j)

0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9


In [36]:
# Two independent loops run one after the other: 0-9 is printed twice
for i in range(10):
    print(i)
for j in range(10):
    print(j)

0
1
2
3
4
5
6
7
8
9
0
1
2
3
4
5
6
7
8
9


In [37]:

# Nested loops: every (i, j) combination is printed, 100 lines in total
for i in range(10):
    for j in range(10):
        print(i, j)


0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
2 0
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 8
2 9
3 0
3 1
3 2
3 3
3 4
3 5
3 6
3 7
3 8
3 9
4 0
4 1
4 2
4 3
4 4
4 5
4 6
4 7
4 8
4 9
5 0
5 1
5 2
5 3
5 4
5 5
5 6
5 7
5 8
5 9
6 0
6 1
6 2
6 3
6 4
6 5
6 6
6 7
6 8
6 9
7 0
7 1
7 2
7 3
7 4
7 5
7 6
7 7
7 8
7 9
8 0
8 1
8 2
8 3
8 4
8 5
8 6
8 7
8 8
8 9
9 0
9 1
9 2
9 3
9 4
9 5
9 6
9 7
9 8
9 9


In [39]:

# Per-row label: the month name wrapped in a blue <span>, a line break, then
# the month's word_count formatted without decimals.
# The span is closed with </span> so the styling ends after the month name.
tds_monthly_totals["text"] = [
    f'<span style="color:blue">{m}</span><br>words: {w:.0f}'
    for m, w in zip(
        tds_monthly_totals.index.month_name(), tds_monthly_totals["word_count"]
    )
]

# word_count per month as lines + markers, with the label drawn at each point
tds_monthly_totals.iplot(
    mode="lines+markers+text",
    text="text",
    y="word_count",
    opacity=0.8,
    xTitle="Date",
    yTitle="Word Count",
    title="Total Word Count by Month",
)

In [40]:
# Marker-only scatter of read_ratio against read_time for the tds rows;
# the title column is passed as per-point text
tds.iplot(
    x="read_time",
    y="read_ratio",
    mode="markers",
    text="title",
    xTitle="Read Time",
    yTitle="Reading Percent",
    title="Reading Percent vs Reading Time",
)

In [30]:
# Same axes on rows sorted by read_time, drawn as markers+lines with
# bestfit enabled and "blue" passed as the best-fit colour
tds.sort_values("read_time").iplot(
    x="read_time",
    y="read_ratio",
    xTitle="Read Time",
    yTitle="Reading Percent",
    text="title",
    mode="markers+lines",
    bestfit=True,
    bestfit_colors=["blue"],
    title="Reading Percent vs Reading Time",
)

In [42]:
# Layout with a log-typed x-axis and a linear-typed y-axis
layout = dict(
    xaxis=dict(type="log", title="Word Count"),
    yaxis=dict(type="linear", title="views"),
    title="Views vs Word Count Log Axis",
)

# views against word_count for every row of df (sorted by word_count),
# markers only, with bestfit enabled
df.sort_values("word_count").iplot(
    x="word_count",
    y="views",
    layout=layout,
    text="title",
    mode="markers",
    bestfit=True,
    bestfit_colors=["blue"],
)

In [32]:
# Rebinds layout: log-typed x-axis; the y-axis type is left unspecified
layout = dict(
    xaxis=dict(type="log", title="Word Count"),
    yaxis=dict(title="views"),
    title="Views vs Word Count Log Axis",
)

# views against word_count for the tds rows only (sorted by word_count),
# markers only, with bestfit enabled
tds.sort_values("word_count").iplot(
    x="word_count",
    y="views",
    layout=layout,
    text="title",
    mode="markers",
    bestfit=True,
    bestfit_colors=["blue"],
)

In [33]:
# read_ratio against read_time for all rows, with "publication" passed as the
# categories column.
# NOTE(review): the recorded output shows a pandas.np deprecation warning; this
# cell does not reference pandas.np itself, so it presumably originates inside
# a library call — confirm.
df.iplot(
    x="read_time",
    y="read_ratio",
    categories="publication",
    xTitle="Read Time",
    yTitle="Reading Percent",
    title="Reading Percent vs Read Time by Publication",
)


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead.


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead.



In [34]:
# Running totals (cumsum, in row order) of views and word_count, indexed by
# published_date; word_count is drawn against the secondary y-axis
df.set_index("published_date")[["views", "word_count"]].cumsum().iplot(
    y="views",
    secondary_y="word_count",
    yTitle="Views",
    secondary_y_title="Word Count",
    title="Views and Word Count Totals",
)

In [35]:
# views arranged as published_date rows x publication columns (pivot_table
# with its default aggregation), then accumulated down each column with
# cumsum; one marker symbol per entry of the symbol list, log-typed y-axis
df.pivot_table(
    values="views", index="published_date", columns="publication"
).cumsum().iplot(
    mode="markers+lines",
    size=8,
    symbol=[1, 2, 3, 4, 5],
    layout=dict(
        xaxis=dict(title="Date"),
        yaxis=dict(type="log", title="Total Views"),
        title="Total Views over Time by Publication",
    ),
)

In [36]:
# reads (primary y-axis) and read_ratio (secondary y-axis) against word_count
# for the tds rows, markers only, with titles as per-point text
tds[["word_count", "reads", "read_ratio", "title"]].iplot(
    x="word_count",
    y="reads",
    secondary_y="read_ratio",
    xTitle="Word Count",
    yTitle="Reads",
    secondary_y_title="Read Ratio",
    mode="markers",
    size=10,
    text="title",
    title="Reads and Read Ratio vs Number of Words",
)

In [37]:
# views against word_count for all rows, markers only, with "publication"
# passed as the categories column; axis titles come from the layout dict
df.iplot(
    x="word_count",
    y="views",
    categories="publication",
    mode="markers",
    text="title",
    size=8,
    layout=dict(
        xaxis=dict(title="Word Count"),
        yaxis=dict(title="Views"),
        title="Views vs Word Count by Publication",
    ),
)


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead.


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead.



In [38]:
# One hover label per tds row: the title plus read_ratio with two decimals
text = [
    f"Title: {title} <br> Ratio: {ratio:.2f}%"
    for title, ratio in zip(tds["title"], tds["read_ratio"])
]

# reads against word_count (log-typed x-axis); marker size follows read_ratio
tds.iplot(
    x="word_count",
    y="reads",
    mode="markers",
    size=tds["read_ratio"],
    opacity=0.8,
    text=text,
    theme="pearl",
    layout=dict(
        title="Reads vs Log Word Count Sized by Read Ratio",
        xaxis=dict(type="log", title="Word Count"),
        yaxis=dict(title="Reads"),
    ),
)

In [39]:
# Single scatter trace over all rows of df, built with plotly graph objects
data = [
    go.Scatter(
        x=df["word_count"],
        y=df["reads"],
        text=df["title"],
        mode="markers",
        marker=dict(
            sizemin=10,
            # marker size follows read_ratio, marker colour follows read_time
            size=df["read_ratio"],
            colorscale="Rainbow",
            showscale=True,
            color=df["read_time"],
            line=dict(color="black", width=1.2),
        ),
    )
]

# Wrap the trace in a figure with axis titles and a plot title
figure = go.Figure(
    data=data,
    layout=go.Layout(
        xaxis=dict(title="Word Count"),
        yaxis=dict(title="Reads"),
        title="Reads vs Word Count Colored by Read Time and Sized by Read Ratio",
    ),
)

iplot(figure)

In [40]:
# One scatter trace per publication. Every per-point array (x, y, text, size,
# colour) is taken from the same `grouped` sub-frame so the values stay aligned
# with the points of that trace; the enumerate index selects the marker symbol.
data = [
    go.Scatter(
        x=grouped["word_count"],
        y=grouped["reads"],
        text=grouped["title"],
        mode="markers",
        name=name,
        marker=dict(
            symbol=i,
            sizemin=2,
            sizeref=2,
            size=grouped["read_ratio"],
            colorscale="BlueRed",
            showscale=True,
            # colour by this group's read_time, not the whole frame's column
            color=grouped["read_time"],
            line=dict(color="black", width=1.2),
        ),
    )
    for i, (name, grouped) in enumerate(df.groupby("publication"))
]

# Log-typed x-axis; the legend is shifted right (x=1.2)
figure = go.Figure(
    data=data,
    layout=go.Layout(
        legend=dict(x=1.2),
        width=1000,
        margin=dict(r=20),
        xaxis=dict(type="log", title="Word Count"),
        yaxis=dict(title="Reads"),
        title="Reads vs Word Count Colored by Read Time, Sized by Read Ratio, and Shaped by Publication",
    ),
)

iplot(figure)

In [41]:
# read_time (primary y-axis) and fans (secondary y-axis) against the
# published_date index, with two dashed red vertical lines at the given dates.
# The title names the series actually plotted (fans, not claps).
tds[["read_time", "fans", "title"]].iplot(
    y="read_time",
    mode="lines+markers",
    secondary_y="fans",
    secondary_y_title="Fans",
    vline=[
        dict(x=pd.to_datetime("2018-04-01"), color="red", dash="dash", width=3),
        dict(x=pd.to_datetime("2018-06-01"), color="red", dash="dash", width=3),
    ],
    opacity=0.8,
    size=8,
    symbol=1,
    xTitle="Date",
    yTitle="Read Time",
    text="title",
    title="Reading Time and Fans over Time",
)

In [42]:
# read_time (primary y-axis) and fans (secondary y-axis) against the
# published_date index, with a green vertical span, dashed red horizontal
# lines at y = 5, 10, 15, 20 and an explicit x range.
# The title names the series actually plotted (fans, not claps).
tds[["read_time", "fans", "title"]].iplot(
    y="read_time",
    mode="lines+markers",
    secondary_y="fans",
    secondary_y_title="Fans",
    vspan=[
        dict(x0="2018-05-01", x1="2018-08-01", color="green", fill="green", opacity=0.2)
    ],
    hline=[dict(y=i, color="red", dash="dash", width=1) for i in [5, 10, 15, 20]],
    xrange=("2018-01-01", "2019-01-01"),
    opacity=0.8,
    size=8,
    symbol=1,
    xTitle="Date",
    yTitle="Read Time",
    text="title",
    title="Reading Time and Fans over Time",
)

In [43]:
# read_time of the tds rows against the published_date index, with dashed
# blue vertical lines at 2018-01-01 and 2019-01-01
tds["read_time"].iplot(
    mode="lines+markers",
    vline=[
        dict(x=pd.to_datetime("2018-01-01"), color="blue", dash="dash"),
        dict(x=pd.to_datetime("2019-01-01"), color="blue", dash="dash"),
    ],
    opacity=0.8,
    size=8,
    symbol=1,
    xTitle="Date",
    yTitle="Read Time (min)",
    title="Read Time Trends",
)

In [44]:
# Colorscale names kept for reference
colorscales = [
    "Greys", "YlGnBu", "Greens", "YlOrRd", "Bluered", "RdBu",
    "Reds", "Blues", "Picnic", "Rainbow", "Portland", "Jet",
    "Hot", "Blackbody", "Earth", "Electric", "Viridis", "Cividis",
]

In [45]:
import plotly.figure_factory as ff

# Scatterplot matrix of claps, views, read_ratio and word_count with
# histograms on the diagonal; "publication" is passed as the index column
figure = ff.create_scatterplotmatrix(
    df[["claps", "publication", "views", "read_ratio", "word_count"]],
    height=1000,
    width=1000,
    text=df["title"],
    diag="histogram",
    index="publication",
)
iplot(figure)

In [46]:
# Colorscale names kept for reference (rebinds `colorscales` to the same
# 18 names)
colorscales = [
    "Greys", "YlGnBu", "Greens", "YlOrRd", "Bluered", "RdBu",
    "Reds", "Blues", "Picnic", "Rainbow", "Portland", "Jet",
    "Hot", "Blackbody", "Earth", "Electric", "Viridis", "Cividis",
]

In [47]:
# Pairwise correlations between the numeric/boolean columns only.
# df also holds string and list columns (link, title, tags, ...); selecting the
# numeric columns explicitly keeps the call working on pandas versions where
# DataFrame.corr no longer drops non-numeric columns on its own.
corrs = df.select_dtypes(include=["number", "bool"]).corr()

# Annotated heatmap: each cell shows the correlation rounded to two decimals
figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    colorscale="Earth",
    annotation_text=corrs.round(2).values,
    showscale=True,
    reversescale=True,
)

# Enlarged left/top margins and a fixed figure size
figure.layout.margin = dict(l=200, t=200)
figure.layout.height = 800
figure.layout.width = 1000

iplot(figure)

In [48]:
def categorizer(text):
    """Bucket ``text`` by its length.

    Args:
        text: Any object supporting ``len()``.

    Returns:
        "Very Short" for a length below 15, "Short" below 20, "Medium" below
        40, "Long" below 60, otherwise "Very Long".
    """
    length = len(text)
    # (exclusive upper bound, label), checked in ascending order
    buckets = (
        (15, "Very Short"),
        (20, "Short"),
        (40, "Medium"),
        (60, "Long"),
    )
    for upper_bound, label in buckets:
        if length < upper_bound:
            return label
    return "Very Long"
def lengthenizer(text):
    """Return ``len(text)``."""
    return len(text)

# First view: "Length" holds the title-length bucket returned by categorizer
df["Length"] = df["title"].apply(categorizer)

# read_ratio against the title-length bucket, with "publication" as the
# categories column; marker size is word_count / 500.
# Axis titles and plot title describe the columns actually plotted.
df.iplot(
    x="Length",
    y="read_ratio",
    categories="publication",
    mode="markers",
    text="title",
    size=df["word_count"]/500,
    layout=dict(
        xaxis=dict(title="Title Length Category"),
        yaxis=dict(title="Read Ratio"),
        title="Read Ratio vs Title Length Category by Publication",
    ),
)

# Second view: "Length" is overwritten with the raw title length (len(title))
df["Length"] = df["title"].apply(lengthenizer)



# Same plot against the numeric title length
df.iplot(
    x="Length",
    y="read_ratio",
    categories="publication",
    mode="markers",
    text="title",
    size=df["word_count"]/500,
    layout=dict(
        xaxis=dict(title="Title Length (characters)"),
        yaxis=dict(title="Read Ratio"),
        title="Read Ratio vs Title Length by Publication",
    ),
)





The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead.


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead.



In [49]:
def _plot_read_ratio_trend(frame, title):
    """Plot ``frame["read_ratio"]`` against the frame's index as lines+markers.

    Args:
        frame: DataFrame with a ``read_ratio`` column.
        title: Plot title.
    """
    frame["read_ratio"].iplot(
        mode="lines+markers",
        opacity=0.8,
        size=8,
        symbol=1,
        xTitle="Date",
        yTitle="Read Ratio",
        title=title,
    )


# Rows flagged with the Python tag, indexed by published_date
df2 = df[df["<tag>Python"] == 1].set_index("published_date")
_plot_read_ratio_trend(df2, "Read Ratio trends Python")

# Rows flagged with the Education tag, indexed by published_date
df3 = df[df["<tag>Education"] == 1].set_index("published_date")

_plot_read_ratio_trend(df3, "Read Ratio trends Education")
print(df3.head())

                     claps  days_since_publication  fans  \
published_date                                             
2017-12-16 10:20:00      8              386.028602     2   
2018-01-08 09:45:00     11              363.052874     3   
2018-01-19 20:16:00   1200              351.614554   208   
2018-01-22 14:22:00    275              348.860352    47   
2018-01-24 18:27:00   1600              346.690477   288   

                                                                  link  \
published_date                                                           
2017-12-16 10:20:00  https://medium.com/p/the-case-for-criticism-96...   
2018-01-08 09:45:00  https://medium.com/p/the-simple-science-of-glo...   
2018-01-19 20:16:00  https://towardsdatascience.com/correlation-vs-...   
2018-01-22 14:22:00  https://medium.com/p/real-life-superpowers-c69...   
2018-01-24 18:27:00  https://towardsdatascience.com/learn-by-sharin...   

                     num_responses           publication  re