In [1]:
# Import Dependencies
import pandas as pd

In [2]:
# Location of the TED Talks dataset (a path relative to the working directory)
csv_path = "ted_talks.csv"

# Parse the CSV into a DataFrame, then preview its first five rows
ted_df = pd.read_csv(filepath_or_buffer=csv_path)
ted_df.head(n=5)

Unnamed: 0,comments,description,duration,event,languages,main_speaker,name,title,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,Do schools kill creativity?,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,43,Al Gore,Al Gore: Averting the climate crisis,Averting the climate crisis,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,26,David Pogue,David Pogue: Simplicity sells,Simplicity sells,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,35,Majora Carter,Majora Carter: Greening the ghetto,Greening the ghetto,1697550
4,593,You've never seen data presented like this. Wi...,1190,TED2006,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,The best stats you've ever seen,12005869


In [13]:
# Find the largest and smallest view counts across all TED Talks
ted_max, ted_min = ted_df["views"].max(), ted_df["views"].min()

# Report both extremes
print(f"Ted Talk Most Views : {ted_max}")
print(f"Ted Talk Least Views : {ted_min}")

Ted Talk Most Views : 47227110
Ted Talk Least Views : 50443


In [19]:
# Bin edges for TED Talk view counts.
# Ten edges define nine consecutive intervals.
bins = [
    0,
    10_000,
    100_000,
    250_000,
    500_000,
    1_000_000,
    5_000_000,
    10_000_000,
    100_000_000,
    500_000_000,
]

# Human-readable label for each interval, in the same order as the edges above
group_labels = [
    "0 to 10K",
    "10K to 100K",
    "100K to 250K",
    "250K to 500K",
    "500K to 1M",
    "1M to 5M",
    "5M to 10M",
    "10M to 100M",
    "100M to 500M",
]

In [24]:
# Assign each talk's view count to its labelled bin.
# Only the first five results are kept here, as a quick sample.
ted_bin = pd.cut(
    x=ted_df["views"],
    bins=bins,
    labels=group_labels,
).head()

# Confirm what kind of object pd.cut handed back
print(type(ted_bin))

<class 'pandas.core.series.Series'>


In [25]:
# Bin every talk by view count and store the resulting labels
# in a new "View Group" column of the DataFrame
ted_df["View Group"] = pd.cut(
    x=ted_df["views"],
    bins=bins,
    labels=group_labels,
)

# Preview the DataFrame with the new column
ted_df.head(n=5)

Unnamed: 0,comments,description,duration,event,languages,main_speaker,name,title,views,View Group
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,Do schools kill creativity?,47227110,10M to 100M
1,265,With the same humor and humanity he exuded in ...,977,TED2006,43,Al Gore,Al Gore: Averting the climate crisis,Averting the climate crisis,3200520,1M to 5M
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,26,David Pogue,David Pogue: Simplicity sells,Simplicity sells,1636292,1M to 5M
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,35,Majora Carter,Majora Carter: Greening the ghetto,Greening the ghetto,1697550,1M to 5M
4,593,You've never seen data presented like this. Wi...,1190,TED2006,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,The best stats you've ever seen,12005869,10M to 100M


In [26]:
# Create a GroupBy object based upon "View Group".
# "View Group" is a categorical column (it is built with pd.cut), so pass
# observed=False explicitly: every bin stays in the results, including empty
# ones, and the outcome does not depend on the pandas version's default
# for `observed`.
ted_group = ted_df.groupby("View Group", observed=False)

# Find how many rows fall into each bin.
# size() counts every row in the group. count() on a single column skips rows
# whose value in that column is missing, so it could under-report depending on
# which column is chosen.
print(ted_group.size())

# Get the average of the numeric columns of interest within each view group
ted_group[["comments", "duration", "languages"]].mean()

View Group
0 to 10K           0
10K to 100K        3
100K to 250K      48
250K to 500K     228
500K to 1M       768
1M to 5M        1404
5M to 10M         65
10M to 100M       34
100M to 500M       0
Name: comments, dtype: int64


Unnamed: 0_level_0,comments,duration,languages
View Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0 to 10K,,,
10K to 100K,35.333333,1591.666667,0.0
100K to 250K,70.833333,835.25,7.9375
250K to 500K,94.442982,822.864035,21.289474
500K to 1M,118.005208,830.21875,24.733073
1M to 5M,219.678063,819.066239,29.5349
5M to 10M,469.153846,842.969231,38.569231
10M to 100M,996.882353,963.264706,43.470588
100M to 500M,,,
