In [1]:
# Uncomment the next line if plotly is not installed in this kernel's environment
# %pip install plotly

In [2]:
import pandas as pd
import plotly.express as px # import plotly express as px

In [3]:
# Download the video game sales CSV and load it into a DataFrame
df = pd.read_csv(
    filepath_or_buffer="https://intro-to-python-asdaf.s3.ap-southeast-2.amazonaws.com/vgsales.csv",
)

In [4]:
# Preview the first five rows to check which columns were loaded
df.head(n=5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


# Group By

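Group-by aggregates data **by** the values of one or more columns: rows that share the same value are collected into a group, and each group is then summarized with an aggregation function.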

### How?

1. Group your data by a specific column (or a list of columns) using the `.groupby()` method.
2. Aggregate data using one of the Pandas aggregation functions below: 
    - `count()` – Number of non-null observations
    - `sum()` – Sum of values
    - `mean()` – Mean of values
    - `median()` – Arithmetic median of values
    - `min()` – Minimum
    - `max()` – Maximum
    - `agg(pd.Series.mode)` – Mode (grouped objects have no `mode()` method, so pass it through `.agg()`)
    - `std()` – Standard deviation
    - `var()` – Variance
3. Optionally, use the `.reset_index()` method to move the grouping column(s) out of the index and back into regular columns.

In [5]:
# Step 1: group the rows by genre.
# On its own this only returns a DataFrameGroupBy object; nothing is aggregated yet.
df.groupby(by=["Genre"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F919292390>

In [6]:
# Step 2: group by genre, then aggregate each group with sum().
# numeric_only=True restricts the sum to the numeric columns. Without it,
# pandas >= 2.0 also tries to "sum" the text columns (Name, Platform, Publisher).
df.groupby(["Genre"]).sum(numeric_only=True)

Unnamed: 0_level_0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Action,26441383,6531731.0,877.83,525.0,159.95,187.38,1751.18
Adventure,14831165,2562375.0,105.8,64.13,52.07,16.81,239.04
Fighting,6484242,1675871.0,223.59,101.32,87.35,36.68,448.91
Misc,14889052,3432412.0,410.24,215.98,107.76,75.32,809.96
Platform,6137545,1755347.0,447.05,201.63,130.77,51.59,831.37
Puzzle,5603136,1144994.0,123.78,50.78,57.31,12.55,244.95
Racing,9943933,2457934.0,359.42,238.39,56.69,77.27,732.04
Role-Playing,12032228,2952379.0,327.28,188.06,352.31,59.61,927.37
Shooter,9653872,2571588.0,582.6,313.27,38.28,102.69,1037.37
Simulation,7478816,1707589.0,183.31,113.38,63.7,31.52,392.2


In [7]:
# Step 3: group by genre, sum each group, then call reset_index() so that
# "Genre" becomes a regular column again instead of the index.
# numeric_only=True restricts the sum to the numeric columns. Without it,
# pandas >= 2.0 also tries to "sum" the text columns (Name, Platform, Publisher).
df.groupby(["Genre"]).sum(numeric_only=True).reset_index()

Unnamed: 0,Genre,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,Action,26441383,6531731.0,877.83,525.0,159.95,187.38,1751.18
1,Adventure,14831165,2562375.0,105.8,64.13,52.07,16.81,239.04
2,Fighting,6484242,1675871.0,223.59,101.32,87.35,36.68,448.91
3,Misc,14889052,3432412.0,410.24,215.98,107.76,75.32,809.96
4,Platform,6137545,1755347.0,447.05,201.63,130.77,51.59,831.37
5,Puzzle,5603136,1144994.0,123.78,50.78,57.31,12.55,244.95
6,Racing,9943933,2457934.0,359.42,238.39,56.69,77.27,732.04
7,Role-Playing,12032228,2952379.0,327.28,188.06,352.31,59.61,927.37
8,Shooter,9653872,2571588.0,582.6,313.27,38.28,102.69,1037.37
9,Simulation,7478816,1707589.0,183.31,113.38,63.7,31.52,392.2


In [8]:
# Save the full example (group by genre -> sum -> reset_index) to a new dataframe.
# numeric_only=True keeps the aggregation to the numeric columns; pandas >= 2.0
# would otherwise also aggregate the text columns (Name, Platform, Publisher).
df_grouped_genre = df.groupby(["Genre"]).sum(numeric_only=True).reset_index()
df_grouped_genre

Unnamed: 0,Genre,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,Action,26441383,6531731.0,877.83,525.0,159.95,187.38,1751.18
1,Adventure,14831165,2562375.0,105.8,64.13,52.07,16.81,239.04
2,Fighting,6484242,1675871.0,223.59,101.32,87.35,36.68,448.91
3,Misc,14889052,3432412.0,410.24,215.98,107.76,75.32,809.96
4,Platform,6137545,1755347.0,447.05,201.63,130.77,51.59,831.37
5,Puzzle,5603136,1144994.0,123.78,50.78,57.31,12.55,244.95
6,Racing,9943933,2457934.0,359.42,238.39,56.69,77.27,732.04
7,Role-Playing,12032228,2952379.0,327.28,188.06,352.31,59.61,927.37
8,Shooter,9653872,2571588.0,582.6,313.27,38.28,102.69,1037.37
9,Simulation,7478816,1707589.0,183.31,113.38,63.7,31.52,392.2


In [9]:
# Bar chart of summed global sales per genre, sorted so the largest genre comes first.
# The title lets the figure stand on its own when the notebook is skimmed.
px.bar(
    df_grouped_genre.sort_values(["Global_Sales"], ascending=False),
    x="Genre",
    y="Global_Sales",
    title="Total global sales by genre",
)

In [10]:
# Group by more than one column: every (Genre, Platform) pair becomes its own group.
# numeric_only=True keeps the aggregation to the numeric columns; pandas >= 2.0
# would otherwise also aggregate the remaining text columns (Name, Publisher).
df_grouped_genre_platform = (
    df.groupby(["Genre", "Platform"]).sum(numeric_only=True).reset_index()
)
df_grouped_genre_platform.head()

Unnamed: 0,Genre,Platform,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,Action,2600,303231,109013.0,27.39,1.64,0.0,0.22,29.34
1,Action,3DS,1765709,362389.0,19.49,12.53,22.05,2.84,57.02
2,Action,DC,27035,5999.0,0.41,0.23,0.59,0.03,1.26
3,Action,DS,3218041,674809.0,66.46,27.04,12.74,9.14,115.56
4,Action,GB,32585,11984.0,4.07,2.11,1.49,0.26,7.92


In [11]:
# Bar chart of summed global sales per platform, with each bar colored by genre.
# The title lets the figure stand on its own when the notebook is skimmed.
px.bar(
    df_grouped_genre_platform.sort_values(["Global_Sales"], ascending=False),
    x="Platform",
    y="Global_Sales",
    color="Genre",
    title="Total global sales by platform, split by genre",
)

In [12]:
# Group the data by release year and sum the numeric columns for each year.
# numeric_only=True keeps the aggregation to the numeric columns; pandas >= 2.0
# would otherwise also aggregate the text columns (Name, Platform, Genre, Publisher).
df_grouped_year = df.groupby(["Year"]).sum(numeric_only=True).reset_index()
df_grouped_year.head()

Unnamed: 0,Year,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1980.0,29826,10.59,0.67,0.0,0.12,11.38
1,1981.0,190488,33.4,1.96,0.0,0.32,35.77
2,1982.0,149186,26.92,1.65,0.0,0.31,28.86
3,1983.0,56759,7.76,0.8,8.1,0.14,16.79
4,1984.0,22911,33.28,2.1,14.27,0.7,50.36


In [13]:
# Line plot of summed global sales per year.
# The title lets the figure stand on its own when the notebook is skimmed.
px.line(
    df_grouped_year,
    x="Year",
    y="Global_Sales",
    title="Total global sales by year",
)