In [1]:
# Import Library
import pandas as pd 
import numpy as np 

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the video-game sales CSV (path is relative to the working directory)
# and print a per-column summary of non-null counts and dtypes.
DATA_PATH = "dataset/vgsales.csv"
dataset = pd.read_csv(DATA_PATH)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16540 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


In [3]:
# Show the loaded frame as this cell's rich output.
dataset

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


### Exploratory Data Analysis

In [4]:
# Sum each regional sales column over all rows, then reshape the resulting
# Series into a two-column frame: the region column name and its total.
df = (
    dataset[["NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]]
    .sum()
    .rename_axis("Region")
    .reset_index(name="Sales")
)
df

Unnamed: 0,Region,Sales
0,NA_Sales,4392.95
1,EU_Sales,2434.13
2,JP_Sales,1291.02
3,Other_Sales,797.75


In [5]:
# Horizontal bar chart: one bar per region, bar length = summed sales.
# Axis titles are blanked; the figure title carries the description.
fig = (
    px.bar(df, x="Sales", y="Region", text_auto='.4s')
    .update_traces(marker_color=px.colors.sequential.Bluyl_r)
    .update_layout(title="Sum of games sales by regions", xaxis_title="", yaxis_title="")
)
fig.show()

In [6]:
# Donut chart (hole=0.5) of each region's share of the summed sales,
# with slices labelled by percentage only.
fig = (
    px.pie(
        df,
        names="Region",
        values="Sales",
        hole=0.5,
        color_discrete_sequence=px.colors.sequential.Bluyl_r,
    )
    .update_traces(textinfo="percent")
    .update_layout(title="Percentage of games sales by region")
)

# Render the figure
fig.show()

In [7]:
# Total the sales columns per platform, order ascending by global sales,
# and keep the last five rows, i.e. the five largest totals.
df = (
    dataset
    .groupby("Platform")[["NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales", "Global_Sales"]]
    .sum()
    .sort_values("Global_Sales")
    .reset_index()
    .tail(5)
)
df

Unnamed: 0,Platform,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
26,DS,390.71,194.65,175.57,60.53,822.49
27,Wii,507.71,268.38,69.35,80.61,926.71
28,PS3,392.26,343.71,79.99,141.93,957.84
29,X360,601.05,280.58,12.43,85.54,979.96
30,PS2,583.84,339.29,139.2,193.44,1255.64


In [8]:
# Horizontal bar chart of the five platforms held in `df`, one bar per
# platform, bar length = Global_Sales.
fig = px.bar(df, y="Platform", x="Global_Sales", text_auto='.4s')
fig.update_traces(marker_color=px.colors.sequential.Bluyl_r)
# Only Global_Sales is plotted (no per-region breakdown), so the title names
# the metric actually shown instead of saying "by regions".
fig.update_layout(title="Top 5 platforms by global sales", xaxis_title="", yaxis_title="")
fig.show()

In [9]:
# Total the sales columns per genre, order ascending by global sales,
# and keep the last five rows, i.e. the five largest totals.
df = (
    dataset
    .groupby("Genre")[["NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales", "Global_Sales"]]
    .sum()
    .sort_values("Global_Sales")
    .reset_index()
    .tail(5)
)
df

Unnamed: 0,Genre,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
7,Platform,447.05,201.63,130.77,51.59,831.37
8,Role-Playing,327.28,188.06,352.31,59.61,927.37
9,Shooter,582.6,313.27,38.28,102.69,1037.37
10,Sports,683.35,376.85,135.37,134.97,1330.93
11,Action,877.83,525.0,159.95,187.38,1751.18


In [10]:
# Horizontal bar chart of the five genres held in `df`, one bar per genre,
# bar length = Global_Sales.
fig = px.bar(df, y="Genre", x="Global_Sales", text_auto='.4s')
fig.update_traces(marker_color=px.colors.sequential.Bluyl_r)
# Only Global_Sales is plotted (no per-region breakdown), so the title names
# the metric actually shown instead of saying "by regions".
fig.update_layout(title="Top 5 genres by global sales", xaxis_title="", yaxis_title="")
fig.show()

In [11]:
# Total the sales columns per publisher, order ascending by global sales,
# and keep the last five rows, i.e. the five largest totals.
df = (
    dataset
    .groupby("Publisher")[["NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales", "Global_Sales"]]
    .sum()
    .sort_values("Global_Sales")
    .reset_index()
    .tail(5)
)
df

Unnamed: 0,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
573,Ubisoft,253.43,163.32,7.5,50.26,474.72
574,Sony Computer Entertainment,265.22,187.72,74.1,80.45,607.5
575,Activision,429.7,215.53,6.54,75.34,727.46
576,Electronic Arts,595.07,371.27,14.04,129.77,1110.32
577,Nintendo,816.87,418.74,455.42,95.33,1786.56


In [12]:
# Horizontal bar chart of the five publishers held in `df`, one bar per
# publisher, bar length = Global_Sales.
fig = px.bar(df, y="Publisher", x="Global_Sales", text_auto='.4s')
fig.update_traces(marker_color=px.colors.sequential.Bluyl_r)
# Only Global_Sales is plotted (no per-region breakdown), so the title names
# the metric actually shown instead of saying "by regions".
fig.update_layout(title="Top 5 publishers by global sales", xaxis_title="", yaxis_title="")
fig.show()

In [13]:
# Total the sales columns per game name (rows sharing a Name are summed
# together), order ascending by global sales, and keep the last five rows,
# i.e. the five largest totals.
df = (
    dataset
    .groupby("Name")[["NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales", "Global_Sales"]]
    .sum()
    .sort_values("Global_Sales")
    .reset_index()
    .tail(5)
)
df

Unnamed: 0,Name,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
11488,Mario Kart Wii,15.85,12.88,3.79,3.31,35.82
11489,Tetris,26.17,2.95,6.03,0.69,35.84
11490,Super Mario Bros.,32.48,4.88,6.96,0.99,45.31
11491,Grand Theft Auto V,23.46,23.04,1.39,8.03,55.92
11492,Wii Sports,41.49,29.02,3.77,8.46,82.74


In [14]:
# Horizontal bar chart of the five game titles held in `df`, one bar per
# title, bar length = Global_Sales.
fig = px.bar(df, y="Name", x="Global_Sales", text_auto='.4s')
fig.update_traces(marker_color=px.colors.sequential.Bluyl_r)
# Only Global_Sales is plotted (no per-region breakdown), so the title names
# the metric actually shown instead of saying "by regions".
fig.update_layout(title="Top 5 games by global sales", xaxis_title="", yaxis_title="")
fig.show()

In [15]:
# Total the sales columns per release year, with Year restored as a regular
# column, then print the resulting frame's column summary.
df = (
    dataset
    .groupby("Year")[["NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales", "Global_Sales"]]
    .sum()
    .reset_index()
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Year          39 non-null     float64
 1   NA_Sales      39 non-null     float64
 2   EU_Sales      39 non-null     float64
 3   JP_Sales      39 non-null     float64
 4   Other_Sales   39 non-null     float64
 5   Global_Sales  39 non-null     float64
dtypes: float64(6)
memory usage: 2.0 KB


In [16]:
# Show the per-year sales totals currently held in `df`.
df

Unnamed: 0,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1980.0,10.59,0.67,0.0,0.12,11.38
1,1981.0,33.4,1.96,0.0,0.32,35.77
2,1982.0,26.92,1.65,0.0,0.31,28.86
3,1983.0,7.76,0.8,8.1,0.14,16.79
4,1984.0,33.28,2.1,14.27,0.7,50.36
5,1985.0,33.73,4.74,14.56,0.92,53.94
6,1986.0,12.5,2.84,19.81,1.93,37.07
7,1987.0,8.46,1.41,11.63,0.2,21.74
8,1988.0,23.87,6.59,15.76,0.99,47.22
9,1989.0,45.15,8.44,18.36,1.5,73.45


In [17]:
# Line chart of the yearly Global_Sales totals from the per-year frame `df`.
fig = px.line(df, x="Year", y="Global_Sales")
# Give the figure a title and readable axis labels so it stands on its own,
# in line with the titled charts earlier in the notebook.
fig.update_layout(title="Global games sales by year", xaxis_title="Year", yaxis_title="Global sales")
fig.show()