# EDA on Video Games Sales Dataset from Kaggle

jie.hu.ds@gmail.com

--------

* <a href='#Package'>1. Package</a>
* <a href='#Dataset'>2. Dataset</a>
* <a href='#Statistical Summary'>3. Statistical Summary</a>
* <a href='#Viz - Bivariate'>4. Viz - Bivariate</a>
* <a href='#Viz - Multivariate'>5. Viz - Multivariate</a>
* <a href='#Conclusion'>6. Conclusion</a>

------

<a id='Package'>Package</a>

In [56]:
# Packages
import pandas as pd
import numpy as np
import scipy as sp
import plotly
import plotly.plotly as py
from plotly.graph_objs import *
plotly.offline.init_notebook_mode(connected=True)

In [57]:
# Plotly token
# The API key is read from the environment so that it is never committed with
# the notebook. (A key that was ever checked in should be revoked and reissued.)
# Every figure in this notebook is rendered with plotly.offline, so missing
# credentials are simply skipped rather than treated as an error.
import os

plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    plotly.tools.set_credentials_file(username=plotly_username, api_key=plotly_api_key)

<a id='Dataset'>Dataset</a>

In [58]:
# Load the sales table and preview its first ten rows.
df = pd.read_csv("vgsales.csv")
df.head(10)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


In [59]:
# Print each column's dtype and non-null count (Year and Publisher have gaps).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
Rank            16598 non-null int64
Name            16598 non-null object
Platform        16598 non-null object
Year            16327 non-null float64
Genre           16598 non-null object
Publisher       16540 non-null object
NA_Sales        16598 non-null float64
EU_Sales        16598 non-null float64
JP_Sales        16598 non-null float64
Other_Sales     16598 non-null float64
Global_Sales    16598 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


In [60]:
# (number of rows, number of columns) of the loaded table.
df.shape

(16598, 11)

<a id='Statistical Summary'>Statistical Summary</a>

In [61]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16598.0,16327.0,16598.0,16598.0,16598.0,16598.0,16598.0
mean,8300.605254,2006.406443,0.264667,0.146652,0.077782,0.048063,0.537441
std,4791.853933,5.828981,0.816683,0.505351,0.309291,0.188588,1.555028
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4151.25,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8300.5,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12449.75,2010.0,0.24,0.11,0.04,0.04,0.47
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


<a id='Viz - Bivariate'>4. Viz - Bivariate</a>

**Release vs. Platform**

In [62]:
# Platform
# Store the platform labels as a categorical column, then summarise it.
df['Platform'] = df['Platform'].astype('category')
df['Platform'].describe()

count     16598
unique       31
top          DS
freq       2163
Name: Platform, dtype: object

In [63]:
# Number of titles per platform, sorted from fewest to most releases.
# Only the 'Name' column is counted, and the deprecated `axis` keyword of
# groupby (removed in recent pandas) is no longer passed.
platform_count = (
    df.groupby('Platform')['Name']
      .count()
      .reset_index()
      .sort_values(by="Name", ascending=True)
)

In [64]:
# Game counts by platform

import plotly.graph_objs as go

# Horizontal bars: one bar per platform, bar length = number of titles.
trace = go.Bar(
    orientation='h',
    x=platform_count.Name,
    y=platform_count.Platform,
)

layout = go.Layout(
    title='Total Release by Platforms',
    xaxis={'title': 'Count'},
    yaxis={'title': 'Platform'},
    width=700,
    height=700,
)

fig = go.Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig)

**Release by Year**

In [12]:
# Number of titles released in each year. Only 'Name' is counted, and the
# deprecated `axis` keyword of groupby (removed in recent pandas) is dropped.
year_count = df.groupby('Year')['Name'].count().reset_index()
year_count['Year'] = year_count['Year'].astype('int')

# remove data after 2016
year_count = year_count[year_count['Year'] <= 2016]

In [14]:
layout = go.Layout(
    title='Release by Year',
    xaxis={'title': 'Year'},
    yaxis={'title': 'Count'},
    width=700,
    height=700,
)

# Single line series: number of releases for each year.
trace = go.Scatter(
    x=year_count.Year,
    y=year_count.Name,
    name='lines',
    mode='lines',
)

fig = go.Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig)

Because sales of newly released games are still booming, the declining curve does not mean that the market is shrinking.

**Release by Genre**

In [70]:
# Number of titles per genre, sorted from fewest to most releases.
# Only 'Name' is counted, and the deprecated `axis` keyword of groupby
# (removed in recent pandas) is no longer passed.
genre_count = (
    df.groupby('Genre')['Name']
      .count()
      .reset_index()
      .sort_values(by="Name", ascending=True)
)

layout = go.Layout(
    title='Releases by Genre',
    xaxis=dict(title='Releases'),
    height=300, width=600,
)

# Horizontal bars: one bar per genre, bar length = number of titles.
trace = go.Bar(
    x=genre_count.Name,
    y=genre_count.Genre,
    orientation='h',
)

fig = go.Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig)

Action, sports and music games are the top 3 genres by number of releases.

**Release by Publisher**

In [77]:
# Number of titles per publisher; the 30 publishers with the most releases are
# kept. Only 'Name' is counted, and the deprecated `axis` keyword of groupby
# (removed in recent pandas) is no longer passed.
publisher_count = (
    df.groupby('Publisher')['Name']
      .count()
      .reset_index()
      .sort_values(by="Name", ascending=True)
      .tail(n=30)
)

layout = go.Layout(
    title='Release by Publisher (Top 30)',
    xaxis=dict(title='Releases'),
    height=700, width=750,
    # Wide left margin leaves room for the publisher names on the y axis.
    margin=go.Margin(l=300, r=50, b=100, t=100, pad=4),
)

trace = go.Bar(
    x=publisher_count.Name,
    y=publisher_count.Publisher,
    orientation='h',
)

fig = go.Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig)

**Sales by Publisher**

In [80]:
# Total global sales per publisher; the 30 best-selling publishers are kept.
# 'Global_Sales' is selected *before* summing: summing the whole frame also
# tries to add up the text and categorical columns, which fails on recent
# pandas versions.
publisher_sales = (
    df.groupby('Publisher')['Global_Sales']
      .sum()
      .reset_index()
      .sort_values(by="Global_Sales", ascending=True)
      .tail(n=30)
)

layout = go.Layout(
    title='Sales by Publisher (Top 30)',
    xaxis=dict(title='Sales (in Millions)'),
    height=700, width=700,
    # Wide left margin leaves room for the publisher names on the y axis.
    margin=go.Margin(l=300, r=50, b=100, t=100, pad=4),
)

trace = go.Bar(
    x=publisher_sales.Global_Sales,
    y=publisher_sales.Publisher,
    orientation='h',
)

fig = go.Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig)

**Revenue per game by Publisher**

In [86]:
# Average global sales per title for each publisher (top 30).
# The per-publisher total and row count are computed in one aggregation
# instead of adding a helper 'Game_Count' column: `new_df = df` was only an
# alias, so that column used to be written into the shared `df` as well.
new_df = (
    df.groupby('Publisher')['Global_Sales']
      .agg(['sum', 'size'])
      .rename(columns={'sum': 'Global_Sales', 'size': 'Game_Count'})
      .reset_index()
)
new_df['Revenue_per_game'] = new_df.Global_Sales / new_df.Game_Count

new_df = new_df.sort_values(by="Revenue_per_game", ascending=True).tail(n=30)

layout = go.Layout(
    title='Revenue_per_game by Publisher (Top 30)',
    xaxis=dict(title='Revenue_per_game (in Millions)'),
    height=700, width=700,
    # Wide left margin leaves room for the publisher names on the y axis.
    margin=go.Margin(l=250, r=50, b=100, t=100, pad=4),
)

trace = go.Bar(
    x=new_df.Revenue_per_game,
    y=new_df.Publisher,
    orientation='h',
)

fig = go.Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig)

Average revenue per game shows the earning power of the games released by each publisher.

**Sales by Genre**

In [22]:
# Global sales per (genre, title); rows of the same title in the same genre
# (e.g. one row per platform) are added together. The column is selected
# before summing so text/categorical columns are never aggregated.
sales_by_genre = df.groupby(['Genre', 'Name'])['Global_Sales'].sum().reset_index()

In [87]:
import random
# NOTE(review): this wildcard import shadows builtins such as sum/min/max/round
# with their numpy versions; it is kept only in case other cells rely on it.
from numpy import *

genres = sales_by_genre.Genre.unique()

# One hue per genre, evenly spread around the colour wheel. endpoint=False
# keeps 360 out of the palette: hue 360 is the same colour as hue 0, so
# including it gave the first and the last genre identical boxes.
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, len(genres), endpoint=False)]

# One box per genre showing the distribution of per-title global sales.
traces = []
for i, genre in enumerate(genres):
    df_genre = sales_by_genre[sales_by_genre.Genre == genre]
    trace = go.Box(
        y=np.array(df_genre.Global_Sales),
        name=genre,
        boxmean=True,
        marker={'color': c[i]},
    )
    traces.append(trace)

layout = go.Layout(
    title='Sales by Genre (A lot of outliers)',
    showlegend=False,
    yaxis=dict(title='Sales (in Millions)'),
    height=700, width=700,
    margin=go.Margin(l=100, r=50, b=100, t=100, pad=4),
)

fig = go.Figure(data=traces, layout=layout)
plotly.offline.iplot(fig)


In [88]:
# The outliers are like:
# (ten best-selling titles; 'Global_Sales' is selected before summing so the
# text/categorical columns are never aggregated)
df.groupby(['Genre', 'Name'])[['Global_Sales']].\
         sum().\
         sort_values(by="Global_Sales", ascending=False).\
         reset_index()[:10]

Unnamed: 0,Genre,Name,Global_Sales
0,Sports,Wii Sports,82.74
1,Action,Grand Theft Auto V,55.92
2,Platform,Super Mario Bros.,45.31
3,Puzzle,Tetris,35.84
4,Racing,Mario Kart Wii,35.82
5,Sports,Wii Sports Resort,33.0
6,Role-Playing,Pokemon Red/Pokemon Blue,31.37
7,Shooter,Call of Duty: Black Ops,31.03
8,Shooter,Call of Duty: Modern Warfare 3,30.83
9,Platform,New Super Mario Bros.,30.01


In [91]:
# After delete outlier
# Each genre keeps only the titles strictly below its own 95th-percentile sales.

PERCENTAGE = 0.95

layout = go.Layout(
    title='Sales by Genre (Less outliers)',
    showlegend=False,
    yaxis={'title': 'Sales (in Millions)'},
    width=700,
    height=700,
    margin=go.Margin(l=100, r=50, b=100, t=100, pad=4),
)

traces = []
for i, genre in enumerate(genres):
    df_genre = sales_by_genre[sales_by_genre.Genre == genre]
    cutoff = df_genre.Global_Sales.quantile(PERCENTAGE)
    df_genre = df_genre[df_genre.Global_Sales < cutoff]

    trace = go.Box(
        name=genre,
        y=np.array(df_genre.Global_Sales),
        marker={'color': c[i]},
        boxmean=True,
    )
    traces.append(trace)

fig = go.Figure(data=traces, layout=layout)
plotly.offline.iplot(fig)


Next, let's check the sales distribution of the top 1% of games in each genre.

In [25]:
# Top sellers only: each genre keeps the titles strictly above its own
# 99th-percentile sales.

PERCENTAGE = 0.99

layout = go.Layout(
    title='Sales by Genre (TOP 1% games)',
    showlegend=False,
    yaxis={'title': 'Sales (in Millions)'},
)

traces = []
for i, genre in enumerate(genres):
    df_genre = sales_by_genre[sales_by_genre.Genre == genre]
    cutoff = df_genre.Global_Sales.quantile(PERCENTAGE)
    df_genre = df_genre[df_genre.Global_Sales > cutoff]

    trace = go.Box(
        name=genre,
        y=np.array(df_genre.Global_Sales),
        marker={'color': c[i]},
        boxmean=True,
    )
    traces.append(trace)

fig = go.Figure(data=traces, layout=layout)
plotly.offline.iplot(fig)


**Sales of games by Publisher**

In [27]:
# Ten publishers with the highest total global sales, in ascending order of
# sales. 'Global_Sales' is selected before summing so text/categorical columns
# are never aggregated.
publisher_totals = df.groupby('Publisher')['Global_Sales'].sum()
top10_publishers = np.array(
    publisher_totals.sort_values(ascending=True).tail(n=10).index
)

# Vectorised membership test instead of a Python-level list of booleans.
top10_df = df[df.Publisher.isin(top10_publishers)]

# Global sales per (publisher, title) for those ten publishers.
sales_by_publisher = (
    top10_df.groupby(['Publisher', 'Name'])['Global_Sales']
            .sum()
            .reset_index()
)

In [92]:
# Majority of games: each publisher keeps only the titles strictly below its
# own 90th-percentile sales.
PERCENTAGE = 0.9

layout = go.Layout(
    title='Sales by Publisher (Majority Games)',
    showlegend=False,
    yaxis={'title': 'Sales (in Millions)'},
    width=700,
    height=700,
    margin=go.Margin(l=100, r=50, b=100, t=100, pad=4),
)

traces = []
for i, publisher in enumerate(top10_publishers):
    df_pub = sales_by_publisher[sales_by_publisher.Publisher == publisher]
    cutoff = df_pub.Global_Sales.quantile(PERCENTAGE)
    df_pub = df_pub[df_pub.Global_Sales < cutoff]

    trace = go.Box(
        name=publisher,
        y=np.array(df_pub.Global_Sales),
        marker={'color': c[i]},
        boxmean=True,
    )
    traces.append(trace)

fig = go.Figure(data=traces, layout=layout)
plotly.offline.iplot(fig)

However, in the game industry only the top games are extremely profitable, so let's look at the top games of these top publishers.

In [93]:
# Top sellers only: each publisher keeps the titles strictly above its own
# 95th-percentile sales; every remaining title is drawn as a point.
PERCENTAGE = 0.95

layout = go.Layout(
    title='Sales by Publisher (TOP 5% Games)',
    showlegend=False,
    yaxis={'title': 'Sales (in Millions)'},
    width=700,
    height=700,
    margin=go.Margin(l=100, r=50, b=100, t=100, pad=4),
)

traces = []
for i, publisher in enumerate(top10_publishers):
    df_pub = sales_by_publisher[sales_by_publisher.Publisher == publisher]
    cutoff = df_pub.Global_Sales.quantile(PERCENTAGE)
    df_pub = df_pub[df_pub.Global_Sales > cutoff]

    trace = go.Box(
        name=publisher,
        y=np.array(df_pub.Global_Sales),
        marker={'color': c[i]},
        boxmean=True,
        boxpoints='all',
    )
    traces.append(trace)

fig = go.Figure(data=traces, layout=layout)
plotly.offline.iplot(fig)

The top titles of Nintendo, Activision and Take-Two Interactive have the strongest earning power.

In [30]:
# Yearly sales totals for each region and worldwide. The sales columns are
# selected before summing so text/categorical columns are never aggregated,
# and the deprecated `axis` keyword of groupby is no longer passed.
region_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
sales_by_year = df.groupby('Year')[region_columns].sum().reset_index()
sales_by_year['Year'] = sales_by_year['Year'].astype('int')

In [31]:
# Keep the years up to and including 2016.
sales_by_year = sales_by_year.loc[sales_by_year['Year'] <= 2016]

In [95]:
def _area_trace(column, fill='tonexty'):
    """Return an outline-free filled series of one `sales_by_year` column over the years."""
    return go.Scatter(
        x=sales_by_year.Year,
        y=sales_by_year[column],
        mode='none',
        name=column,
        fill=fill,
    )


# The first trace drawn fills down to the x axis; every later one fills
# towards the trace drawn just before it.
trace_Other = _area_trace('Other_Sales', fill='tozeroy')
trace_JP = _area_trace('JP_Sales')
trace_EU = _area_trace('EU_Sales')
trace_NA = _area_trace('NA_Sales')
trace_Global = _area_trace('Global_Sales')

layout = go.Layout(
    title='Sales by Region',
    xaxis={'title': 'Year'},
    yaxis={'title': 'Sales (in Millions)'},
    width=800,
    height=700,
    margin=go.Margin(l=100, r=50, b=100, t=100, pad=4),
)

fig = go.Figure(data=[trace_Other, trace_JP, trace_EU, trace_NA, trace_Global], layout=layout)
plotly.offline.iplot(fig)

- North America has always been the biggest market for video games.
- Sales in other regions are booming.

<a id='Viz - Multivariate'>5. Viz - Multivariate</a>

**Regional Sales by Genre across years (how genres change in each region)**

I will use the function below to build the traces for plotly.

In [98]:
# Get list of unique genres
# Sorted and then reversed, i.e. in descending alphabetical order. This
# module-level array is read by the trace-building functions below.
genres = np.sort(df.Genre.unique())[::-1]

def get_traces(df, region):
    """Build one bar trace per genre holding the yearly sales total of `region`.

    Parameters
    ----------
    df : pandas.DataFrame
        Sales table; its 'Genre', 'Year' and `region` columns are read.
    region : str
        Name of the sales column to total, e.g. 'Global_Sales'.

    Returns
    -------
    list of go.Bar
        One trace per entry of the module-level `genres`, in that order. Each
        trace covers the years 1980-2017; a (genre, year) pair with no data is
        reported as 0 and every total is rounded to 2 decimals.
    """
    years = list(range(1980, 2018))

    # A single pivot replaces one DataFrame filter per (genre, year) pair.
    # The column is selected before summing so that text/categorical columns
    # are never aggregated, and missing pairs become explicit zeros instead of
    # relying on a bare `except` that would also hide unrelated errors.
    yearly = (
        df.groupby(['Genre', 'Year'])[region]
          .sum()
          .unstack('Year')
          .reindex(index=genres, columns=years)
          .fillna(0)
          .round(2)
    )

    # Plain lists are handed to plotly (not range / dict-view objects).
    return [
        go.Bar(x=years, y=yearly.loc[genre].tolist(), name=genre)
        for genre in genres
    ]

*Global*

In [100]:
# Stacked yearly bars of global sales, one trace per genre.
layout = go.Layout(
    title='Sales change in Global',
    barmode='stack',
    xaxis={'title': 'Year'},
    yaxis={'title': 'Sales (in Millions)'},
    width=800,
    height=700,
    margin=go.Margin(l=100, r=50, b=100, t=100, pad=4),
)
data = get_traces(df, 'Global_Sales')

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In the global market:
- Sales of Action and Shooter games are increasing
- Sales of Music, Sports, Fighting, Racing and Puzzle games are decreasing
- Much less revenue was generated by Strategy, Puzzle and Racing games

In *North America*

In [101]:
# Stacked yearly bars of North American sales, one trace per genre.
layout = go.Layout(
    title='Sales change in North America',
    barmode='stack',
    xaxis={'title': 'Year'},
    yaxis={'title': 'Sales (in Millions)'},
    width=800,
    height=700,
    margin=go.Margin(l=100, r=50, b=100, t=100, pad=4),
)
data = get_traces(df, 'NA_Sales')

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

North America has distribution pretty similar to Global market, because it takes up most of global sales. NA market tends to prefer Action and Shooter games to other games.

In *Japan*

In [102]:
# Stacked yearly bars of Japanese sales, one trace per genre.
layout = go.Layout(
    title='Sales change in Japan',
    barmode='stack',
    xaxis={'title': 'Year'},
    yaxis={'title': 'Sales (in Millions)'},
    width=800,
    height=700,
    margin=go.Margin(l=100, r=50, b=100, t=100, pad=4),
)
data = get_traces(df, 'JP_Sales')

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In Japan, besides Action games, Role-Playing games attract the most revenue, which is quite different from the NA market.

In *Europe*

In [103]:
# Stacked yearly bars of European sales, one trace per genre.
layout = go.Layout(
    title='Sales change in Europe',
    barmode='stack',
    xaxis={'title': 'Year'},
    yaxis={'title': 'Sales (in Millions)'},
    width=800,
    height=700,
    margin=go.Margin(l=100, r=50, b=100, t=100, pad=4),
)
data = get_traces(df, 'EU_Sales')

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

European players tend to have tastes similar to those of North American players.

In *Other Regions*

In [104]:
# Stacked yearly bars of sales in the remaining regions, one trace per genre.
layout = go.Layout(
    title='Sales change in Other (not JP, NA, EU)',
    barmode='stack',
    xaxis={'title': 'Year'},
    yaxis={'title': 'Sales (in Millions)'},
    width=800,
    height=700,
    margin=go.Margin(l=100, r=50, b=100, t=100, pad=4),
)
data = get_traces(df, 'Other_Sales')

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

Sports and shooter games are booming in these regions.

- Sales **Percentage** of genres over time (How each market grows)

I slightly modify the previous function so that it returns percentage traces.

In [39]:
def get_percent_traces(df, region):
    """Build one bar trace per genre holding its yearly share of `region` sales.

    Parameters
    ----------
    df : pandas.DataFrame
        Sales table; its 'Genre', 'Year' and `region` columns are read.
    region : str
        Name of the sales column to analyse, e.g. 'Global_Sales'.

    Returns
    -------
    list of go.Bar
        One trace per entry of the module-level `genres`, in that order. Each
        value is the genre's percentage (0-100, rounded to 2 decimals) of that
        year's total `region` sales. The x axis covers 1980-2017, but only
        years before 2017 carry data; anything without data is reported as 0.
    """
    years = list(range(1980, 2018))

    # Genre x year totals. The column is selected before summing so that
    # text/categorical columns are never aggregated.
    totals = (
        df[df['Year'] < 2017]
        .groupby(['Genre', 'Year'])[region]
        .sum()
        .unstack('Year')
    )

    # Divide every year column by that year's total over all genres. This
    # vectorised form replaces a groupby.apply whose result index differs
    # between pandas versions.
    shares = totals.div(totals.sum(axis=0), axis='columns') * 100

    # Missing pairs, and years whose total is 0 (0/0 -> NaN), become 0.
    shares = shares.reindex(index=genres, columns=years).fillna(0).round(2)

    # Plain lists are handed to plotly (not range / dict-view objects).
    return [
        go.Bar(x=years, y=shares.loc[genre].tolist(), name=genre)
        for genre in genres
    ]

*Global*

In [105]:
# Stacked bars of each genre's yearly share of global sales.
data = get_percent_traces(df, 'Global_Sales')
layout = go.Layout(
    barmode='stack',
    title='Sales Percentage of Genres over Years in Global',
    xaxis=dict(title='Year'),
    # The plotted values are percentages, not sales in millions.
    yaxis=dict(title='Share of Sales (%)'),
    height=700, width=800,
    margin=go.Margin(l=100, r=50, b=100, t=100, pad=4),
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

By percentage of genres:
- Action and Shooter are both increasing rapidly
- Racing, puzzle, music, and strategy games are disappearing

*North America*

In [106]:
# Stacked bars of each genre's yearly share of North American sales.
data = get_percent_traces(df, 'NA_Sales')
layout = go.Layout(
    barmode='stack',
    title='Sales Percentage of Genres over Years in North America',
    xaxis=dict(title='Year'),
    # The plotted values are percentages, not sales in millions.
    yaxis=dict(title='Share of Sales (%)'),
    height=700, width=800,
    margin=go.Margin(l=100, r=50, b=100, t=100, pad=4),
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

*Japan*

In [42]:
# Stacked bars of each genre's yearly share of Japanese sales.
data = get_percent_traces(df, 'JP_Sales')
layout = go.Layout(
    barmode='stack',
    title='Sales Percentage of Genres over Years in Japan',
    xaxis=dict(title='Year'),
    # The plotted values are percentages, not sales in millions.
    yaxis=dict(title='Share of Sales (%)'),
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In Japan, RPG has always been the most popular genre, and Action games are booming.

In *Europe*

In [43]:
# Stacked bars of each genre's yearly share of European sales.
data = get_percent_traces(df, 'EU_Sales')
layout = go.Layout(
    barmode='stack',
    title='Sales Percentage of Genres over Years in Europe',
    xaxis=dict(title='Year'),
    # The plotted values are percentages, not sales in millions.
    yaxis=dict(title='Share of Sales (%)'),
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

Europe's genre mix is quite similar to that of the NA market.

In *Other regions*

In [44]:
# Stacked bars of each genre's yearly share of sales in the remaining regions.
data = get_percent_traces(df, 'Other_Sales')
layout = go.Layout(
    barmode='stack',
    title='Sales Percentage of Genres over Years in Other regions',
    xaxis=dict(title='Year'),
    # The plotted values are percentages, not sales in millions.
    yaxis=dict(title='Share of Sales (%)'),
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [45]:
# Number of distinct values in the Publisher column.
len(df.Publisher.unique())

579

**Sales by genre, publisher**

In [46]:
# Preferred genres of top-5-sale publishers
# Genres in ascending alphabetical order for the x axis. They are recomputed
# from the data (rather than reversing the existing array in place) so that
# re-running this cell cannot flip the order again.
genres = np.sort(df.Genre.unique())

def get_traces_genre_publisher(region):
    """Build one bar trace per top-5 publisher with its `region` sales by genre.

    The five publishers are those with the highest total 'Global_Sales' in the
    module-level `df`, whatever `region` is requested.

    Parameters
    ----------
    region : str
        Name of the sales column to total, e.g. 'NA_Sales'.

    Returns
    -------
    list of go.Bar
        One trace per publisher (ascending by global sales). Each trace has
        one value per entry of the module-level `genres`, in that order; a
        genre in which the publisher has no title is reported as 0.
    """
    # The column is selected before summing so that text/categorical columns
    # are never aggregated.
    publisher_totals = df.groupby('Publisher')['Global_Sales'].sum()
    top5_publishers = np.array(
        publisher_totals.sort_values(ascending=True).tail(n=5).index
    )

    top5_df = df[df.Publisher.isin(top5_publishers)]

    # Publisher x genre table aligned on `genres` by label. Pairing the raw
    # groupby rows with `genres` by position mislabels bars as soon as a
    # publisher lacks a genre or `genres` is not in groupby order.
    by_genre = (
        top5_df.groupby(['Publisher', 'Genre'])[region]
               .sum()
               .unstack('Genre')
               .reindex(columns=genres)
               .fillna(0)
    )

    return [
        go.Bar(x=genres, y=by_genre.loc[publisher].values, name=publisher)
        for publisher in top5_publishers
    ]



In [47]:
# Grouped bars: global sales per genre for the five top-selling publishers.
layout = go.Layout(
    title='Global Sales by Genre and Publisher',
    barmode='group',
    xaxis={'tickangle': -45},
    yaxis={'title': 'Sales (in Millions)'},
)
data = get_traces_genre_publisher('Global_Sales')

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In the global market:
- Nintendo focuses more on Platform, RPG and Sports games
- EA focuses more on Sports, Shooter and Racing games
- Activision earns most of its money from Shooter games

Take a look at the top games of these publishers:

In [48]:
# Top 5 games of Nintendo
(
    df.loc[df['Publisher'] == 'Nintendo', ['Publisher', 'Name', 'Global_Sales']]
      .sort_values(by='Global_Sales', ascending=False)
      .head(5)
)

Unnamed: 0,Publisher,Name,Global_Sales
0,Nintendo,Wii Sports,82.74
1,Nintendo,Super Mario Bros.,40.24
2,Nintendo,Mario Kart Wii,35.82
3,Nintendo,Wii Sports Resort,33.0
4,Nintendo,Pokemon Red/Pokemon Blue,31.37


In [49]:
# Top 5 games of EA
(
    df.loc[df['Publisher'] == 'Electronic Arts', ['Publisher', 'Name', 'Global_Sales']]
      .sort_values(by='Global_Sales', ascending=False)
      .head(5)
)

Unnamed: 0,Publisher,Name,Global_Sales
77,Electronic Arts,FIFA 16,8.49
82,Electronic Arts,FIFA Soccer 13,8.24
83,Electronic Arts,The Sims 3,8.11
92,Electronic Arts,Star Wars Battlefront (2015),7.67
99,Electronic Arts,Battlefield 3,7.34


In [50]:
# Top 5 games of Activision
(
    df.loc[df['Publisher'] == 'Activision', ['Publisher', 'Name', 'Global_Sales']]
      .sort_values(by='Global_Sales', ascending=False)
      .head(5)
)

Unnamed: 0,Publisher,Name,Global_Sales
29,Activision,Call of Duty: Modern Warfare 3,14.76
31,Activision,Call of Duty: Black Ops,14.64
33,Activision,Call of Duty: Black Ops 3,14.24
34,Activision,Call of Duty: Black Ops II,14.03
35,Activision,Call of Duty: Black Ops II,13.73


In [52]:
# Grouped bars: North American sales per genre for the five top-selling publishers.
layout = go.Layout(
    title='North America - Sales by Genre and Publisher',
    barmode='group',
    xaxis={'tickangle': -45},
    yaxis={'title': 'Sales (in Millions)'},
)
data = get_traces_genre_publisher('NA_Sales')

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [53]:
# Grouped bars: Japanese sales per genre for the five top-selling publishers.
layout = go.Layout(
    title='Japan - Sales by Genre and Publisher',
    barmode='group',
    xaxis={'tickangle': -45},
    yaxis={'title': 'Sales (in Millions)'},
)
data = get_traces_genre_publisher('JP_Sales')

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In Japan, almost every genre is dominated by the local publishers, Nintendo and Sony.

In [54]:
# Grouped bars: European sales per genre for the five top-selling publishers.
layout = go.Layout(
    title='Europe - Sales by Genre and Publisher',
    barmode='group',
    xaxis={'tickangle': -45},
    yaxis={'title': 'Sales (in Millions)'},
)
data = get_traces_genre_publisher('EU_Sales')

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In Europe, EA's Sports games were the best-selling games, followed by Activision's Shooter games.

In [55]:
# Grouped bars: sales in the remaining regions per genre for the five top-selling publishers.
layout = go.Layout(
    title='Other Regions - Sales by Genre and Publisher',
    barmode='group',
    xaxis={'tickangle': -45},
    yaxis={'title': 'Sales (in Millions)'},
)
data = get_traces_genre_publisher('Other_Sales')

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

<a id = 'Conclusion'> 6. Conclusion </a>

1. The global game market is growing
2. North America and Europe have similar tastes in games, while Japan is different, with RPGs taking a larger market share
3. The top 5 publishers compete in every genre, but each has genres in which it is strongest