# Data Analysis for Video Game Sales 
## Karlo Mangubat
## 20 Jan 2021


In [1]:
import pandas as pd

In [2]:
# Load the video game sales CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='./vgsales.csv')

### Here is the dataset we are looking at:

In [3]:
# Bare expression: the notebook renders the DataFrame (truncated in the middle) as this cell's output.
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


### Which company is the most common video game publisher?
 

In [42]:
# mode() returns a Series of the most frequent value(s); both names refer to
# that same Series, and the first entry is the answer shown below.
most_common_publisher_name = most_common_publisher = df.loc[:, 'Publisher'].mode()
most_common_publisher_name.iloc[0]

'Electronic Arts'

### What’s the most common platform?

In [46]:
# mode() returns a Series of the most frequent value(s); both names refer to
# that same Series, and the first entry is the answer shown below.
most_common_platform_value = most_common_platform = df.loc[:, 'Platform'].mode()
most_common_platform_value.iloc[0]

'DS'

### What about the most common genre?

In [48]:
# Series of the most frequent genre(s); its first entry is kept as a plain value.
most_common_genre = df.loc[:, 'Genre'].mode()
most_common_genre_value = most_common_genre.iloc[0]
most_common_genre_value

'Action'

### What are the top 20 highest grossing games?

In [7]:
# Pick the 20 rows with the largest Global_Sales explicitly, instead of taking
# the first 20 rows of the file: head(20) is only correct when the CSV happens
# to be ordered by Global_Sales already. nlargest returns the rows in
# descending Global_Sales order and keeps their original index labels.
top_twenty_highest_grossing_games = (
    df.nlargest(20, 'Global_Sales')[['Name', 'Global_Sales']]
)
top_twenty_highest_grossing_games

Unnamed: 0,Name,Global_Sales
0,Wii Sports,82.74
1,Super Mario Bros.,40.24
2,Mario Kart Wii,35.82
3,Wii Sports Resort,33.0
4,Pokemon Red/Pokemon Blue,31.37
5,Tetris,30.26
6,New Super Mario Bros.,30.01
7,Wii Play,29.02
8,New Super Mario Bros. Wii,28.62
9,Duck Hunt,28.31


### For North American video game sales, what’s the median?
- Provide a secondary output showing ten games surrounding the median sales value.
    - Assume that games with the same median value are sorted in descending order.

In [53]:
# Median of North American sales.
# Only the numeric NA_Sales column is selected: calling median() on a frame
# that also contains the string column 'Name' raises a TypeError on
# pandas >= 2.0. Sorting beforehand is unnecessary because the median does not
# depend on row order.
na_median_sales = df[['NA_Sales']].median()
# The result is a Series labelled by column name, so read it by label rather
# than by position.
na_median_sales_value = na_median_sales['NA_Sales']
na_median_sales_value

0.08

In [59]:
# Ten games whose NA sales equal the median computed in the previous cell,
# listed by descending Rank. The median variable is used instead of a
# hard-coded 0.08 so this cell stays correct if the data changes.
# NOTE: exact equality only matches when the median is a value that actually
# occurs in the column (not an average of two different middle values).
is_median_seller = df['NA_Sales'] == na_median_sales_value
ten_median_na_seller_names = (
    df.loc[is_median_seller, ['Rank', 'Name', 'NA_Sales']]
    .sort_values('Rank', ascending=False)
    .head(10)
)
ten_median_na_seller_names

Unnamed: 0,Rank,Name,NA_Sales
11492,11494,Ultimate Shooting Collection,0.08
11455,11457,The Hidden,0.08
11432,11434,DanceDanceRevolution,0.08
11431,11433,Little League World Series Baseball: Double Play,0.08
11403,11405,My English Coach: Para Hispanoparlantes,0.08
11390,11392,Super Robot Taisen OG Saga: Endless Frontier,0.08
11386,11388,Sushi Academy,0.08
11376,11378,Face Racers: Photo Finish,0.08
11375,11377,Dream Day: Wedding Destinations,0.08
11371,11373,Death Jr. and the Science Fair of Doom,0.08


### For the top-selling game of all time, how many standard deviations above/below the mean are its sales for North America?

#### First, we need to find the North American sales of the top-selling game:

In [9]:
# NA sales of the game with the highest Global_Sales.
# The top seller is selected by value rather than by assuming it is the first
# row of the file. The result stays a one-element Series whose index is reset
# to 0, because later cells read it (and values derived from it) with `[0]`.
top_selling_game = (
    df.nlargest(1, 'Global_Sales')['NA_Sales'].reset_index(drop=True)
)
top_selling_game.iloc[0]

41.49

#### Then, we find the average NA sales:

In [10]:
# Arithmetic mean of the NA_Sales column across all rows of df.
average_NA_Sales = df.loc[:, 'NA_Sales'].mean()
average_NA_Sales

0.26466742981082064

#### Lastly, find the standard deviation for NA Sales:

In [11]:
# Sample standard deviation of NA_Sales (ddof=1 is the pandas default, spelled out here).
standard_deviation_NA_Sales = df.loc[:, 'NA_Sales'].std(ddof=1)
standard_deviation_NA_Sales

0.8166830292988796

#### To calculate how many standard deviations the top-selling game is from the mean, we subtract the average NA sales from the top-selling game's NA sales and divide the difference by the standard deviation: $z = (x - \mu) / \sigma$. So,

In [61]:
# z-score: (top seller's NA sales - mean NA sales) / standard deviation.
# top_selling_game is a one-element Series, so the result is one as well.
deviation_from_mean = top_selling_game - average_NA_Sales
standard_deviations_above_mean = deviation_from_mean / standard_deviation_NA_Sales
# Pull the single value out by its index label 0.
standard_deviations_above_mean_value = standard_deviations_above_mean.loc[0]
standard_deviations_above_mean_value


50.47898767479108

### The Nintendo Wii seems to have outdone itself with games. How does its average number of sales compare with all of the other platforms?

In [67]:
# Mean Global_Sales per platform; as_index=False keeps Platform as a regular
# column, so the result has one row per platform with a 0..n-1 index.
sales_by_platform = df.groupby('Platform', as_index=False)
compare_average = sales_by_platform['Global_Sales'].mean()
compare_average


Unnamed: 0,Platform,Global_Sales
0,2600,0.729925
1,3DO,0.033333
2,3DS,0.486169
3,DC,0.307115
4,DS,0.380254
5,GB,2.606633
6,GBA,0.38747
7,GC,0.358561
8,GEN,1.05037
9,GG,0.04


### Below, we are comparing the average sales for PS4 and Wii respectively:

In [69]:
# Compare the PS4 average with the Wii average using platform labels.
# The previous version relied on magic row positions and compared against
# row 0, which the table above shows is platform '2600', not Wii.
average_sales_by_platform = compare_average.set_index('Platform')['Global_Sales']
# Average Global_Sales for PS4; the test cell reads this variable.
compare_average_value = average_sales_by_platform['PS4']
compare_average_value >= average_sales_by_platform['Wii']

True

### New Question 1: Show me the counts for each platform

In [87]:
# Number of rows per platform, most frequent platform first.
platform_counts = df.loc[:, 'Platform'].value_counts()
platform_counts


DS      2163
PS2     2161
PS3     1329
Wii     1325
X360    1265
PSP     1213
PS      1196
PC       960
XB       824
GBA      822
GC       556
3DS      509
PSV      413
PS4      336
N64      319
SNES     239
XOne     213
SAT      173
WiiU     143
2600     133
GB        98
NES       98
DC        52
GEN       27
NG        12
SCD        6
WS         6
3DO        3
TG16       2
GG         1
PCFX       1
Name: Platform, dtype: int64

In [88]:
# Look the X360 count up by its label. The positional `[4]` only worked because
# X360 happened to be the fifth most common platform, and integer-position
# access through [] on a string-labelled Series is deprecated in recent pandas.
platform_counts_for_x360 = platform_counts.loc['X360']
platform_counts_for_x360

1265

### New Question 2: How many Electronic Arts games ended up on the list?

In [90]:
# Number of rows per publisher, most frequent publisher first.
publisher_counts = df['Publisher'].value_counts()
# Backward-compatible alias: this cell originally stored the publisher counts
# under the name `platform_counts`, and the following cell reads that name.
# NOTE: this overwrites the per-platform counts computed earlier.
platform_counts = publisher_counts
publisher_counts

Electronic Arts                 1351
Activision                       975
Namco Bandai Games               932
Ubisoft                          921
Konami Digital Entertainment     832
                                ... 
CokeM Interactive                  1
Elite                              1
Monte Christo Multimedia           1
Paon Corporation                   1
Max Five                           1
Name: Publisher, Length: 578, dtype: int64

In [92]:
# Count the Electronic Arts rows directly by label. The previous positional
# `[0]` depended on Electronic Arts being the most common publisher and on a
# variable named `platform_counts` holding publisher data at this point.
# Rows with a missing Publisher never compare equal, so they are not counted.
platform_counts_value = df['Publisher'].eq('Electronic Arts').sum()
platform_counts_value

1351

### New Question 3: What is the 100th game on the list?

In [39]:
# Boolean mask selecting the row(s) whose Rank is exactly 100; the result is a DataFrame.
is_rank_100 = df['Rank'].eq(100)
game_on_100th_spot = df[is_rank_100]
game_on_100th_spot

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
99,100,Battlefield 3,X360,2011.0,Shooter,Electronic Arts,4.46,2.13,0.06,0.69,7.34


In [94]:
def test():
    """Sanity-check the answers computed in the cells above.

    Reads the notebook-level result variables, raises AssertionError with an
    "Expected ... but got ..." message on the first mismatch, and prints
    "Success!!!" when every check passes.
    """
    import math  # local import: only the float comparisons below need it

    def assert_equal(actual, expected):
        """Assert exact equality (strings, integers, booleans)."""
        assert actual == expected, f"Expected {expected} but got {actual}"

    def assert_close(actual, expected):
        """Assert equality of floats within a small relative tolerance."""
        # Exact == on computed floats is brittle; the last bits may differ
        # between environments, so compare with a tolerance instead.
        assert math.isclose(actual, expected, rel_tol=1e-9), f"Expected {expected} but got {actual}"

    # Wii's average is looked up by platform label; row 0 of compare_average
    # is platform '2600' in the table shown above, not Wii.
    wii_average = compare_average.loc[
        compare_average['Platform'] == 'Wii', 'Global_Sales'
    ].iloc[0]

    assert_equal(most_common_publisher_name[0],'Electronic Arts')
    assert_equal(most_common_platform_value[0], 'DS')
    assert_equal(most_common_genre_value, 'Action')
    assert_equal(len(top_twenty_highest_grossing_games), 20)
    assert_equal(top_twenty_highest_grossing_games.iloc[0].Name, 'Wii Sports')
    assert_equal(top_twenty_highest_grossing_games.iloc[19].Name, 'Brain Age: Train Your Brain in Minutes a Day')
    assert_close(na_median_sales_value, 0.08)
    assert_equal(len(ten_median_na_seller_names), 10)
    assert_close(ten_median_na_seller_names.iloc[0].NA_Sales, 0.08)
    assert_close(standard_deviations_above_mean_value, 50.47898767479108)
    assert_equal(compare_average_value >= wii_average, True)
    assert_equal(platform_counts_for_x360, 1265)
    assert_equal(platform_counts_value, 1351)
    assert_equal(game_on_100th_spot.iloc[0].Name, 'Battlefield 3')
    
    print("Success!!!")

test()

Success!!!
