# Data Analysis with Pandas
## Video Game Sales
### Kyle Hoac, 06-02-2021

In [482]:
import pandas as pd


In [483]:
df = pd.read_csv('./vgsales.csv')

In [484]:
df.head(20)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


### Which company is the most common video game publisher?

In [485]:
most_common_publisher = df["Publisher"].mode()[0]
most_common_publisher

'Electronic Arts'

### What’s the most common platform?

In [486]:
most_common_platform = df["Platform"].mode()[0]
most_common_platform

'DS'

### What about the most common genre?

In [487]:
most_common_genre = df["Genre"].mode()[0]
most_common_genre

'Action'

### What are the top 20 highest grossing games?

In [488]:
top_twenty_highest_grossing_games = df[["Name","Global_Sales"]].head(20)
top_twenty_highest_grossing_games

Unnamed: 0,Name,Global_Sales
0,Wii Sports,82.74
1,Super Mario Bros.,40.24
2,Mario Kart Wii,35.82
3,Wii Sports Resort,33.0
4,Pokemon Red/Pokemon Blue,31.37
5,Tetris,30.26
6,New Super Mario Bros.,30.01
7,Wii Play,29.02
8,New Super Mario Bros. Wii,28.62
9,Duck Hunt,28.31


### For North American video game sales, what’s the median?

In [489]:
na_median_sales = df[["Rank","NA_Sales"]].median()
na_median_sales[0]

8300.5

In [490]:
med = df[["Rank","NA_Sales"]].median()
sort = df.sort_values("NA_Sales")
sort[df["NA_Sales"] == 0.08]

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
10630,10632,Jeopardy!,N64,1998.0,Misc,GameTek,0.08,0.02,0.00,0.00,0.10
6567,6569,Legendary,X360,2008.0,Shooter,Atari,0.08,0.15,0.00,0.03,0.26
10631,10633,Disney's Donald Duck: Goin' Quackers,N64,2000.0,Platform,Ubisoft,0.08,0.02,0.00,0.00,0.10
10073,10075,MySims SkyHeroes,PS3,2010.0,Action,Electronic Arts,0.08,0.02,0.00,0.01,0.11
10632,10634,Aidyn Chronicles: The First Mage,N64,2001.0,Role-Playing,THQ,0.08,0.02,0.00,0.00,0.10
...,...,...,...,...,...,...,...,...,...,...,...
6461,6463,Beyblade: Metal Fusion,DS,2009.0,Role-Playing,Hudson Soft,0.08,0.03,0.14,0.01,0.26
9793,9795,Atelier Annie: Alchemists of Sera Island,DS,2009.0,Role-Playing,Gust,0.08,0.00,0.03,0.01,0.12
10238,10240,Gabrielle's Ghostly Groove 3D,3DS,2011.0,Adventure,Funbox Media,0.08,0.02,0.00,0.01,0.11
8287,8289,MX World Tour Featuring Jamie Little,PS2,2005.0,Racing,Play It,0.08,0.07,0.00,0.02,0.17


### For the top-selling game of all time, how many standard deviations above/below the mean are its sales for North America?

In [491]:
max = df[["NA_Sales"]].max()[0]
std = df[["NA_Sales"]].std()[0]
mean = df[["NA_Sales"]].mean()[0]
answer = (max-mean)/std
print(answer)

50.47898767479108


In [492]:
mean_global = df[["Global_Sales"]].mean()[0]
wii = df[df["Platform"] == "Wii"]
wii_avg = wii[["Global_Sales"]].mean()[0]
sales_diff = wii_avg - mean_global
percent_diff = difference/mean_global * 100
print(f"The Wii sold {percent_diff} percent better than the global average of all plat")
print(f"Average global sales (in millions) = {mean_global}")
print(f"Average Wii global sales (in millions) = {wii_avg}")


The Wii sold 30.136000398660418 percent better than the global average of all plat
Average global sales (in millions) = 0.5374406555006628
Average Wii global sales (in millions) = 0.6994037735849057


### How does each regions sales compare to eachother?

In [493]:
na_mean = (df[["NA_Sales"]].mean()[0] * 1000000).round(2)
eu_mean = (df[["EU_Sales"]].mean()[0] * 1000000).round(2)
jp_mean = (df[["JP_Sales"]].mean()[0] * 1000000).round(2)
print(f"North America's average sales per game = ${na_mean}")
print(f"Europe's average sales per game = ${eu_mean}")
print(f"Japan's average sales per game = ${jp_mean}")

North America's average sales per game = $264667.43
Europe's average sales per game = $146652.01
Japan's average sales per game = $77781.66


In [494]:
def test():

    def assert_equal(actual,expected):
        assert actual == expected, f"Expected {expected} but got {actual}"

    assert_equal(most_common_publisher,'Electronic Arts')
    assert_equal(most_common_platform, 'DS')
    assert_equal(most_common_genre, 'Action')
    assert_equal(top_twenty_highest_grossing_games.iloc[0].Name, 'Wii Sports')
    assert_equal(top_twenty_highest_grossing_games.iloc[19].Name, 'Brain Age: Train Your Brain in Minutes a Day')
    assert_equal(na_median_sales, 8300.5)
    # assert_equal(ten_median_na_seller_names, None)

    print("Success!!!")

test()

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().