## Final Project

### Hypothesis
A game's console (platform) can be predicted from its genre, publisher, and global sales data.

### Steps

* Identify the Problem
* Acquire the Data
* Parse the Data
* Mine the Data
* Refine the Data
* Create a Data Model
* Present the Results

In [1]:
#imports
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw sales table; later cells copy from df_raw rather than editing it.
df_raw = pd.read_csv(filepath_or_buffer="vgsales.csv")

In [2]:
# Peek at the first five rows of the raw table
df_raw.head(n=5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [4]:
# Summary statistics for the numeric columns of the raw table.
# NOTE(review): in the recorded output Year has a lower count (16327) than the
# other columns (16598), so some rows appear to be missing a year.
df_raw.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16598.0,16327.0,16598.0,16598.0,16598.0,16598.0,16598.0
mean,8300.605254,2006.406443,0.264667,0.146652,0.077782,0.048063,0.537441
std,4791.853933,5.828981,0.816683,0.505351,0.309291,0.188588,1.555028
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4151.25,,0.0,0.0,0.0,0.0,0.06
50%,8300.5,,0.08,0.02,0.0,0.01,0.17
75%,12449.75,,0.24,0.11,0.04,0.04,0.47
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


In [5]:
# List the distinct release years to see which ones need cleaning:
# look for 2016, 2017, 2020 and NaN, whose rows are removed in later cells.

df_raw.Year.unique()

array([ 2006.,  1985.,  2008.,  2009.,  1996.,  1989.,  1984.,  2005.,
        1999.,  2007.,  2010.,  2013.,  2004.,  1990.,  1988.,  2002.,
        2001.,  2011.,  1998.,  2015.,  2012.,  2014.,  1992.,  1997.,
        1993.,  1994.,  1982.,  2003.,  1986.,  2000.,    nan,  1995.,
        2016.,  1991.,  1981.,  1987.,  1980.,  1983.,  2020.,  2017.])

In [48]:
# Work on an independent copy so df_raw stays intact while rows with no year,
# or with a year of 2016 and beyond, are removed.

data = df_raw.copy(deep=True)

In [49]:
# Preview the rows released in 2016 or later before they are removed

data.loc[data['Year'] > 2015]

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
221,222,FIFA 17,PS4,2016.0,Sports,Electronic Arts,0.28,3.75,0.06,0.69,4.77
271,272,Uncharted 4: A Thief's End,PS4,2016.0,Shooter,Sony Computer Entertainment,1.30,2.07,0.18,0.65,4.20
351,352,Tom Clancy's The Division,PS4,2016.0,Shooter,Ubisoft,1.28,1.61,0.15,0.57,3.61
770,772,Far Cry: Primal,PS4,2016.0,Action,Ubisoft,0.59,1.16,0.06,0.33,2.13
845,847,Tom Clancy's The Division,XOne,2016.0,Shooter,Ubisoft,1.20,0.62,0.00,0.18,2.01
1026,1028,Overwatch,PS4,2016.0,Shooter,Activision,0.64,0.68,0.14,0.26,1.73
1156,1158,No Man's Sky,PS4,2016.0,Action,Hello Games,0.58,0.74,0.02,0.26,1.60
1189,1191,Dark Souls III,PS4,2016.0,Role-Playing,Namco Bandai Games,0.58,0.44,0.33,0.21,1.56
1224,1226,FIFA 17,XOne,2016.0,Sports,Electronic Arts,0.17,1.26,0.00,0.10,1.53
1389,1391,Doom (2016),PS4,2016.0,Shooter,Bethesda Softworks,0.49,0.66,0.02,0.22,1.39


In [50]:
# Keep only the rows released before 2016.
# Rows with a missing Year are removed as well, because NaN < 2016 is False
# and the boolean mask therefore excludes them.

data = data.loc[data['Year'] < 2016]

In [51]:
# Confirm the drop: the Year max should now be 2015 and its count should match every other column

data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,15979.0,15979.0,15979.0,15979.0,15979.0,15979.0,15979.0
mean,8224.104262,2006.197071,0.26976,0.149093,0.079514,0.048892,0.547537
std,4775.81728,5.71481,0.82956,0.512557,0.314536,0.191612,1.580275
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4080.5,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8207.0,2007.0,0.08,0.02,0.0,0.01,0.18
75%,12332.5,2010.0,0.24,0.11,0.04,0.04,0.49
max,16600.0,2015.0,41.49,29.02,10.22,10.57,82.74


In [52]:
# The table above suggests Year has no NaN values left; confirm it directly by
# counting the nulls and listing the distinct years that remain.
# print() is called as a function so the cell runs on Python 3 as well as Python 2.

print(data['Year'].isnull().sum())
data['Year'].unique()

0


array([ 2006.,  1985.,  2008.,  2009.,  1996.,  1989.,  1984.,  2005.,
        1999.,  2007.,  2010.,  2013.,  2004.,  1990.,  1988.,  2002.,
        2001.,  2011.,  1998.,  2015.,  2012.,  2014.,  1992.,  1997.,
        1993.,  1994.,  1982.,  2003.,  1986.,  2000.,  1995.,  1991.,
        1981.,  1987.,  1980.,  1983.])

In [53]:
# List every platform code present so each one can be checked against the data dictionary

data.Platform.unique()

array(['Wii', 'NES', 'GB', 'DS', 'X360', 'PS3', 'PS2', 'SNES', 'GBA',
       '3DS', 'PS4', 'N64', 'PS', 'XB', 'PC', '2600', 'PSP', 'XOne', 'GC',
       'WiiU', 'GEN', 'DC', 'PSV', 'SAT', 'SCD', 'WS', 'NG', 'TG16', '3DO',
       'GG', 'PCFX'], dtype=object)

In [54]:
# Drop the regional sales columns so Global_Sales is the only sales column left.
# axis is passed by keyword: the positional form drop(labels, 1) is deprecated
# and no longer accepted by pandas 2.x.
data = data.drop(['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'], axis=1)

In [55]:
# Check the first five rows after the regional sales columns were dropped
data.head(n=5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,31.37
