# Initialization & Data Preprocessing

In [9]:
import numpy as np
import pandas as pd
import plotly_express as px
from data_sweeper import DataSweeper # my custom class for cleaning data. It has limited functionality at the moment

# Load the raw games dataset from the CSV file
df = pd.read_csv(filepath_or_buffer='moved_games.csv')

# Per-column dtypes and non-null counts (printed as text)
df.info()

# Summary statistics for every column, numeric and non-numeric alike
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16713 non-null  object 
 1   Platform         16715 non-null  object 
 2   Year_of_Release  16446 non-null  float64
 3   Genre            16713 non-null  object 
 4   NA_sales         16715 non-null  float64
 5   EU_sales         16715 non-null  float64
 6   JP_sales         16715 non-null  float64
 7   Other_sales      16715 non-null  float64
 8   Critic_Score     8137 non-null   float64
 9   User_Score       10014 non-null  object 
 10  Rating           9949 non-null   object 
dtypes: float64(6), object(5)
memory usage: 1.4+ MB


Unnamed: 0,Name,Platform,Year_of_Release,Genre,NA_sales,EU_sales,JP_sales,Other_sales,Critic_Score,User_Score,Rating
count,16713,16715,16446.0,16713,16715.0,16715.0,16715.0,16715.0,8137.0,10014,9949
unique,11559,31,,12,,,,,,96,8
top,Need for Speed: Most Wanted,PS2,,Action,,,,,,tbd,E
freq,12,2161,,3369,,,,,,2424,3990
mean,,,2006.484616,,0.263377,0.14506,0.077617,0.047342,68.967679,,
std,,,5.87705,,0.813604,0.503339,0.308853,0.186731,13.938165,,
min,,,1980.0,,0.0,0.0,0.0,0.0,13.0,,
25%,,,2003.0,,0.0,0.0,0.0,0.0,60.0,,
50%,,,2007.0,,0.08,0.02,0.0,0.01,71.0,,
75%,,,2010.0,,0.24,0.11,0.04,0.03,79.0,,


In [11]:
# Show one randomly chosen row to get a feel for the raw records
df.sample(n=1)

Unnamed: 0,Name,Platform,Year_of_Release,Genre,NA_sales,EU_sales,JP_sales,Other_sales,Critic_Score,User_Score,Rating
7162,Danny Phantom: The Ultimate Enemy,GBA,2005.0,Action,0.16,0.06,0.0,0.0,,8.8,E


In [21]:
# Distribution of release years, including rows where the year is missing.
# The column names still have their original casing at this point (they are
# only lower-cased in the "Fixing Data" cell further down), so the raw
# `Year_of_Release` name is used to keep this cell runnable top to bottom
# on a fresh kernel.
df['Year_of_Release'].value_counts(dropna=False)

year_of_release
2008.0    1427
2009.0    1426
2010.0    1255
2007.0    1197
2011.0    1136
2006.0    1006
2005.0     939
2002.0     829
2003.0     775
2004.0     762
2012.0     653
2015.0     606
2014.0     581
2013.0     544
2016.0     502
2001.0     482
1998.0     379
2000.0     350
1999.0     338
1997.0     289
NaN        269
1996.0     263
1995.0     219
1994.0     121
1993.0      62
1981.0      46
1992.0      43
1991.0      41
1982.0      36
1986.0      21
1989.0      17
1983.0      17
1990.0      16
1987.0      16
1988.0      15
1985.0      14
1984.0      14
1980.0       9
Name: count, dtype: int64

#### The dataset that we will be working with today was provided by the online store, ICE, which sells video games all over the world. It contains historical data of games sold on their platform up to 2016. Our goal is to seek out and identify patterns in past game release successes (& failures) in order to (1) more accurately spot winners and avoid losers going forward and (2) optimize advertising campaign spend.

Let's take a look at each feature available to us one by one and decide if any cleaning needs to be done before beginning our analysis.

1. `name`: name of video game

- Data Type: Appropriate
- <span style="color:red;">**Missing Values: two missing values. I will remove these rows since there will be minimal effect on our results**</span>

2. `platform`: the console(s) the game was sold on

- Data Type: Appropriate
- Missing Values: None
- Observations: 31 different consoles with the "PS2" being the most common

3. `year_of_release`: the year the game first hit the market

- Data Type: Sufficient
- Missing Values: 269 missing values
- Observations: Ranges from 1980 - 2016

4. `genre`: the video game genre

- Data Type: Appropriate
- <span style="color:red;">**Missing Values: 2 missing values. These are likely missing due to there being lack of clarity on the genre of the game**</span>
- Observations: 12 different genres are found in ICE's data with "Action" being the most common

5. `na_sales`: North American sales of the game

- Data Type: Appropriate
- Missing Values: None
- Observations: Highest amount of units sold was around 41,000,000 

6. `eu_sales`: European sales of the game

- Data Type: Appropriate
- Missing Values: None
- Observations: Highest amount of units sold was around 29,000,000

7. `jp_sales`: Japanese sales of the game

- Data Type: Appropriate
- Missing Values: None
- Observations: Highest amount of units sold was around 10,000,000

8. `other_sales`: sales of the game that took place outside of the three aforementioned markets

- Data Type: Appropriate
- Missing Values: None

9. `critic_score`: score given to the game by critics

- Data Type: Appropriate
- Missing Values: Roughly half of the values are missing. We will leave them as is for now
- Observations: Critic scores ranged from 13-98 and the average score is in the high 60s

10. `user_score`: score given to the game by consumers

- <span style="color:red;">**Data Type: Inappropriate. I will change the data type of this feature to float**</span>
- Missing Values: Roughly 6,700 values are missing. We will leave them as is for now
- Observations: The most common value is "tbd" (2,424 rows). Since it is unlikely that we will get these scores from the users at this point, I will convert them to missing values by setting `errors='coerce'` when changing the data type of this column with `pd.to_numeric()`

11. `rating`: The official ESRB rating which states what age range the game is intended for

- Data Type: Appropriate
- Missing Values: Roughly 7,000 missing values. We will leave them as is for now

### Fixing Data

In [32]:
# Converting columns to lowercase & stripping them of any white space with my own module
df = DataSweeper(df).clean_columns()

# Changing the data type of `user_score` column to `float`.
# `errors='coerce'` turns the non-numeric "tbd" placeholders into NaN
# instead of raising, so they are treated like any other missing score.
df['user_score'] = pd.to_numeric(df['user_score'], errors='coerce')

# Confirming changes were made successfully
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             16713 non-null  object 
 1   platform         16715 non-null  object 
 2   year_of_release  16446 non-null  float64
 3   genre            16713 non-null  object 
 4   na_sales         16715 non-null  float64
 5   eu_sales         16715 non-null  float64
 6   jp_sales         16715 non-null  float64
 7   other_sales      16715 non-null  float64
 8   critic_score     8137 non-null   float64
 9   user_score       7590 non-null   float64
 10  rating           9949 non-null   object 
dtypes: float64(7), object(4)
memory usage: 1.4+ MB


### Enriching Data

In [33]:
# Total sales per game: the four regional sales columns added together
df['total_sales'] = (
    df['na_sales']
    .add(df['eu_sales'])
    .add(df['jp_sales'])
    .add(df['other_sales'])
)

# Peek at the first rows to verify the new column exists and looks sensible
df['total_sales'].head(5)

0    82.54
1    40.24
2    35.52
3    32.77
4    31.38
Name: total_sales, dtype: float64

# Exploratory Data Analysis