### Import Libraries:

In [1]:
import pandas as pd
import numpy as np
import janitor

In [2]:
!pwd

/Users/Natifu/github_projects/python_games/cleaning_script


### Load in Data:

In [3]:
ps4_raw = pd.read_csv("../raw_data/ps4-game-sales.csv", encoding = "ISO-8859-1")

In [4]:
xbox_raw = pd.read_csv("../raw_data/xbox-one-game-sales.csv", encoding = "ISO-8859-1")

In [5]:
sales2016_raw = pd.read_csv("../raw_data/sales-2016-with-ratings.csv", encoding = "ISO-8859-1")

In [6]:
sales2019_raw = pd.read_csv("../raw_data/sales-2019.csv", encoding = "ISO-8859-1")

# 2016 Sales

### Check for missing values:

In [7]:
sales2016_raw.isnull().sum()

Name                  2
Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Developer          6623
Rating             6769
dtype: int64

### Replace missing values:

In [8]:
sales2016_raw["Name"].fillna("Unknown", inplace = True)
sales2016_raw["Year_of_Release"].fillna("Unknown", inplace = True)
sales2016_raw["Genre"].fillna("Other", inplace = True)
sales2016_raw["Publisher"].fillna("Unknown", inplace = True)

sales2016_raw.isnull().sum()

Name                  0
Platform              0
Year_of_Release       0
Genre                 0
Publisher             0
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Developer          6623
Rating             6769
dtype: int64

### Drop unwanted columns:

In [9]:
sales2016_clean = sales2016_raw.drop(["Critic_Score", "Critic_Count", "User_Score", "User_Count", "Developer", "Rating"], 
                   axis = 1)

sales2016_clean.isnull().sum()

Name               0
Platform           0
Year_of_Release    0
Genre              0
Publisher          0
NA_Sales           0
EU_Sales           0
JP_Sales           0
Other_Sales        0
Global_Sales       0
dtype: int64

### Clean column names:

In [10]:
sales2016_clean = sales2016_clean.clean_names()

### Rename platform column to 'console':

In [11]:
sales2016_clean.rename({"platform" : "console"}, 
                        axis = "columns",
                        inplace = True)

In [12]:
sales2016_clean["genre"].unique()

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy', 'Other'], dtype=object)

### Tidy up genre names:

In [13]:
sales2016_clean.replace({"genre" : {"Misc": "Other",
                                    "Action": "Action-Adventure",
                                    "Adventure": "Action-Adventure",
                                    "Strategy": "Other"}}, 
                                    inplace = True)

sales2016_clean["genre"].unique()

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Other',
       'Shooter', 'Simulation', 'Action-Adventure', 'Fighting'],
      dtype=object)

### Remove duplicates:

In [65]:
df = sales2016_clean.groupby(['name', 'console']).size().reset_index(name = 'count').sort_values('count', ascending = False)

df.loc[df[['count']] > 1]

ValueError: Cannot index with multidimensional key

In [39]:
sales2016_clean.loc[sales2016_clean.duplicated(subset='name')].sort_values('name')

Unnamed: 0,name,console,year_of_release,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
3862,Frozen: Olaf's Quest,DS,2013.0,Platform,Disney Interactive Studios,0.21,0.26,0.00,0.04,0.52
14660,007: Quantum of Solace,PC,2008.0,Action-Adventure,Activision,0.01,0.01,0.00,0.00,0.03
1785,007: Quantum of Solace,PS3,2008.0,Action-Adventure,Activision,0.43,0.51,0.02,0.19,1.14
3120,007: Quantum of Solace,Wii,2008.0,Action-Adventure,Activision,0.29,0.28,0.01,0.07,0.65
4475,007: Quantum of Solace,PS2,2008.0,Action-Adventure,Activision,0.17,0.00,0.00,0.26,0.43
...,...,...,...,...,...,...,...,...,...,...
12649,pro evolution soccer 2011,PC,2010.0,Sports,Konami Digital Entertainment,0.00,0.05,0.00,0.01,0.06
2583,pro evolution soccer 2011,PSP,2010.0,Sports,Konami Digital Entertainment,0.05,0.30,0.29,0.16,0.79
4664,pro evolution soccer 2011,PS2,2010.0,Sports,Konami Digital Entertainment,0.04,0.21,0.05,0.11,0.41
3304,pro evolution soccer 2011,X360,2010.0,Sports,Konami Digital Entertainment,0.09,0.44,0.00,0.07,0.61


# 2019 Sales

### Check for missing values:

In [15]:

(
sales2019_raw
    .isnull()
    .sum()
)


Rank                  0
Name                  0
basename              0
Genre                 0
ESRB_Rating       32169
Platform              0
Publisher             0
Developer            17
VGChartz_Score    55792
Critic_Score      49256
User_Score        55457
Total_Shipped     53965
Global_Sales      36377
NA_Sales          42828
PAL_Sales         42603
JP_Sales          48749
Other_Sales       40270
Year                979
Last_Update       46606
url                   0
status                0
Vgchartzscore     54993
img_url               0
dtype: int64

### Clean names:

In [16]:
sales2019_clean = sales2019_raw.clean_names()

In [17]:
list(sales2016_clean)

['name',
 'console',
 'year_of_release',
 'genre',
 'publisher',
 'na_sales',
 'eu_sales',
 'jp_sales',
 'other_sales',
 'global_sales']

In [18]:
list(sales2019_clean)

['rank',
 'name',
 'basename',
 'genre',
 'esrb_rating',
 'platform',
 'publisher',
 'developer',
 'vgchartz_score',
 'critic_score',
 'user_score',
 'total_shipped',
 'global_sales',
 'na_sales',
 'pal_sales',
 'jp_sales',
 'other_sales',
 'year',
 'last_update',
 'url',
 'status',
 'vgchartzscore',
 'img_url']

### Select relevant columns:

In [19]:
sales2019_clean = sales2019_clean[["name", "platform", "year", "genre", "publisher", "na_sales", "pal_sales", "jp_sales", "other_sales", "global_sales"]]

list(sales2019_clean)

['name',
 'platform',
 'year',
 'genre',
 'publisher',
 'na_sales',
 'pal_sales',
 'jp_sales',
 'other_sales',
 'global_sales']

### Change column names:

In [20]:
sales2019_clean.rename({"platform" : "console",
                        "year" : "year_of_release",
                        "pal_sales" : "eu_sales"}, 
                        axis = "columns",
                        inplace = True)
list(sales2019_clean)

['name',
 'console',
 'year_of_release',
 'genre',
 'publisher',
 'na_sales',
 'eu_sales',
 'jp_sales',
 'other_sales',
 'global_sales']

### Tidy up genre names:

In [21]:
sales2019_clean.replace({"genre" : {"Misc" : "Other",
                                    "Party" : "Other",
                                    "Action" : "Action-Adventure",
                                    "Strategy" : "Other",
                                    "Adventure" : "Action-Adventure",
                                    "MMO" : "Other",
                                    "Sandbox" : "Other",
                                    "Visual Novel" : "Other",
                                    "Board Game" : "Other",
                                    "Education" : "Other"}},
                                    inplace = True)

sales2019_clean["genre"].unique()



array(['Sports', 'Platform', 'Racing', 'Shooter', 'Role-Playing',
       'Puzzle', 'Other', 'Simulation', 'Action-Adventure', 'Fighting',
       'Music'], dtype=object)

### Check for missing values:

In [22]:
(
    sales2019_clean
    .isnull()
    .sum()
)

name                   0
console                0
year_of_release      979
genre                  0
publisher              0
na_sales           42828
eu_sales           42603
jp_sales           48749
other_sales        40270
global_sales       36377
dtype: int64

In [23]:
sales2019_clean["year_of_release"].fillna("Unknown", inplace = True)


sales2019_clean.isnull().sum()

name                   0
console                0
year_of_release        0
genre                  0
publisher              0
na_sales           42828
eu_sales           42603
jp_sales           48749
other_sales        40270
global_sales       36377
dtype: int64

In [24]:
sales2019_clean.dropna(inplace = True)

sales2019_clean.isnull().sum()

name               0
console            0
year_of_release    0
genre              0
publisher          0
na_sales           0
eu_sales           0
jp_sales           0
other_sales        0
global_sales       0
dtype: int64