In [1]:
# Importing the Panda DataFrame libraries

import warnings
warnings.filterwarnings('ignore')
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime 
import seaborn as sns
from scipy import stats
sns.set()

In [2]:
# Loading data into the DataFrame

df_sales = pd.read_csv('vgsales.csv')

In [3]:
# View the first five rows to know what the dataset looks like

df_sales.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


## Data Preparation
* Understanding the data variables
* Cleaning the dataset( missing data, redundant data, outliers)

In [4]:
# I want to do the characterisation of the data set: size; number of attributes; has/does not have missing values,number of observations.

# I want to know the number of attributes/ observations in the dataset

df_sales.shape

(16719, 16)

In [5]:
# Checking the data type of the attributes present in the dataset


df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16717 non-null  object 
 1   Platform         16719 non-null  object 
 2   Year_of_Release  16450 non-null  float64
 3   Genre            16717 non-null  object 
 4   Publisher        16665 non-null  object 
 5   NA_Sales         16719 non-null  float64
 6   EU_Sales         16719 non-null  float64
 7   JP_Sales         16719 non-null  float64
 8   Other_Sales      16719 non-null  float64
 9   Global_Sales     16719 non-null  float64
 10  Critic_Score     8137 non-null   float64
 11  Critic_Count     8137 non-null   float64
 12  User_Score       10015 non-null  object 
 13  User_Count       7590 non-null   float64
 14  Developer        10096 non-null  object 
 15  Rating           9950 non-null   object 
dtypes: float64(9), object(7)
memory usage: 2.0+ MB


In [6]:
# Check for missing values 

df_sales.isnull().sum()

Name                  2
Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Developer          6623
Rating             6769
dtype: int64

In [7]:
# Checking for duplicates

df_sales.duplicated().sum()

0

### Observations
* There are 16719 observations and 16 attributes.
* There are missing values
* The amount of missing values vary all through each columns of the dataset
* There are not duplicates in the dataset

#### Since there is not constant amount of missing values all through the columns, we will pick the column with  the missing value percentage and lowest percentage to scale. This will give us the best way to treat the missing values. Maybe to delete or fill in.


In [8]:
# scaling percentage of missing value in Critic_Score column

#calculting isnull() % for 'Critic_Score'
df_sales[df_sales['Critic_Score'].isnull()].shape[0]* 100 / len(df_sales)

51.330821221364914

In [9]:
# scaling percentage of missing value in Publisher column

#calculting isnull() % for 'Publisher'
df_sales[df_sales['Publisher'].isnull()].shape[0]* 100 / len(df_sales)

0.32298582451103536

### The percentage of missing values is varies all through the dataset.
* The ways to fill it in with KNN imputer for numerical values, fill in with mode for categorical values to avoid bias as the some columns with missing values are above 20 percent. 
* I will also look into my descriptive analysis to see how the outcome will affect the statistical results.

In [10]:
### Descriptive statistics:
df_sales.describe()

Unnamed: 0,Year_of_Release,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Count
count,16450.0,16719.0,16719.0,16719.0,16719.0,16719.0,8137.0,8137.0,7590.0
mean,2006.487356,0.26333,0.145025,0.077602,0.047332,0.533543,68.967679,26.360821,162.229908
std,5.878995,0.813514,0.503283,0.308818,0.18671,1.547935,13.938165,18.980495,561.282326
min,1980.0,0.0,0.0,0.0,0.0,0.01,13.0,3.0,4.0
25%,2003.0,0.0,0.0,0.0,0.0,0.06,60.0,12.0,10.0
50%,2007.0,0.08,0.02,0.0,0.01,0.17,71.0,21.0,24.0
75%,2010.0,0.24,0.11,0.04,0.03,0.47,79.0,36.0,81.0
max,2020.0,41.36,28.96,10.22,10.57,82.53,98.0,113.0,10665.0


In [11]:
# Filling missing values with KNN Imputer

from sklearn.impute import KNNImputer

# Identify numerical columns
numeric_columns = df_sales.select_dtypes(include=['float64']).columns

# Extract only the numerical columns for KNN imputation
df_numeric = df_sales[numeric_columns]

# Apply KNN imputation for missing values on numerical columns
imputer = KNNImputer(n_neighbors=5)
df_numeric_imputed = pd.DataFrame(imputer.fit_transform(df_numeric), columns=df_numeric.columns)

# Replace the missing values in the original DataFrame with imputed values
df_sales[numeric_columns] = df_numeric_imputed

# Display the DataFrame after KNN imputation
df_sales.head()


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,79.2,66.4,,411.0,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,81.4,64.6,,465.4,,


In [14]:
### Now will treat categorical missing values with mode

# Identify categorical columns
categorical_columns = df_sales.select_dtypes(include=['object']).columns

# Fill missing values with the mode for categorical columns
df_sales[categorical_columns] = df_sales[categorical_columns].fillna(df_sales[categorical_columns].mode().iloc[0])

# Check for missing values 

df_sales.isnull().sum()

Name               0
Platform           0
Year_of_Release    0
Genre              0
Publisher          0
NA_Sales           0
EU_Sales           0
JP_Sales           0
Other_Sales        0
Global_Sales       0
Critic_Score       0
Critic_Count       0
User_Score         0
User_Count         0
Developer          0
Rating             0
dtype: int64