In [30]:
# Warming Up
import pandas as pd
import numpy as np
from random import randrange

fPath = './data/vgsales.csv'
df = pd.read_csv(fPath)

# trying to create 1.5% worth of duplicates
rws = df.shape[0]
col = df.shape[1]
rwRtio = int(.015 * rws) # float is truncated

print('Number of instances before duplication = %d' % (rws))

def eenyMeeny():
     # select a random row
    tg = df.iloc[randrange(0, rws)]
    # return randomly selected row
    return tg

for i in range(rwRtio):
    df = df.append(eenyMeeny(), ignore_index=True)
rws = df.shape[0]

print('Number of instances after duplication = %d' % (rws))
print('Number of attributes = %d' % (col))
print(df.head())

248
Number of instances = 16598
Number of attributes = 11
   Rank                      Name Platform    Year         Genre Publisher  \
0     1                Wii Sports      Wii  2006.0        Sports  Nintendo   
1     2         Super Mario Bros.      NES  1985.0      Platform  Nintendo   
2     3            Mario Kart Wii      Wii  2008.0        Racing  Nintendo   
3     4         Wii Sports Resort      Wii  2009.0        Sports  Nintendo   
4     5  Pokemon Red/Pokemon Blue       GB  1996.0  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  
0     41.49     29.02      3.77         8.46         82.74  
1     29.08      3.58      6.81         0.77         40.24  
2     15.85     12.88      3.79         3.31         35.82  
3     15.75     11.01      3.28         2.96         33.00  
4     11.27      8.89     10.22         1.00         31.37  


### Missing Values

In [None]:
data = df.replace('?',np.NaN)

print('Number of instances = %d' % (rws))
print('Number of attributes = %d' % (col))

print('Number of missing values:')
for col in df.columns:
    print('\t%s: %d' % (col,df[col].isna().sum()))

In [None]:
print('Number of rows in original data = %d' % (rws))
df = df.dropna()
print('Number of rows after discarding missing values = %d' % (df.shape[0]))
print('Difference in number of rows = %d' % (rws-df.shape[0]))
# update rws
rws = df.shape[0]

### Outliers
This will be for seeing the relationship between sales and a video game's genre. We will need to normalize the dataset and remove outliers by doing so.
- Author: Rahul G

#### One Hot Encoding
Start using one-hot encoding to convert the genre categorical attribute into numbers.

In [4]:
normalized = df
one_hot = pd.get_dummies(normalized['Genre'])

one_hot[20:23]

Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
20,False,False,False,False,False,False,False,True,False,False,False,False
21,False,False,False,False,True,False,False,False,False,False,False,False
22,False,False,False,False,True,False,False,False,False,False,False,False


#### Dropping Columns
Will also need to drop columns that do not fit the needs of the neural network. In this case, anything that is not sales data.

In [5]:
print('Number of columns before discarding columns = %d' % (normalized.shape[1]))

normalized = normalized.drop(['Rank', 'Name', 'Platform', 'Genre', 'Year', 'Publisher'], axis=1)

print('Number of columns after discarding columns = %d' % (normalized.shape[1]))

Number of columns before discarding columns = 11
Number of columns after discarding columns = 5


#### Concatenating Rows and Columns
We want to add the one-hot encoded genres to the main dataframe.

In [6]:
normalized = pd.concat([one_hot, normalized], axis=1)

normalized[20:23]

Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
20,False,False,False,False,False,False,False,True,False,False,False,False,6.42,4.52,6.04,1.37,18.36
21,False,False,False,False,True,False,False,False,False,False,False,False,10.83,2.71,4.18,0.42,18.14
22,False,False,False,False,True,False,False,False,False,False,False,False,9.54,3.44,3.84,0.46,17.28


#### Normalization
Should normalize the data to find outliers and to make it easier on the neural network.

In [7]:
normalized = (normalized-normalized.mean())/normalized.std()

normalized[20:23]

Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
20,-0.499646,-0.289795,-0.23203,-0.342091,-0.237459,-0.190621,-0.285252,3.186528,-0.292717,-0.234757,-0.405707,-0.206838,7.536991,8.654076,19.277073,7.009641,11.461247
21,-0.499646,-0.289795,-0.23203,-0.342091,4.211005,-0.190621,-0.285252,-0.313802,-0.292717,-0.234757,-0.405707,-0.206838,12.936883,5.072409,13.263312,1.972216,11.31977
22,-0.499646,-0.289795,-0.23203,-0.342091,4.211005,-0.190621,-0.285252,-0.313802,-0.292717,-0.234757,-0.405707,-0.206838,11.357323,6.516949,12.164022,2.184318,10.766726


#### Removing Outliers
After getting z scores, it is now possible to find outliers and remove them.

In [9]:
print('Number of rows before discarding outliers = %d' % (normalized.shape[0]))

removed_outliers = normalized.loc[((normalized > -3).sum(axis=1)==17) & ((normalized <= 3).sum(axis=1)==17)]
print('Number of rows after discarding outliers values = %d' % (removed_outliers.shape[0]))

removed_outliers[20:23]

Number of rows before discarding outliers = 16598
Number of rows after discarding outliers values = 7216


Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
325,2.001296,-0.289795,-0.23203,-0.342091,-0.237459,-0.190621,-0.285252,-0.313802,-0.292717,-0.234757,-0.405707,-0.206838,1.904451,2.163541,1.268122,1.070782,2.078779
327,-0.499646,-0.289795,-0.23203,2.923021,-0.237459,-0.190621,-0.285252,-0.313802,-0.292717,-0.234757,-0.405707,-0.206838,2.100365,2.61867,-0.251484,1.44196,2.072348
339,-0.499646,-0.289795,-0.23203,-0.342091,-0.237459,-0.190621,-0.285252,-0.313802,-0.292717,-0.234757,2.464682,-0.206838,2.284035,2.044811,-0.219152,1.601037,2.014471


### Duplicate Data

In [None]:
dups = df.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))