### Understanding Feature Engineering -Categorical Data (Article in Towards Data Science)


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling

from mlsettings.settings import load_app_config, get_datafolder_path
 
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 
np.set_printoptions(precision=4)

pd.set_option('display.width', 200)
pd.set_option('precision', 4)
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
sns.set_style("whitegrid")
pd.options.display.float_format = '{:,.4f}'.format
sns.set()
import logging
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
logger.setLevel(logging.DEBUG)


In [3]:
load_app_config()
DATA_DIRECTORY='vgsales'
TRAIN_FILE = 'vgsales.csv'
input_path = get_datafolder_path()

In [4]:
file_path = pathlib.Path(input_path).joinpath(DATA_DIRECTORY)
input_file = file_path.joinpath(TRAIN_FILE)

In [5]:
vg_df = pd.read_csv(input_file,encoding='utf-8')

In [6]:
#pandas_profiling.ProfileReport(vg_df)

#### Transforming Nominal Attributes
Nominal attributes consist of discrete categorical values with no notion or sense of order amongst them. The idea here is to transform these attributes into a more representative numerical format which can be easily understood by downstream code and pipelines.

In [7]:
vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].head()

Unnamed: 0,Name,Platform,Year,Genre,Publisher
0,Wii Sports,Wii,2006.0,Sports,Nintendo
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo


In [8]:
vg_df['Genre'].unique()

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)

In [9]:
from sklearn.preprocessing  import LabelEncoder
label_encoder  = LabelEncoder()
genre_labels = label_encoder.fit_transform(vg_df['Genre'])
genre_mappings  = { index:label for index,label in enumerate(label_encoder.classes_)}
genre_mappings

{0: 'Action',
 1: 'Adventure',
 2: 'Fighting',
 3: 'Misc',
 4: 'Platform',
 5: 'Puzzle',
 6: 'Racing',
 7: 'Role-Playing',
 8: 'Shooter',
 9: 'Simulation',
 10: 'Sports',
 11: 'Strategy'}

In [10]:
vg_df['GenreLabel'] =genre_labels
vg_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,GenreLabel
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,10
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,4
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,6
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0,10
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,7


#### Transforming Ordinal Attributes

In [11]:
DATA_DIRECTORY2='pokemon'
TRAIN_FILE2 = 'pokemon.csv'

In [12]:
file_path = pathlib.Path(input_path).joinpath(DATA_DIRECTORY2)
input_file = file_path.joinpath(TRAIN_FILE2)

In [13]:
pokemon_df = pd.read_csv(input_file,encoding='utf-8')

In [14]:
#pandas_profiling.ProfileReport(pokemon_df)

In [15]:
gen_map = { gen:idx+1 for idx ,gen in enumerate(sorted(pokemon_df['Generation'].unique()))}

In [16]:
gen_map

{'GEN 1': 1, 'GEN 2': 2, 'GEN 3': 3, 'GEN 4': 4, 'GEN 5': 5, 'GEN 6': 6}

In [17]:
pokemon_df['Generation_label'] = pokemon_df['Generation'].map(gen_map)

In [18]:
pokemon_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Generation_label
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,GEN 1,False,1
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,GEN 1,False,1
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,GEN 1,False,1
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,GEN 1,False,1
4,4,Charmander,Fire,,309,39,52,43,60,50,65,GEN 1,False,1


#### Encoding Categorical Attributes

##### For columns with text attributes

In [19]:
from sklearn import feature_extraction
def one_hot_dataframe(data,columns,replace=False):
    fe_vec= feature_extraction.DictVectorizer()
    make_dict = lambda row :dict((column,row[column]) for column in  columns)
    vector_data=pd.DataFrame(fe_vec.fit_transform(
                             data[columns].apply(make_dict, axis=1)).toarray())
    vector_data.columns = fe_vec.get_feature_names()
    vector_data.index= data.index
    if replace:
        data = data.drop(columns, axis=1)
        data = data.join(vector_data)
    return data,vector_data

In [20]:
pokemon_df,pokemon_df_ignore = one_hot_dataframe(pokemon_df, ['Generation'], replace=True)

In [21]:
pokemon_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Legendary,Generation_label,Generation=GEN 1,Generation=GEN 2,Generation=GEN 3,Generation=GEN 4,Generation=GEN 5,Generation=GEN 6
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,False,1,1.0,0.0,0.0,0.0,0.0,0.0
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,False,1,1.0,0.0,0.0,0.0,0.0,0.0
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,False,1,1.0,0.0,0.0,0.0,0.0,0.0
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,False,1,1.0,0.0,0.0,0.0,0.0,0.0
4,4,Charmander,Fire,,309,39,52,43,60,50,65,False,1,1.0,0.0,0.0,0.0,0.0,0.0


##### For columns with numeric or boolean attributes

In [22]:
gen_dummy_features = pd.get_dummies(pokemon_df['Legendary'],prefix ='Legend')

In [23]:
pokemon_df = pd.concat([pokemon_df, gen_dummy_features], axis=1)
pokemon_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Legendary,Generation_label,Generation=GEN 1,Generation=GEN 2,Generation=GEN 3,Generation=GEN 4,Generation=GEN 5,Generation=GEN 6,Legend_False,Legend_True
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,False,1,1.0,0.0,0.0,0.0,0.0,0.0,1,0
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,False,1,1.0,0.0,0.0,0.0,0.0,0.0,1,0
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,False,1,1.0,0.0,0.0,0.0,0.0,0.0,1,0
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,False,1,1.0,0.0,0.0,0.0,0.0,0.0,1,0
4,4,Charmander,Fire,,309,39,52,43,60,50,65,False,1,1.0,0.0,0.0,0.0,0.0,0.0,1,0


##### Effect Coding Scheme

The effect coding scheme is actually very similar to the dummy coding scheme,except during the encoding process, the encoded features or feature vector, for the category values which represent all 0 in the dummy coding scheme, is replaced by -1 in the effect coding scheme.

#### Curse of Dimensionality

The encoding schemes we discussed so far, work quite well on categorical data in general, but they start causing problems when the number of distinct categories in any feature becomes very large. Essential for any categorical feature of m distinct labels, you get m separate features. This can easily increase the size of the feature set causing problems like storage issues, model training problems with regard to time, space and memory. Besides this, we also have to deal with what is popularly known as the **‘curse of dimensionality’** where basically with an enormous number of features and not enough representative samples, model performance starts getting affected often leading to overfitting.

##### Bin counting scheme
The bin-counting scheme is a useful scheme for dealing with categorical variables having many categories. In this scheme, instead of using the actual label values for encoding, we use probability based statistical information about the value and the actual target or response value which we aim to predict in our modeling efforts.

##### Feature Hashing Scheme
In this scheme, a hash function is typically used with the number of encoded features pre-set (as a vector of pre-defined length) such that the hashed values of the features are used as indices in this pre-defined vector and values are updated accordingly. 

Since a hash function maps a large number of values into a small finite set of values, multiple different values might create the same hash which is termed as collisions.

Typically, a signed hash function is used so that the sign of the value obtained from the hash is used as the sign of the value which is stored in the final feature vector at the appropriate index. This should ensure lesser collisions and lesser accumulation of error due to collisions.


In [24]:
print("Total game genres:", len(vg_df['Genre'].unique()))
vg_df['Genre'].unique()

Total game genres: 12


array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)

In [25]:
from sklearn.feature_extraction import FeatureHasher
fh = FeatureHasher(n_features=6, input_type='string')
hashed_features = fh.fit_transform(vg_df['Genre'])
hashed_features = hashed_features.toarray()
hashed_features = pd.DataFrame(hashed_features)
hashed_features = hashed_features.add_prefix('Genre') 
vg_df = pd.concat([vg_df,hashed_features], axis=1)
vg_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,GenreLabel,Genre0,Genre1,Genre2,Genre3,Genre4,Genre5
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,10,-2.0,2.0,0.0,-2.0,0.0,0.0
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,4,0.0,2.0,2.0,-1.0,1.0,0.0
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,6,-1.0,0.0,0.0,0.0,0.0,-1.0
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0,10,-2.0,2.0,0.0,-2.0,0.0,0.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,7,-1.0,1.0,2.0,0.0,1.0,-1.0


In [26]:
pandas_profiling.ProfileReport(vg_df)

0,1
Number of variables,18
Number of observations,16598
Total Missing (%),0.1%
Total size in memory,2.2 MiB
Average record size in memory,140.0 B

0,1
Numeric,12
Categorical,4
Boolean,1
Date,0
Text (Unique),0
Rejected,1
Unsupported,0

0,1
Distinct count,305
Unique (%),1.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.14665
Minimum,0
Maximum,29.02
Zeros (%),34.5%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.02
Q3,0.11
95-th percentile,0.63
Maximum,29.02
Range,29.02
Interquartile range,0.11

0,1
Standard deviation,0.50535
Coef of variation,3.4459
Kurtosis,756.03
Mean,0.14665
MAD,0.18994
Skewness,18.876
Sum,2434.1
Variance,0.25538
Memory size,129.8 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,5730,34.5%,
0.01,1496,9.0%,
0.02,1269,7.6%,
0.03,934,5.6%,
0.04,748,4.5%,
0.05,546,3.3%,
0.06,412,2.5%,
0.07,384,2.3%,
0.08,302,1.8%,
0.09,256,1.5%,

Value,Count,Frequency (%),Unnamed: 3
0.0,5730,34.5%,
0.01,1496,9.0%,
0.02,1269,7.6%,
0.03,934,5.6%,
0.04,748,4.5%,

Value,Count,Frequency (%),Unnamed: 3
9.27,1,0.0%,
11.0,1,0.0%,
11.01,1,0.0%,
12.88,1,0.0%,
29.02,1,0.0%,

0,1
Distinct count,12
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
Action,3316
Sports,2346
Misc,1739
Other values (9),9197

Value,Count,Frequency (%),Unnamed: 3
Action,3316,20.0%,
Sports,2346,14.1%,
Misc,1739,10.5%,
Role-Playing,1488,9.0%,
Shooter,1310,7.9%,
Adventure,1286,7.7%,
Racing,1249,7.5%,
Platform,886,5.3%,
Simulation,867,5.2%,
Fighting,848,5.1%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,-1.0004
Minimum,-2
Maximum,0
Zeros (%),19.3%

0,1
Minimum,-2
5-th percentile,-2
Q1,-1
Median,-1
Q3,-1
95-th percentile,0
Maximum,0
Range,2
Interquartile range,0

0,1
Standard deviation,0.62195
Coef of variation,-0.62172
Kurtosis,-0.4144
Mean,-1.0004
MAD,0.38702
Skewness,0.00024103
Sum,-16604
Variance,0.38682
Memory size,129.8 KiB

Value,Count,Frequency (%),Unnamed: 3
-1.0,10178,61.3%,
-2.0,3213,19.4%,
0.0,3207,19.3%,

Value,Count,Frequency (%),Unnamed: 3
-2.0,3213,19.4%,
-1.0,10178,61.3%,
0.0,3207,19.3%,

Value,Count,Frequency (%),Unnamed: 3
-2.0,3213,19.4%,
-1.0,10178,61.3%,
0.0,3207,19.3%,

0,1
Distinct count,4
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.4739
Minimum,0
Maximum,4
Zeros (%),23.1%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,1
Q3,2
95-th percentile,4
Maximum,4
Range,4
Interquartile range,1

0,1
Standard deviation,1.2845
Coef of variation,0.87156
Kurtosis,-0.24596
Mean,1.4739
MAD,1.0383
Skewness,0.83788
Sum,24463
Variance,1.6501
Memory size,129.8 KiB

Value,Count,Frequency (%),Unnamed: 3
1.0,6253,37.7%,
2.0,3913,23.6%,
0.0,3836,23.1%,
4.0,2596,15.6%,

Value,Count,Frequency (%),Unnamed: 3
0.0,3836,23.1%,
1.0,6253,37.7%,
2.0,3913,23.6%,
4.0,2596,15.6%,

Value,Count,Frequency (%),Unnamed: 3
0.0,3836,23.1%,
1.0,6253,37.7%,
2.0,3913,23.6%,
4.0,2596,15.6%,

0,1
Distinct count,4
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.27232
Minimum,-3
Maximum,2
Zeros (%),71.9%

0,1
Minimum,-3
5-th percentile,-3
Q1,0
Median,0
Q3,0
95-th percentile,2
Maximum,2
Range,5
Interquartile range,0

0,1
Standard deviation,1.0963
Coef of variation,4.0257
Kurtosis,2.2713
Mean,0.27232
MAD,0.72574
Skewness,-0.59534
Sum,4520
Variance,1.2019
Memory size,129.8 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,11927,71.9%,
2.0,3241,19.5%,
-3.0,848,5.1%,
1.0,582,3.5%,

Value,Count,Frequency (%),Unnamed: 3
-3.0,848,5.1%,
0.0,11927,71.9%,
1.0,582,3.5%,
2.0,3241,19.5%,

Value,Count,Frequency (%),Unnamed: 3
-3.0,848,5.1%,
0.0,11927,71.9%,
1.0,582,3.5%,
2.0,3241,19.5%,

0,1
Distinct count,4
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,-1.5637
Minimum,-3
Maximum,0
Zeros (%),16.5%

0,1
Minimum,-3
5-th percentile,-3
Q1,-2
Median,-2
Q3,-1
95-th percentile,0
Maximum,0
Range,3
Interquartile range,1

0,1
Standard deviation,0.88455
Coef of variation,-0.56566
Kurtosis,-0.60526
Mean,-1.5637
MAD,0.75225
Skewness,0.4605
Sum,-25955
Variance,0.78243
Memory size,129.8 KiB

Value,Count,Frequency (%),Unnamed: 3
-2.0,8664,52.2%,
-1.0,3482,21.0%,
0.0,2737,16.5%,
-3.0,1715,10.3%,

Value,Count,Frequency (%),Unnamed: 3
-3.0,1715,10.3%,
-2.0,8664,52.2%,
-1.0,3482,21.0%,
0.0,2737,16.5%,

Value,Count,Frequency (%),Unnamed: 3
-3.0,1715,10.3%,
-2.0,8664,52.2%,
-1.0,3482,21.0%,
0.0,2737,16.5%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.78311
Minimum,0
Maximum,2
Zeros (%),34.7%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,1
Q3,1
95-th percentile,2
Maximum,2
Range,2
Interquartile range,1

0,1
Standard deviation,0.65521
Coef of variation,0.83669
Kurtosis,-0.74267
Mean,0.78311
MAD,0.54286
Skewness,0.25827
Sum,12998
Variance,0.42931
Memory size,129.8 KiB

Value,Count,Frequency (%),Unnamed: 3
1.0,8692,52.4%,
0.0,5753,34.7%,
2.0,2153,13.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,5753,34.7%,
1.0,8692,52.4%,
2.0,2153,13.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,5753,34.7%,
1.0,8692,52.4%,
2.0,2153,13.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,-0.71201

0,1
-1.0,11818
0.0,4780

Value,Count,Frequency (%),Unnamed: 3
-1.0,11818,71.2%,
0.0,4780,28.8%,

0,1
Distinct count,12
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,4.9281
Minimum,0
Maximum,11
Zeros (%),20.0%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,5
Q3,8
95-th percentile,10
Maximum,11
Range,11
Interquartile range,7

0,1
Standard deviation,3.762
Coef of variation,0.76338
Kurtosis,-1.4324
Mean,4.9281
MAD,3.3801
Skewness,0.068489
Sum,81797
Variance,14.153
Memory size,129.8 KiB

Value,Count,Frequency (%),Unnamed: 3
0,3316,20.0%,
10,2346,14.1%,
3,1739,10.5%,
7,1488,9.0%,
8,1310,7.9%,
1,1286,7.7%,
6,1249,7.5%,
4,886,5.3%,
9,867,5.2%,
2,848,5.1%,

Value,Count,Frequency (%),Unnamed: 3
0,3316,20.0%,
1,1286,7.7%,
2,848,5.1%,
3,1739,10.5%,
4,886,5.3%,

Value,Count,Frequency (%),Unnamed: 3
7,1488,9.0%,
8,1310,7.9%,
9,867,5.2%,
10,2346,14.1%,
11,681,4.1%,

0,1
Correlation,0.90284

0,1
Distinct count,244
Unique (%),1.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.077782
Minimum,0
Maximum,10.22
Zeros (%),63.0%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.0
Q3,0.04
95-th percentile,0.36
Maximum,10.22
Range,10.22
Interquartile range,0.04

0,1
Standard deviation,0.30929
Coef of variation,3.9764
Kurtosis,194.23
Mean,0.077782
MAD,0.11557
Skewness,11.206
Sum,1291
Variance,0.095661
Memory size,129.8 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,10455,63.0%,
0.02,728,4.4%,
0.01,704,4.2%,
0.03,528,3.2%,
0.04,397,2.4%,
0.05,321,1.9%,
0.06,290,1.7%,
0.07,231,1.4%,
0.08,211,1.3%,
0.09,156,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0.0,10455,63.0%,
0.01,704,4.2%,
0.02,728,4.4%,
0.03,528,3.2%,
0.04,397,2.4%,

Value,Count,Frequency (%),Unnamed: 3
6.04,1,0.0%,
6.5,1,0.0%,
6.81,1,0.0%,
7.2,1,0.0%,
10.22,1,0.0%,

0,1
Distinct count,409
Unique (%),2.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.26467
Minimum,0
Maximum,41.49
Zeros (%),27.1%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.08
Q3,0.24
95-th percentile,1.06
Maximum,41.49
Range,41.49
Interquartile range,0.24

0,1
Standard deviation,0.81668
Coef of variation,3.0857
Kurtosis,649.13
Mean,0.26467
MAD,0.30816
Skewness,18.8
Sum,4393
Variance,0.66697
Memory size,129.8 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,4499,27.1%,
0.02,550,3.3%,
0.01,541,3.3%,
0.03,533,3.2%,
0.05,530,3.2%,
0.04,525,3.2%,
0.06,495,3.0%,
0.07,477,2.9%,
0.08,459,2.8%,
0.09,419,2.5%,

Value,Count,Frequency (%),Unnamed: 3
0.0,4499,27.1%,
0.01,541,3.3%,
0.02,550,3.3%,
0.03,533,3.2%,
0.04,525,3.2%,

Value,Count,Frequency (%),Unnamed: 3
15.85,1,0.0%,
23.2,1,0.0%,
26.93,1,0.0%,
29.08,1,0.0%,
41.49,1,0.0%,

0,1
Distinct count,11493
Unique (%),69.2%
Missing (%),0.0%
Missing (n),0

0,1
Need for Speed: Most Wanted,12
LEGO Marvel Super Heroes,9
FIFA 14,9
Other values (11490),16568

Value,Count,Frequency (%),Unnamed: 3
Need for Speed: Most Wanted,12,0.1%,
LEGO Marvel Super Heroes,9,0.1%,
FIFA 14,9,0.1%,
Madden NFL 07,9,0.1%,
Ratatouille,9,0.1%,
FIFA 15,8,0.0%,
Terraria,8,0.0%,
LEGO Harry Potter: Years 5-7,8,0.0%,
LEGO The Hobbit,8,0.0%,
Monopoly,8,0.0%,

0,1
Distinct count,157
Unique (%),0.9%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.048063
Minimum,0
Maximum,10.57
Zeros (%),39.0%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.01
Q3,0.04
95-th percentile,0.2
Maximum,10.57
Range,10.57
Interquartile range,0.04

0,1
Standard deviation,0.18859
Coef of variation,3.9238
Kurtosis,1025.3
Mean,0.048063
MAD,0.061352
Skewness,24.234
Sum,797.75
Variance,0.035566
Memory size,129.8 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,6477,39.0%,
0.01,3445,20.8%,
0.02,1582,9.5%,
0.03,939,5.7%,
0.04,666,4.0%,
0.05,488,2.9%,
0.06,401,2.4%,
0.07,336,2.0%,
0.08,239,1.4%,
0.09,193,1.2%,

Value,Count,Frequency (%),Unnamed: 3
0.0,6477,39.0%,
0.01,3445,20.8%,
0.02,1582,9.5%,
0.03,939,5.7%,
0.04,666,4.0%,

Value,Count,Frequency (%),Unnamed: 3
3.31,1,0.0%,
4.14,1,0.0%,
7.53,1,0.0%,
8.46,1,0.0%,
10.57,1,0.0%,

0,1
Distinct count,31
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
DS,2163
PS2,2161
PS3,1329
Other values (28),10945

Value,Count,Frequency (%),Unnamed: 3
DS,2163,13.0%,
PS2,2161,13.0%,
PS3,1329,8.0%,
Wii,1325,8.0%,
X360,1265,7.6%,
PSP,1213,7.3%,
PS,1196,7.2%,
PC,960,5.8%,
XB,824,5.0%,
GBA,822,5.0%,

0,1
Distinct count,579
Unique (%),3.5%
Missing (%),0.3%
Missing (n),58

0,1
Electronic Arts,1351
Activision,975
Namco Bandai Games,932
Other values (575),13282

Value,Count,Frequency (%),Unnamed: 3
Electronic Arts,1351,8.1%,
Activision,975,5.9%,
Namco Bandai Games,932,5.6%,
Ubisoft,921,5.5%,
Konami Digital Entertainment,832,5.0%,
THQ,715,4.3%,
Nintendo,703,4.2%,
Sony Computer Entertainment,683,4.1%,
Sega,639,3.8%,
Take-Two Interactive,413,2.5%,

0,1
Distinct count,16598
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,8300.6
Minimum,1
Maximum,16600
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,831.85
Q1,4151.2
Median,8300.5
Q3,12450.0
95-th percentile,15770.0
Maximum,16600.0
Range,16599.0
Interquartile range,8298.5

0,1
Standard deviation,4791.9
Coef of variation,0.57729
Kurtosis,-1.1999
Mean,8300.6
MAD,4149.7
Skewness,6.6497e-05
Sum,137773446
Variance,22962000
Memory size,129.8 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
10864,1,0.0%,
6822,1,0.0%,
677,1,0.0%,
2724,1,0.0%,
12963,1,0.0%,
15010,1,0.0%,
8865,1,0.0%,
10912,1,0.0%,
4759,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,
5,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
16596,1,0.0%,
16597,1,0.0%,
16598,1,0.0%,
16599,1,0.0%,
16600,1,0.0%,

0,1
Distinct count,40
Unique (%),0.2%
Missing (%),1.6%
Missing (n),271
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2006.4
Minimum,1980
Maximum,2020
Zeros (%),0.0%

0,1
Minimum,1980
5-th percentile,1996
Q1,2003
Median,2007
Q3,2010
95-th percentile,2015
Maximum,2020
Range,40
Interquartile range,7

0,1
Standard deviation,5.829
Coef of variation,0.0029052
Kurtosis,1.8482
Mean,2006.4
MAD,4.5046
Skewness,-1.0026
Sum,32759000
Variance,33.977
Memory size,129.8 KiB

Value,Count,Frequency (%),Unnamed: 3
2009.0,1431,8.6%,
2008.0,1428,8.6%,
2010.0,1259,7.6%,
2007.0,1202,7.2%,
2011.0,1139,6.9%,
2006.0,1008,6.1%,
2005.0,941,5.7%,
2002.0,829,5.0%,
2003.0,775,4.7%,
2004.0,763,4.6%,

Value,Count,Frequency (%),Unnamed: 3
1980.0,9,0.1%,
1981.0,46,0.3%,
1982.0,36,0.2%,
1983.0,17,0.1%,
1984.0,14,0.1%,

Value,Count,Frequency (%),Unnamed: 3
2014.0,582,3.5%,
2015.0,614,3.7%,
2016.0,344,2.1%,
2017.0,3,0.0%,
2020.0,1,0.0%,

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,GenreLabel,Genre0,Genre1,Genre2,Genre3,Genre4,Genre5
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,10,-2.0,2.0,0.0,-2.0,0.0,0.0
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,4,0.0,2.0,2.0,-1.0,1.0,0.0
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,6,-1.0,0.0,0.0,0.0,0.0,-1.0
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0,10,-2.0,2.0,0.0,-2.0,0.0,0.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,7,-1.0,1.0,2.0,0.0,1.0,-1.0
