In [2]:
# Warming Up
import pandas as pd
import numpy as np
from random import randrange 
from sklearn.model_selection import train_test_split

fPath = './data/vgsales.csv'
df = pd.read_csv(fPath)

# trying to create 1.5% worth of duplicates
rws = df.shape[0]
col = df.shape[1]
rwRtio = int(.015 * rws) # float is truncated

print('Number of instances before duplication = %d' % (rws))

def eenyMeeny():
     # select a random row
    tg = df.iloc[randrange(0, rws)]
    # return randomly selected row
    return tg

for i in range(rwRtio):
    df = df.append(eenyMeeny(), ignore_index=True)
rws = df.shape[0]

print('Number of instances after duplication = %d' % (len(rws)))
print('Number of attributes = %d' % (col))
print(df.head())

Number of instances before duplication = 16598


AttributeError: 'DataFrame' object has no attribute 'append'

### Missing Values

In [3]:
data = df.replace('?',np.NaN)

print('Number of instances = %d' % (rws))
print('Number of attributes = %d' % (col))

print('Number of missing values:')
for col in df.columns:
    print('\t%s: %d' % (col,df[col].isna().sum()))

Number of instances = 16598
Number of attributes = 11
Number of missing values:
	Rank: 0
	Name: 0
	Platform: 0
	Year: 271
	Genre: 0
	Publisher: 58
	NA_Sales: 0
	EU_Sales: 0
	JP_Sales: 0
	Other_Sales: 0
	Global_Sales: 0


In [4]:
print('Number of rows in original data = %d' % (rws))
df = df.dropna()
print('Number of rows after discarding missing values = %d' % (df.shape[0]))
print('Difference in number of rows = %d' % (rws-df.shape[0]))
# update rws
rws = df.shape[0]

Number of rows in original data = 16598
Number of rows after discarding missing values = 16291
Difference in number of rows = 307


### Outliers

In [5]:
pass

### Duplicat Data

In [6]:
dups = df.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))

Number of duplicate rows = 0


## Sampling

In [7]:
data.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [8]:
sample = data.sample(n=10)
sample

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
1324,1326,4 Nin uchi Mahjong,NES,1984.0,Misc,Nintendo,0.0,0.0,1.45,0.0,1.45
15578,15581,Metal Fight Beyblade Portable: Chouzetsu Tense...,PSP,2010.0,Action,Takara Tomy,0.0,0.0,0.02,0.0,0.02
3953,3955,Midnight Magic,2600,1983.0,Action,Atari,0.47,0.03,0.0,0.01,0.5
11076,11078,Action Man-Operation Extreme,PS,,Action,,0.05,0.03,0.0,0.01,0.09
2700,2702,Super Monkey Ball: Touch & Roll,DS,2005.0,Misc,Sega,0.7,0.01,0.0,0.06,0.76
14905,14908,Gakuen Alice: WakuWaku * Happy Friends,DS,2007.0,Adventure,Kids Station,0.0,0.0,0.03,0.0,0.03
15825,15828,Puyo Puyo Tetris,WiiU,2014.0,Puzzle,Sega,0.0,0.0,0.02,0.0,0.02
6439,6441,Gallop Racer 2: One and Only Road to Victory,PS,1997.0,Sports,Tecmo Koei,0.0,0.0,0.25,0.02,0.27
2622,2624,NCAA Football 08,PS2,2007.0,Sports,Electronic Arts,0.65,0.03,0.0,0.11,0.79
13930,13932,Train Simulator 2016,PC,2015.0,Simulation,Unknown,0.0,0.04,0.0,0.0,0.04


In [9]:
sample = data.sample(frac=0.01, random_state=999)
sample

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
10725,10727,The Land Before Time: Great Valley Racing Adve...,PS,2001.0,Racing,TDK Mediactive,0.05,0.04,0.00,0.01,0.10
9053,9055,Mojo!,PS2,2003.0,Puzzle,Mindscape,0.07,0.05,0.00,0.02,0.14
10102,10104,Vacation Sports,Wii,2009.0,Sports,Ubisoft,0.10,0.01,0.00,0.01,0.11
11876,11878,Disney's Donald Duck Advance,GBA,2001.0,Platform,Ubisoft,0.05,0.02,0.00,0.00,0.07
14561,14564,Gekiatsu!! Pachi Game Tamashi Vol. 1: CR Evang...,PSP,2011.0,Misc,Unknown,0.00,0.00,0.03,0.00,0.03
...,...,...,...,...,...,...,...,...,...,...,...
6592,6594,Silent Hunter 5: Battle of the Atlantic,PC,2010.0,Simulation,Ubisoft,0.00,0.21,0.00,0.05,0.26
4539,4541,EverQuest Online Adventures,PS2,2003.0,Role-Playing,Sony Online Entertainment,0.21,0.16,0.00,0.05,0.43
9920,9922,SD Gundam G Generation-F.I.F,PS,2001.0,Strategy,Namco Bandai Games,0.00,0.00,0.11,0.01,0.12
14440,14443,Eyeshield 21: DevilBats DevilDays,GBA,2006.0,Role-Playing,Nintendo,0.00,0.00,0.03,0.00,0.03


In [10]:
sample = data.sample(frac=0.01, replace=True, random_state=999)
sample

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
12636,12638,Adventures of Lolo,NES,1989.0,Puzzle,HAL Laboratory,0.06,0.00,0.00,0.00,0.06
13157,13159,Nike+ Kinect Training,X360,2012.0,Sports,Microsoft Game Studios,0.02,0.02,0.00,0.00,0.05
5832,5834,Karaoke Revolution Glee: Volume 3,Wii,2011.0,Misc,Konami Digital Entertainment,0.24,0.04,0.00,0.02,0.30
8417,8419,WRC 2: FIA World Rally Championship,PS3,2011.0,Racing,Ubisoft,0.00,0.13,0.00,0.04,0.17
10715,10717,Lord of Apocalypse,PSP,2011.0,Role-Playing,Square Enix,0.00,0.00,0.10,0.00,0.10
...,...,...,...,...,...,...,...,...,...,...,...
5585,5587,Resident Evil: Revelations 2,PS3,2015.0,Action,Capcom,0.06,0.08,0.16,0.03,0.32
10519,10521,God Eater 2: Rage Burst,PS4,2015.0,Role-Playing,Namco Bandai Games,0.00,0.01,0.09,0.00,0.10
6322,6324,Fushigi no Dungeon: Fuurai no Shiren GB: Tsuki...,GB,1996.0,Role-Playing,ChunSoft,0.00,0.00,0.27,0.00,0.27
1982,1984,Forza Motorsport,XB,2005.0,Racing,Microsoft Game Studios,0.52,0.51,0.00,0.02,1.05


# Shuffle the dataframe and save it

In [15]:
df = df.reindex(np.random.permutation(df.index))
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
11991,11993,Hearts of Iron III,PC,2009.0,Strategy,Paradox Interactive,0.01,0.04,0.00,0.01,0.07
9267,9269,Avatar: The Last Airbender - The Burning Earth,X360,2007.0,Action,THQ,0.11,0.01,0.00,0.01,0.14
14399,14402,Super Heroine Chronicle,PS3,2014.0,Role-Playing,Namco Bandai Games,0.00,0.00,0.03,0.00,0.03
16037,16040,WRC 5: FIA World Rally Championship,XOne,2015.0,Sports,Bigben Interactive,0.00,0.01,0.00,0.00,0.01
13143,13145,The Lost Treasures of Alexandria,DS,2011.0,Puzzle,Unknown,0.00,0.04,0.00,0.01,0.05
...,...,...,...,...,...,...,...,...,...,...,...
6031,6033,Bladestorm: The Hundred Years' War,PS3,2007.0,Action,Tecmo Koei,0.10,0.03,0.14,0.02,0.29
2093,2095,Oddworld: Abe's Exoddus,PS,1998.0,Platform,GT Interactive,0.55,0.38,0.00,0.06,0.99
12037,12039,Ultimate Board Game Collection,PSP,2007.0,Misc,Xplosiv,0.06,0.00,0.00,0.01,0.07
2035,2037,The Lost World: Jurassic Park,PS,1997.0,Action,Electronic Arts,0.57,0.39,0.00,0.07,1.02


In [16]:
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,11993,Hearts of Iron III,PC,2009.0,Strategy,Paradox Interactive,0.01,0.04,0.00,0.01,0.07
1,9269,Avatar: The Last Airbender - The Burning Earth,X360,2007.0,Action,THQ,0.11,0.01,0.00,0.01,0.14
2,14402,Super Heroine Chronicle,PS3,2014.0,Role-Playing,Namco Bandai Games,0.00,0.00,0.03,0.00,0.03
3,16040,WRC 5: FIA World Rally Championship,XOne,2015.0,Sports,Bigben Interactive,0.00,0.01,0.00,0.00,0.01
4,13145,The Lost Treasures of Alexandria,DS,2011.0,Puzzle,Unknown,0.00,0.04,0.00,0.01,0.05
...,...,...,...,...,...,...,...,...,...,...,...
16286,6033,Bladestorm: The Hundred Years' War,PS3,2007.0,Action,Tecmo Koei,0.10,0.03,0.14,0.02,0.29
16287,2095,Oddworld: Abe's Exoddus,PS,1998.0,Platform,GT Interactive,0.55,0.38,0.00,0.06,0.99
16288,12039,Ultimate Board Game Collection,PSP,2007.0,Misc,Xplosiv,0.06,0.00,0.00,0.01,0.07
16289,2037,The Lost World: Jurassic Park,PS,1997.0,Action,Electronic Arts,0.57,0.39,0.00,0.07,1.02


# Sort the dataframe

In [22]:
print("Before sorting")
print(df['Global_Sales'])
df = df.sort_values(by='Global_Sales',ascending=True)

print("\nAfter sorting")
print(df['Global_Sales']) 

Before sorting
0        0.07
1        0.14
2        0.03
3        0.01
4        0.05
         ... 
16286    0.29
16287    0.99
16288    0.07
16289    1.02
16290    0.05
Name: Global_Sales, Length: 16291, dtype: float64

After sorting
3573      0.01
6268      0.01
6304      0.01
12769     0.01
6305      0.01
         ...  
3381     31.37
9278     33.00
7896     35.82
3063     40.24
4624     82.74
Name: Global_Sales, Length: 16291, dtype: float64


In [23]:
print("df['Global_Sales'].iloc[0] is: {}".format(df['Global_Sales'].iloc[0]))

print("df['Global_Sales'].loc[0] is: {}".format(df['Global_Sales'].loc[0])) 

df['Global_Sales'].iloc[0] is: 0.01
df['Global_Sales'].loc[0] is: 0.07


In [24]:
#Sorting in Ascending Order
#Sorting by column 'NA_Sales'
df = df.sort_values(by=['NA_Sales'],ascending=True)
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
15515,8432,Sumikko Gurashi: Koko ga Ochitsukundesu,3DS,2014.0,Action,Nippon Columbia,0.00,0.00,0.17,0.00,0.17
9818,13807,Zac to Ombra: Maboroshi no Yuuenchi,DS,2010.0,Adventure,Konami Digital Entertainment,0.00,0.00,0.04,0.00,0.04
9827,13773,Dolly Kanon Dokidoki Tokimeki Himitsu no Ongak...,3DS,2014.0,Adventure,Happinet,0.00,0.00,0.04,0.00,0.04
9841,13790,Who Wants to be a Millionaire: 1st Edition,DS,2007.0,Misc,Ubisoft,0.00,0.04,0.00,0.00,0.04
16162,4701,Mobile Suit Gundam,SAT,1995.0,Action,Namco Bandai Games,0.00,0.00,0.41,0.00,0.41
...,...,...,...,...,...,...,...,...,...,...,...
7896,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
14358,6,Tetris,GB,1989.0,Puzzle,Nintendo,23.20,2.26,4.22,0.58,30.26
12368,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31
3063,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24


In [25]:
#Sorting in Descending Order
#Sorting by column "EU_Sales"
df = df.sort_values(by=['EU_Sales'], ascending=False)
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
4624,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
7896,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
9278,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
1341,11,Nintendogs,DS,2005.0,Simulation,Nintendo,9.07,11.00,1.93,2.75,24.76
2909,17,Grand Theft Auto V,PS3,2013.0,Action,Take-Two Interactive,7.01,9.27,0.97,4.14,21.40
...,...,...,...,...,...,...,...,...,...,...,...
1669,15772,Clock Zero: Shuuen no Ichibyou - ExTime,PSV,2015.0,Action,Idea Factory,0.00,0.00,0.02,0.00,0.02
13115,6865,Jikkyou Powerful Pro Yakyuu 15,PS2,2008.0,Sports,Konami Digital Entertainment,0.00,0.00,0.24,0.00,0.24
12143,15113,School Days LxH,PS2,2008.0,Adventure,Interchannel,0.00,0.00,0.02,0.00,0.02
475,15757,L.G.S: Shinsetsu Houshinengi,PSP,2012.0,Action,Idea Factory,0.00,0.00,0.02,0.00,0.02
