In [127]:
# Initial imports
import pandas as pd
import numpy as np
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [273]:
# Read data
games_df = pd.read_csv('Cleaned_Data/all_columns_df.csv')
games_df

Unnamed: 0,Rank,Name,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,User_Score,Year,Country,Total_Sales
0,1,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,,2006.0,Japan,82.86
1,2,Super Mario Bros.,Platform,,NES,Nintendo,Nintendo EAD,10.0,,1985.0,Japan,40.24
2,3,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,9.1,2008.0,Japan,37.14
3,4,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,2017.0,,36.60
4,5,Wii Sports Resort,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,8.8,2009.0,Japan,33.09
...,...,...,...,...,...,...,...,...,...,...,...,...
19857,19858,FirePower for Microsoft Combat Flight Simulator 3,Simulation,T,PC,GMX Media,Shockwave Productions,,,2004.0,,0.01
19858,19859,Tom Clancy's Splinter Cell,Shooter,T,PC,Ubisoft,Ubisoft,,,2003.0,Europe,0.01
19859,19860,Ashita no Joe 2: The Anime Super Remix,Fighting,,PS2,Capcom,Capcom,,,2002.0,Japan,0.01
19860,19861,Tokyo Yamanote Boys for V: Main Disc,Adventure,,PSV,Rejet,Rejet,,,2017.0,,0.01


In [274]:
# Count no NaNs in each column
games_df.count()

Rank            19862
Name            19862
Genre           19862
ESRB_Rating     13925
Platform        19862
Publisher       19862
Developer_x     19860
Critic_Score     4706
User_Score        238
Year            19859
Country         11877
Total_Sales     19862
dtype: int64

In [275]:
# Drop columns
games_df.drop(['Rank', 'User_Score', 'Year', 'Critic_Score', 'Country'], axis=1, inplace=True)

In [276]:
games_df.count()

Name           19862
Genre          19862
ESRB_Rating    13925
Platform       19862
Publisher      19862
Developer_x    19860
Total_Sales    19862
dtype: int64

In [277]:
games_df.dropna().count()

Name           13923
Genre          13923
ESRB_Rating    13923
Platform       13923
Publisher      13923
Developer_x    13923
Total_Sales    13923
dtype: int64

In [278]:
games_df = games_df.dropna()
games_df.head(10)

Unnamed: 0,Name,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Total_Sales
0,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,82.86
2,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,37.14
4,Wii Sports Resort,Sports,E,Wii,Nintendo,Nintendo EAD,33.09
5,Pokemon Red / Green / Blue Version,Role-Playing,E,GB,Nintendo,Game Freak,31.38
6,New Super Mario Bros.,Platform,E,DS,Nintendo,Nintendo EAD,30.8
7,Tetris,Puzzle,E,GB,Nintendo,Bullet Proof Software,30.26
8,New Super Mario Bros. Wii,Platform,E,Wii,Nintendo,Nintendo EAD,30.22
11,Wii Play,Misc,E,Wii,Nintendo,Nintendo EAD,28.02
12,Kinect Adventures!,Party,E,X360,Microsoft Game Studios,Good Science Studio,24.0
13,Nintendogs,Simulation,E,DS,Nintendo,Nintendo EAD,23.96


In [280]:
cleaned_games_df = games_df.copy()

In [281]:
games_df.nunique()

Name           8848
Genre            20
ESRB_Rating       6
Platform         31
Publisher       432
Developer_x    2250
Total_Sales     626
dtype: int64

In [282]:
# New dataframe to hold the game's name
name_df = games_df.filter(['Name'])
name_df

Unnamed: 0,Name
0,Wii Sports
2,Mario Kart Wii
4,Wii Sports Resort
5,Pokemon Red / Green / Blue Version
6,New Super Mario Bros.
...,...
19850,This Is The Police
19854,Nancy Drew: The Deadly Secret of Olde World Park
19856,Donkey Kong
19857,FirePower for Microsoft Combat Flight Simulator 3


In [283]:
# Drop 'Name' column
games_df.drop(['Name'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df.drop(['Name'], axis=1, inplace=True)


In [284]:
games_df.Genre.value_counts()

Action              2233
Sports              2009
Shooter             1332
Racing              1268
Misc                1163
Role-Playing        1123
Adventure            937
Platform             923
Simulation           795
Fighting             683
Puzzle               515
Strategy             500
Action-Adventure     256
Music                103
MMO                   30
Party                 29
Visual Novel          20
Education              2
Sandbox                1
Board Game             1
Name: Genre, dtype: int64

In [285]:
# Keep top 14 of Genre
top_Genre = games_df.Genre.value_counts().index[0:15]
games_df.Genre = np.where(games_df.Genre.isin(top_Genre), games_df.Genre, 'other')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df.Genre = np.where(games_df.Genre.isin(top_Genre), games_df.Genre, 'other')


In [286]:
games_df.Genre.value_counts()

Action              2233
Sports              2009
Shooter             1332
Racing              1268
Misc                1163
Role-Playing        1123
Adventure            937
Platform             923
Simulation           795
Fighting             683
Puzzle               515
Strategy             500
Action-Adventure     256
Music                103
other                 53
MMO                   30
Name: Genre, dtype: int64

In [287]:
games_df.Platform.value_counts()

PS2     1695
DS      1613
Wii     1184
X360    1160
PC      1059
PS3     1013
PS       970
XB       831
GBA      747
PS4      664
PSP      634
GC       545
XOne     433
3DS      344
N64      291
NS       207
PSV      182
WiiU     143
SAT       61
GB        48
SNES      34
DC        31
GBC        9
PSN        9
XBL        5
GEN        4
VC         3
NES        1
SCD        1
OSX        1
3DO        1
Name: Platform, dtype: int64

In [288]:
# Keep top 14 Platform
#top_Platform = games_df.Platform.value_counts().where(games_df.Platform.value_counts() > 100).index
top_Platform = games_df.Platform.value_counts().index[0:15]
games_df.Platform = np.where(games_df.Platform.isin(top_Platform), games_df.Platform, 'other')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df.Platform = np.where(games_df.Platform.isin(top_Platform), games_df.Platform, 'other')


In [289]:
games_df.Platform.value_counts()

PS2      1695
DS       1613
Wii      1184
X360     1160
PC       1059
PS3      1013
PS        970
XB        831
GBA       747
other     740
PS4       664
PSP       634
GC        545
XOne      433
3DS       344
N64       291
Name: Platform, dtype: int64

In [290]:
top_Publisher = games_df.Publisher.value_counts().index[0:15]
games_df.Publisher = np.where(games_df.Publisher.isin(top_Publisher), games_df.Publisher, 'other')
games_df.Publisher.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df.Publisher = np.where(games_df.Publisher.isin(top_Publisher), games_df.Publisher, 'other')


other                          6636
Activision                      970
Ubisoft                         888
Electronic Arts                 787
THQ                             700
Nintendo                        546
EA Sports                       517
Konami                          469
Sony Computer Entertainment     466
Sega                            466
Capcom                          371
Atari                           261
Midway Games                    242
Atlus                           207
Square Enix                     202
Majesco                         195
Name: Publisher, dtype: int64

In [291]:
games_df.Developer_x.value_counts()

EA Canada                     277
Capcom                        232
Konami                        201
EA Tiburon                    191
Ubisoft Montreal              159
                             ... 
Team Ramrod                     1
SVG Distribution                1
Hypnos                          1
Unique Development Studios      1
Shockwave Productions           1
Name: Developer_x, Length: 2250, dtype: int64

In [292]:
top_Developer_x = games_df.Developer_x.value_counts().index[0:15]
games_df.Developer_x = np.where(games_df.Developer_x.isin(top_Developer_x), games_df.Developer_x, 'other')
games_df.Developer_x.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df.Developer_x = np.where(games_df.Developer_x.isin(top_Developer_x), games_df.Developer_x, 'other')


other                11820
EA Canada              277
Capcom                 232
Konami                 201
EA Tiburon             191
Ubisoft Montreal       159
Ubisoft                157
Traveller's Tales      150
Visual Concepts        131
Omega Force            111
Square Enix             90
Vicarious Visions       88
Namco                   82
Unknown                 80
THQ                     79
Nintendo EAD            75
Name: Developer_x, dtype: int64

In [293]:
games_df.nunique()

Genre           16
ESRB_Rating      6
Platform        16
Publisher       16
Developer_x     16
Total_Sales    626
dtype: int64

In [294]:
games_df

Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Total_Sales
0,Sports,E,Wii,Nintendo,Nintendo EAD,82.86
2,Racing,E,Wii,Nintendo,Nintendo EAD,37.14
4,Sports,E,Wii,Nintendo,Nintendo EAD,33.09
5,Role-Playing,E,other,Nintendo,other,31.38
6,Platform,E,DS,Nintendo,Nintendo EAD,30.80
...,...,...,...,...,...,...
19850,Simulation,M,XOne,other,other,0.01
19854,Adventure,E,DS,Majesco,other,0.01
19856,Platform,E,3DS,Nintendo,other,0.01
19857,Simulation,T,PC,other,other,0.01


In [295]:
games_df.dtypes

Genre           object
ESRB_Rating     object
Platform        object
Publisher       object
Developer_x     object
Total_Sales    float64
dtype: object

In [296]:
# Encoding object dtype columns
cat = games_df.select_dtypes(include='object')
cat = list(cat.columns)
cat

['Genre', 'ESRB_Rating', 'Platform', 'Publisher', 'Developer_x']

In [297]:
from sklearn.preprocessing import OneHotEncoder

# creating instance of one-hot-encoder
enc = OneHotEncoder(sparse=False)
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(games_df[cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(cat)

encode_df



Unnamed: 0,Genre_Action,Genre_Action-Adventure,Genre_Adventure,Genre_Fighting,Genre_MMO,Genre_Misc,Genre_Music,Genre_Platform,Genre_Puzzle,Genre_Racing,...,Developer_x_Omega Force,Developer_x_Square Enix,Developer_x_THQ,Developer_x_Traveller's Tales,Developer_x_Ubisoft,Developer_x_Ubisoft Montreal,Developer_x_Unknown,Developer_x_Vicarious Visions,Developer_x_Visual Concepts,Developer_x_other
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
13919,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
13920,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
13921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [298]:
# Reset X dataframe index to merge with encode_df
games_df.reset_index(drop=True, inplace=True)
games_df

Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Total_Sales
0,Sports,E,Wii,Nintendo,Nintendo EAD,82.86
1,Racing,E,Wii,Nintendo,Nintendo EAD,37.14
2,Sports,E,Wii,Nintendo,Nintendo EAD,33.09
3,Role-Playing,E,other,Nintendo,other,31.38
4,Platform,E,DS,Nintendo,Nintendo EAD,30.80
...,...,...,...,...,...,...
13918,Simulation,M,XOne,other,other,0.01
13919,Adventure,E,DS,Majesco,other,0.01
13920,Platform,E,3DS,Nintendo,other,0.01
13921,Simulation,T,PC,other,other,0.01


In [299]:
# Merge one-hot encoded features and drop the originals
games_df = games_df.merge(encode_df, left_index=True, right_index=True)
games_df = games_df.drop(cat,1)
games_df

  games_df = games_df.drop(cat,1)


Unnamed: 0,Total_Sales,Genre_Action,Genre_Action-Adventure,Genre_Adventure,Genre_Fighting,Genre_MMO,Genre_Misc,Genre_Music,Genre_Platform,Genre_Puzzle,...,Developer_x_Omega Force,Developer_x_Square Enix,Developer_x_THQ,Developer_x_Traveller's Tales,Developer_x_Ubisoft,Developer_x_Ubisoft Montreal,Developer_x_Unknown,Developer_x_Vicarious Visions,Developer_x_Visual Concepts,Developer_x_other
0,82.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,37.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,33.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,31.38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,30.80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13918,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
13919,0.01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
13920,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
13921,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [300]:
X_scaled = StandardScaler().fit_transform(games_df)
X_scaled

array([[47.54626185, -0.43705602, -0.13686216, ..., -0.07975387,
        -0.09745902, -2.37076825],
       [21.10692004, -0.43705602, -0.13686216, ..., -0.07975387,
        -0.09745902, -2.37076825],
       [18.76485236, -0.43705602, -0.13686216, ..., -0.07975387,
        -0.09745902, -2.37076825],
       ...,
       [-0.36492514, -0.43705602, -0.13686216, ..., -0.07975387,
        -0.09745902,  0.4218042 ],
       [-0.36492514, -0.43705602, -0.13686216, ..., -0.07975387,
        -0.09745902,  0.4218042 ],
       [-0.36492514, -0.43705602, -0.13686216, ..., -0.07975387,
        -0.09745902, -2.37076825]])

In [301]:
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)
X_pca

array([[ 6.75857329,  2.27452754, 11.97144946],
       [ 4.47869988,  0.11506312,  8.43606817],
       [ 5.23100909,  0.69109569,  7.63097131],
       ...,
       [ 1.26216721, -1.73469317,  3.30764218],
       [-1.6768045 , -0.55157437, -0.95257383],
       [-0.40696843,  2.78757376,  2.22889794]])

In [302]:
pcs_df = pd.DataFrame(
    data = X_pca, columns = ["PC 1", "PC 2", "PC 3"]
)
print(pcs_df.shape)
pcs_df.head(10)

(13923, 3)


Unnamed: 0,PC 1,PC 2,PC 3
0,6.758573,2.274528,11.971449
1,4.4787,0.115063,8.436068
2,5.231009,0.691096,7.630971
3,1.403533,0.052318,5.131891
4,4.363913,-0.257689,9.228507
5,2.034606,-1.145621,5.171257
6,4.342474,0.009209,8.807013
7,4.129715,-0.214055,8.185166
8,0.547952,-0.148903,2.133548
9,3.919873,-0.619879,8.0829


In [303]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))

# Find the best K
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [304]:
# Initialize the K-Means model.
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

# Add the predicted class columns
pcs_df["Class"] = model.labels_
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3,Class
0,6.758573,2.274528,11.971449,2
1,4.478700,0.115063,8.436068,2
2,5.231009,0.691096,7.630971,2
3,1.403533,0.052318,5.131891,2
4,4.363913,-0.257689,9.228507,2
...,...,...,...,...
13918,-1.520359,-0.056825,-0.555643,0
13919,0.763302,-2.244733,1.175015,1
13920,1.262167,-1.734693,3.307642,1
13921,-1.676804,-0.551574,-0.952574,0


In [308]:
cleaned_games_df.reset_index(drop=True, inplace=True)
clustered_df = pd.concat([cleaned_games_df, pcs_df], axis = 1)
clustered_df.head(20)

Unnamed: 0,Name,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Total_Sales,PC 1,PC 2,PC 3,Class
0,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,82.86,6.758573,2.274528,11.971449,2
1,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,37.14,4.4787,0.115063,8.436068,2
2,Wii Sports Resort,Sports,E,Wii,Nintendo,Nintendo EAD,33.09,5.231009,0.691096,7.630971,2
3,Pokemon Red / Green / Blue Version,Role-Playing,E,GB,Nintendo,Game Freak,31.38,1.403533,0.052318,5.131891,2
4,New Super Mario Bros.,Platform,E,DS,Nintendo,Nintendo EAD,30.8,4.363913,-0.257689,9.228507,2
5,Tetris,Puzzle,E,GB,Nintendo,Bullet Proof Software,30.26,2.034606,-1.145621,5.171257,2
6,New Super Mario Bros. Wii,Platform,E,Wii,Nintendo,Nintendo EAD,30.22,4.342474,0.009209,8.807013,2
7,Wii Play,Misc,E,Wii,Nintendo,Nintendo EAD,28.02,4.129715,-0.214055,8.185166,2
8,Kinect Adventures!,Party,E,X360,Microsoft Game Studios,Good Science Studio,24.0,0.547952,-0.148903,2.133548,1
9,Nintendogs,Simulation,E,DS,Nintendo,Nintendo EAD,23.96,3.919873,-0.619879,8.0829,2


In [306]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    width=800,
    hover_name="Name",
    hover_data=["Genre"]  
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [311]:
games_class_df = pd.DataFrame(
    data = clustered_df,
    columns = ['Name', 'Class']
)
games_class_df

Unnamed: 0,Name,Class
0,Wii Sports,2
1,Mario Kart Wii,2
2,Wii Sports Resort,2
3,Pokemon Red / Green / Blue Version,2
4,New Super Mario Bros.,2
...,...,...
13918,This Is The Police,0
13919,Nancy Drew: The Deadly Secret of Olde World Park,1
13920,Donkey Kong,1
13921,FirePower for Microsoft Combat Flight Simulator 3,0


In [337]:
df = pd.read_csv('Cleaned_Data/all_columns_df.csv')
df

Unnamed: 0,Rank,Name,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,User_Score,Year,Country,Total_Sales
0,1,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,,2006.0,Japan,82.86
1,2,Super Mario Bros.,Platform,,NES,Nintendo,Nintendo EAD,10.0,,1985.0,Japan,40.24
2,3,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,9.1,2008.0,Japan,37.14
3,4,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,2017.0,,36.60
4,5,Wii Sports Resort,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,8.8,2009.0,Japan,33.09
...,...,...,...,...,...,...,...,...,...,...,...,...
19857,19858,FirePower for Microsoft Combat Flight Simulator 3,Simulation,T,PC,GMX Media,Shockwave Productions,,,2004.0,,0.01
19858,19859,Tom Clancy's Splinter Cell,Shooter,T,PC,Ubisoft,Ubisoft,,,2003.0,Europe,0.01
19859,19860,Ashita no Joe 2: The Anime Super Remix,Fighting,,PS2,Capcom,Capcom,,,2002.0,Japan,0.01
19860,19861,Tokyo Yamanote Boys for V: Main Disc,Adventure,,PSV,Rejet,Rejet,,,2017.0,,0.01


In [354]:
merged_df = pd.merge(df, games_class_df, left_on=['Name'], right_on=['Name'])
merged_df.head(50)

Unnamed: 0,Rank,Name,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,User_Score,Year,Country,Total_Sales,Class
0,1,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,,2006.0,Japan,82.86,2
1,3,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,9.1,2008.0,Japan,37.14,2
2,4,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,2017.0,,36.6,0
3,4,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,2017.0,,36.6,0
4,906,PlayerUnknown's Battlegrounds,Shooter,T,XOne,Microsoft Studios,PUBG Corporation,,,2017.0,,2.17,0
5,906,PlayerUnknown's Battlegrounds,Shooter,T,XOne,Microsoft Studios,PUBG Corporation,,,2017.0,,2.17,0
6,6315,PlayerUnknown's Battlegrounds,Shooter,T,PS4,Sony Interactive Entertainment,PUBG Corporation,,,2018.0,,0.33,0
7,6315,PlayerUnknown's Battlegrounds,Shooter,T,PS4,Sony Interactive Entertainment,PUBG Corporation,,,2018.0,,0.33,0
8,5,Wii Sports Resort,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,8.8,2009.0,Japan,33.09,2
9,6,Pokemon Red / Green / Blue Version,Role-Playing,E,GB,Nintendo,Game Freak,9.4,,1998.0,Japan,31.38,2


In [356]:
merged_df.drop_duplicates(subset=['Rank'], keep='first', inplace=True, ignore_index=True)
merged_df

Unnamed: 0,Rank,Name,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,User_Score,Year,Country,Total_Sales,Class
0,1,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,,2006.0,Japan,82.86,2
1,3,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,9.1,2008.0,Japan,37.14,2
2,4,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,2017.0,,36.60,0
3,906,PlayerUnknown's Battlegrounds,Shooter,T,XOne,Microsoft Studios,PUBG Corporation,,,2017.0,,2.17,0
4,6315,PlayerUnknown's Battlegrounds,Shooter,T,PS4,Sony Interactive Entertainment,PUBG Corporation,,,2018.0,,0.33,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14307,19833,The Aly & AJ Adventure,Adventure,E,DS,"Destination Software, Inc",Frame Studios Interactive,,,2007.0,,0.01,1
14308,19837,Farming Simulator 17: Big Bud,Simulation,E,PC,Maximum Games,Giants Software,,,2017.0,,0.01,1
14309,19839,Medieval: Total War Gold Edition,Strategy,T,PC,Sega,The Creative Assembly,,,2006.0,United States,0.01,0
14310,19855,Nancy Drew: The Deadly Secret of Olde World Park,Adventure,E,DS,Majesco,Gorilla Systems,,,2007.0,,0.01,1


In [357]:
merged_df.count()

Rank            14312
Name            14312
Genre           14312
ESRB_Rating     13923
Platform        14312
Publisher       14312
Developer_x     14312
Critic_Score     4590
User_Score        229
Year            14311
Country          9599
Total_Sales     14312
Class           14312
dtype: int64