In [1]:
import pandas as pd
import numpy as np
import re

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Combining the data files into one data frame
file_name1 = "games_data/lowest_rated_games.csv"
file_name2 = "games_data/top_rated_games.csv"
df = pd.concat(
    map(pd.read_csv, [file_name1, file_name2]), ignore_index=True)

In [3]:
# Deletion of the index column - "Unnamed: 0"
df.drop("Unnamed: 0", axis=1, inplace=True)
df.head()

Unnamed: 0,game_name,platform,publisher,release_date,meta_scroe,user_score,develeoper,genres,num_of_players,rating,user_positive_review,user_negative_review,user_mixed_review,critic_positive_review,critic_negative_review,critic_mixed_review
0,Tharsis,PC,Choice Provisions,"Jan 11, 2016",61.0,4.8,Choice Provisions,"Miscellaneous, Board / Card Game",No Online Multiplayer,M,19,27,11.0,3.0,3.0,17.0
1,WWI Tannenberg: Eastern Front,PlayStation 4,Blackmill Games,"Jul 24, 2020",61.0,8.3,M2H,"Action, Shooter, First-Person, Tactical",Up to 40,M,4,0,2.0,3.0,1.0,6.0
2,Dungeons 2,PlayStation 4,Kalypso,"May 24, 2016",61.0,6.6,Kalypso,"Strategy, General",Up to 4,T,15,6,4.0,1.0,1.0,7.0
3,WE ARE DOOMED,PlayStation 4,Vertex Pop,"Apr 14, 2015",61.0,5.9,Vertex Pop,"Action, Shooter, Shoot-'Em-Up, Horizontal",,E,10,9,3.0,2.0,1.0,6.0
4,Sixty Second Shooter Prime,Xbox One,Happion Laboratories,"Jun 18, 2014",61.0,7.1,Happion Laboratories,"Action, Shooter, Static, Shoot-'Em-Up, Top-Down",,E,9,2,5.0,0.0,1.0,7.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7998 entries, 0 to 7997
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   game_name               7998 non-null   object 
 1   platform                7998 non-null   object 
 2   publisher               7993 non-null   object 
 3   release_date            7998 non-null   object 
 4   meta_scroe              7994 non-null   float64
 5   user_score              7997 non-null   object 
 6   develeoper              7987 non-null   object 
 7   genres                  7998 non-null   object 
 8   num_of_players          6491 non-null   object 
 9   rating                  7354 non-null   object 
 10  user_positive_review    7512 non-null   object 
 11  user_negative_review    7512 non-null   object 
 12  user_mixed_review       7512 non-null   object 
 13  critic_positive_review  7996 non-null   float64
 14  critic_negative_review  7996 non-null   

In [5]:
# Dropping the nan values for: 'meta_scroe', 'publisher', 'develeoper'
df.dropna(subset=['meta_scroe'], inplace=True)
df.dropna(subset=['publisher'], inplace=True)
df.dropna(subset=['develeoper'], inplace=True)

In [6]:
# In user_score there are some 'tbd' values 
# 'tbd' = to be declade
df[df['user_score'] == "tbd"].head()

Unnamed: 0,game_name,platform,publisher,release_date,meta_scroe,user_score,develeoper,genres,num_of_players,rating,user_positive_review,user_negative_review,user_mixed_review,critic_positive_review,critic_negative_review,critic_mixed_review
25,King Arthur,Xbox,Konami,"Nov 16, 2004",61.0,tbd,Krome Studios,"Action Adventure, Fantasy",1-2,T,0.0,1.0,1.0,2.0,2.0,11.0
30,The Messenger (2001),PC,DreamCatcher Interactive,"Feb 14, 2001",61.0,tbd,Index Multimedia,"Adventure, General, General",1 Player,T,,,,2.0,1.0,11.0
32,Hunter's Legacy,Xbox One,Lienzo,"Jan 20, 2017",61.0,tbd,Lienzo,"Action, Action Adventure, Platformer, Open-Wor...",No Online Multiplayer,E,0.0,0.0,1.0,1.0,1.0,5.0
38,Nikoli's Pencil Puzzle,3DS,Konami,"Oct 25, 2011",61.0,tbd,NATSUME ATARI Inc.,"Miscellaneous, Puzzle, Puzzle, Puzzle, Logic, ...",,E,2.0,1.0,0.0,1.0,0.0,6.0
41,Maglam Lord,PlayStation 4,PQube,"Feb 4, 2022",61.0,tbd,Felistella,"Role-Playing, Action RPG",No Online Multiplayer,T,,,,0.0,0.0,7.0


In [7]:
# changing them to nan values
df.replace("tbd", np.nan, inplace=True)
df[df["user_score"] == "tbd"]

Unnamed: 0,game_name,platform,publisher,release_date,meta_scroe,user_score,develeoper,genres,num_of_players,rating,user_positive_review,user_negative_review,user_mixed_review,critic_positive_review,critic_negative_review,critic_mixed_review


In [8]:
# Filling the user_score nan values with the mean values
df["user_score"] = df["user_score"].astype("float64")
df["user_score"].fillna(df["user_score"].mean(), inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7979 entries, 0 to 7997
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   game_name               7979 non-null   object 
 1   platform                7979 non-null   object 
 2   publisher               7979 non-null   object 
 3   release_date            7979 non-null   object 
 4   meta_scroe              7979 non-null   float64
 5   user_score              7979 non-null   float64
 6   develeoper              7979 non-null   object 
 7   genres                  7979 non-null   object 
 8   num_of_players          6480 non-null   object 
 9   rating                  7342 non-null   object 
 10  user_positive_review    7498 non-null   object 
 11  user_negative_review    7498 non-null   object 
 12  user_mixed_review       7498 non-null   object 
 13  critic_positive_review  7979 non-null   float64
 14  critic_negative_review  7979 non-null   

In [10]:
# הנתונים בעלי המיספרים הגדולים עם פסיקים במיספרים
# ולכן עמרה פשוטה לסוג אינט לא תעזור פה
df["user_positive_review"][6000]

'1,293'

In [11]:
# עמרה של כל הנתוני review
# בעלי המיספרים הגבועים
colums = ["user_positive_review", "user_negative_review", "user_mixed_review"]
for column in colums:
    nums = []
    for string in df[column]:
        if(type(string) != float):
            nums.append(int(string.replace(",", "")))
        else:
            nums.append(np.nan)
    df[column] = nums

In [12]:
columns = [
    "user_positive_review", "user_negative_review", "user_mixed_review",
    "critic_positive_review", "critic_negative_review", "critic_mixed_review"
]
for column in columns:
    df[column] = df[column].astype("float")
    df[column].fillna(df[column].mean(), inplace=True)

In [13]:
df[df["user_negative_review"].isna()]

Unnamed: 0,game_name,platform,publisher,release_date,meta_scroe,user_score,develeoper,genres,num_of_players,rating,user_positive_review,user_negative_review,user_mixed_review,critic_positive_review,critic_negative_review,critic_mixed_review


In [14]:
# making a new normalize column of 'user_positive_review'
user_review = df.iloc[:,10:13].copy()
user_review_sum = user_review["user_positive_review"] + user_review["user_negative_review"] + user_review["user_mixed_review"]
user_review["user_positive_normalize"] = user_review["user_positive_review"] / user_review_sum

df["user_positive_normelize"] = user_review["user_positive_normalize"]


In [15]:
# making a new normalize column of 'critic_positive_review'
critic_review = df.iloc[:,-4:-1].copy()
critic_review_sum = critic_review["critic_positive_review"] + critic_review["critic_mixed_review"] + critic_review["critic_negative_review"]
critic_review["critic_positive_normalize"] = critic_review["critic_positive_review"] / critic_review_sum

df["critic_positive_normelize"] = critic_review["critic_positive_normalize"]


In [16]:
df.drop(columns, axis=1, inplace=True)
df.head()

Unnamed: 0,game_name,platform,publisher,release_date,meta_scroe,user_score,develeoper,genres,num_of_players,rating,user_positive_normelize,critic_positive_normelize
0,Tharsis,PC,Choice Provisions,"Jan 11, 2016",61.0,4.8,Choice Provisions,"Miscellaneous, Board / Card Game",No Online Multiplayer,M,0.17727,0.130435
1,WWI Tannenberg: Eastern Front,PlayStation 4,Blackmill Games,"Jul 24, 2020",61.0,8.3,M2H,"Action, Shooter, First-Person, Tactical",Up to 40,M,0.061367,0.3
2,Dungeons 2,PlayStation 4,Kalypso,"May 24, 2016",61.0,6.6,Kalypso,"Strategy, General",Up to 4,T,0.182523,0.111111
3,WE ARE DOOMED,PlayStation 4,Vertex Pop,"Apr 14, 2015",61.0,5.9,Vertex Pop,"Action, Shooter, Shoot-'Em-Up, Horizontal",,E,0.124717,0.222222
4,Sixty Second Shooter Prime,Xbox One,Happion Laboratories,"Jun 18, 2014",61.0,7.1,Happion Laboratories,"Action, Shooter, Static, Shoot-'Em-Up, Top-Down",,E,0.124686,0.0


In [17]:
# Adding 'max_players' and 'online_game' columns
# With the information in the 'num_of_players' column
num_of_players_unique = list(df["num_of_players"].unique())

replace_map_for_max_players = {}
replace_map_for_online_game = {}

for val in num_of_players_unique:
    if(type(val) != float):
        max_player = np.nan
        max_player_list = [int(s) for s in re.findall(r'\b\d+\b', val)]
        if(max_player_list):
            max_player = max(max_player_list)
            replace_map_for_max_players[val] = max_player
        else:
            replace_map_for_max_players[val] = 1

        
        online = ('Online' in val) and ('No Online' not in val)
        if(max_player != np.nan):
            if(max_player >= 4):
                online = True
        
        if(online):
            replace_map_for_online_game[val] = 1
        else:
            replace_map_for_online_game[val] = 0
            
    else:
        replace_map_for_max_players[val] = 1
        replace_map_for_online_game[val] = 0
            
df["max_players"] = df["num_of_players"].copy()
df["online_game"] = df["num_of_players"].copy()

df["max_players"].replace(replace_map_for_max_players, inplace=True)
df["online_game"].replace(replace_map_for_online_game, inplace=True)

df.drop("num_of_players", axis=1, inplace=True)

In [18]:
# 1 = online
# 0 = not online
df["online_game"].unique()

array([0, 1], dtype=int64)

In [19]:
df["online_game"].value_counts()

0    5657
1    2322
Name: online_game, dtype: int64

In [20]:
df["max_players"].unique()

array([ 1, 40,  4,  2,  3, 12,  8,  6,  5, 10, 16, 64, 32, 44,  9, 24, 20,
       60, 30, 14, 22, 18, 15], dtype=int64)

In [21]:
df["max_players"].value_counts()

1     5030
2      887
4      848
8      468
16     161
6      123
12     100
10      94
32      56
3       36
64      32
22      28
24      25
20      23
5       21
18      19
14       6
40       5
30       5
9        4
60       4
44       3
15       1
Name: max_players, dtype: int64

In [22]:
# df = pd.get_dummies(df, columns=["online_game"], prefix=["online_game"])

In [23]:
df.head()

Unnamed: 0,game_name,platform,publisher,release_date,meta_scroe,user_score,develeoper,genres,rating,user_positive_normelize,critic_positive_normelize,max_players,online_game
0,Tharsis,PC,Choice Provisions,"Jan 11, 2016",61.0,4.8,Choice Provisions,"Miscellaneous, Board / Card Game",M,0.17727,0.130435,1,0
1,WWI Tannenberg: Eastern Front,PlayStation 4,Blackmill Games,"Jul 24, 2020",61.0,8.3,M2H,"Action, Shooter, First-Person, Tactical",M,0.061367,0.3,40,1
2,Dungeons 2,PlayStation 4,Kalypso,"May 24, 2016",61.0,6.6,Kalypso,"Strategy, General",T,0.182523,0.111111,4,1
3,WE ARE DOOMED,PlayStation 4,Vertex Pop,"Apr 14, 2015",61.0,5.9,Vertex Pop,"Action, Shooter, Shoot-'Em-Up, Horizontal",E,0.124717,0.222222,1,0
4,Sixty Second Shooter Prime,Xbox One,Happion Laboratories,"Jun 18, 2014",61.0,7.1,Happion Laboratories,"Action, Shooter, Static, Shoot-'Em-Up, Top-Down",E,0.124686,0.0,1,0


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7979 entries, 0 to 7997
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   game_name                  7979 non-null   object 
 1   platform                   7979 non-null   object 
 2   publisher                  7979 non-null   object 
 3   release_date               7979 non-null   object 
 4   meta_scroe                 7979 non-null   float64
 5   user_score                 7979 non-null   float64
 6   develeoper                 7979 non-null   object 
 7   genres                     7979 non-null   object 
 8   rating                     7342 non-null   object 
 9   user_positive_normelize    7979 non-null   float64
 10  critic_positive_normelize  7979 non-null   float64
 11  max_players                7979 non-null   int64  
 12  online_game                7979 non-null   int64  
dtypes: float64(4), int64(2), object(7)
memory usage:

In [25]:
# We will change the names of the platforms to make them more general
df["platform"].unique()

array(['PC', 'PlayStation 4', 'Xbox One', 'PSP', '3DS', 'Switch',
       'Xbox 360', 'PlayStation 2', 'PlayStation 3', 'DS', 'Xbox',
       'Game Boy Advance', 'PlayStation', 'PlayStation Vita', 'Dreamcast',
       'PlayStation 5', 'GameCube', 'Wii U', 'Xbox Series X', 'Wii',
       'Nintendo 64'], dtype=object)

In [26]:
platform_replace_map = {
    'PSP': 'PlayStation',
    '3DS': 'Nintendo',
    'Switch': 'Nintendo',
    'DS': 'Nintendo',
    'Game Boy Advance': 'Nintendo',
    'Dreamcast': 'Old Platform',
    'GameCube': 'Old Platform',
    'Nintendo 64': 'Nintendo',
    'Stadia': 'PC'
}
platform_unique_list = list(df["platform"].unique())
for val in platform_unique_list:
    replaceXbox = re.findall(r'(Xbox) ', val)
    replacePlayStation = re.findall(r'(PlayStation) ', val)
    replaceWii = re.findall(r'(Wii) ', val)
    if(replaceXbox):
        platform_replace_map[val] = replaceXbox[0]
    if(replacePlayStation):
        platform_replace_map[val] = replacePlayStation[0]
    if(replaceWii):
        platform_replace_map[val] = replaceWii[0]

df["platform"].replace(platform_replace_map, inplace=True)

In [27]:
df["platform"].unique()

array(['PC', 'PlayStation', 'Xbox', 'Nintendo', 'Old Platform', 'Wii'],
      dtype=object)

In [28]:
# We will assign to each rating a value related to the order of the rating
df["rating"].unique()

array(['M', 'T', 'E', 'E10+', nan, 'RP', 'AO', 'K-A'], dtype=object)

In [29]:
# first we replace the nan valuse with the top value:
top_val = df["rating"].describe()[2]
df["rating"].fillna(top_val, inplace=True)

In [30]:
# RP - Rating Pending == 0
# E - Everyone 6 and older == 1
# E10+ - 10 and older == 2
# K-A - 10 and older == 2
# T - Teen 13 and older == 3
# m - 17+ == 4
# AO - Adults Only 18 years and older == 5

rating_replace_map = {
    'RP': 0, 'E': 1, 'E10+': 2,
    'K-A': 2, 'T': 3, 'M': 4, 'AO': 5}

df["rating"].replace(rating_replace_map, inplace=True)

In [31]:
# scaling the 'meta_scroe' and 'user_score' columns
min_max_scaler = preprocessing.MinMaxScaler()
meta_scroe_scaled = min_max_scaler.fit_transform(df[['meta_scroe']])
user_score_scaled = min_max_scaler.fit_transform(df[['user_score']])
df['meta_score'] = meta_scroe_scaled
df['user_score'] = user_score_scaled
df.drop("meta_scroe", axis=1, inplace=True)

In [32]:
# We will divide to two columns: 'release_month' and 'release_year'
df["release_date"].unique()

array(['Jan 11, 2016', 'Jul 24, 2020', 'May 24, 2016', ...,
       'Mar 27, 2013', 'Sep  7, 2016', 'Mar 14, 2019'], dtype=object)

In [33]:
import calendar

In [34]:
# Getting months by their numbers
month_number = {}
for index, month in enumerate(calendar.month_abbr):
    if(month):
        month_number[month.lower()] = index

# Divides into two columns:
release_date_list = list(df["release_date"])
release_year = []
release_month = []
for date in release_date_list:
    year = re.findall(r'\b\d+\b', date)[1]
    release_year.append(int(year))
    month = re.findall(r'(\w\w\w) ', date)[0]
    release_month.append(month_number[month.lower()])

df['release_year'] = release_year
df['release_month'] = release_month

In [35]:
df.drop("release_date", axis=1, inplace=True)

In [36]:
df.head()

Unnamed: 0,game_name,platform,publisher,user_score,develeoper,genres,rating,user_positive_normelize,critic_positive_normelize,max_players,online_game,meta_score,release_year,release_month
0,Tharsis,PC,Choice Provisions,0.478723,Choice Provisions,"Miscellaneous, Board / Card Game",4,0.17727,0.130435,1,0,0.568182,2016,1
1,WWI Tannenberg: Eastern Front,PlayStation,Blackmill Games,0.851064,M2H,"Action, Shooter, First-Person, Tactical",4,0.061367,0.3,40,1,0.568182,2020,7
2,Dungeons 2,PlayStation,Kalypso,0.670213,Kalypso,"Strategy, General",3,0.182523,0.111111,4,1,0.568182,2016,5
3,WE ARE DOOMED,PlayStation,Vertex Pop,0.595745,Vertex Pop,"Action, Shooter, Shoot-'Em-Up, Horizontal",1,0.124717,0.222222,1,0,0.568182,2015,4
4,Sixty Second Shooter Prime,Xbox,Happion Laboratories,0.723404,Happion Laboratories,"Action, Shooter, Static, Shoot-'Em-Up, Top-Down",1,0.124686,0.0,1,0,0.568182,2014,6


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7979 entries, 0 to 7997
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   game_name                  7979 non-null   object 
 1   platform                   7979 non-null   object 
 2   publisher                  7979 non-null   object 
 3   user_score                 7979 non-null   float64
 4   develeoper                 7979 non-null   object 
 5   genres                     7979 non-null   object 
 6   rating                     7979 non-null   int64  
 7   user_positive_normelize    7979 non-null   float64
 8   critic_positive_normelize  7979 non-null   float64
 9   max_players                7979 non-null   int64  
 10  online_game                7979 non-null   int64  
 11  meta_score                 7979 non-null   float64
 12  release_year               7979 non-null   int64  
 13  release_month              7979 non-null   int64

In [38]:
# The genres are listed in long lists
# We need to extract the main genres
df["genres"]

0                        Miscellaneous, Board / Card Game
1                 Action, Shooter, First-Person, Tactical
2                                       Strategy, General
3               Action, Shooter, Shoot-'Em-Up, Horizontal
4         Action, Shooter, Static, Shoot-'Em-Up, Top-Down
                              ...                        
7993    Strategy, Real-Time, General, General, Fantasy...
7994             Strategy, Turn-Based, Real-Time, Tactics
7995    Miscellaneous, Action Adventure, Edutainment, ...
7996                       Puzzle, Action, Platformer, 3D
7997          Action, Miscellaneous, Rhythm, Music, Music
Name: genres, Length: 7979, dtype: object

In [39]:
# Extracting the names of the genres
genres_names = set()
for genre in df["genres"]:
    genre_list = genre.split(", ")
    for name in genre_list:
        if(name):
            genres_names.add(name)
len(genres_names)

166

In [40]:
# To check which genres are the main ones, we will count each one 
# and define the main genres as the genres with the most iterations in the data frame
genres_count = {'': 0}
for genre in genres_names:
    counter = 0
    for val in df["genres"]:
        if(genre in val):
            counter = counter + 1
    genres_count[genre] = counter
len(genres_count)

167

In [41]:
main_genres_per_game = []
for game_genres in df["genres"]:
    genre_list = game_genres.split(", ")
    max_genre = max(genre_list, key=lambda val: genres_count[val])
    main_genres_per_game.append(max_genre)
df["main_genre"] = main_genres_per_game

In [42]:
df.drop("genres", axis=1, inplace=True)

In [43]:
columns_order = ["game_name", "platform", "main_genre",
                 "develeoper", "publisher", "max_players",
                 "online_game", "release_year", "release_month",
                 "rating", "meta_score", "user_score",
                 "critic_positive_normelize", "user_positive_normelize",
                ]
df = df[columns_order]
df.head()

Unnamed: 0,game_name,platform,main_genre,develeoper,publisher,max_players,online_game,release_year,release_month,rating,meta_score,user_score,critic_positive_normelize,user_positive_normelize
0,Tharsis,PC,Miscellaneous,Choice Provisions,Choice Provisions,1,0,2016,1,4,0.568182,0.478723,0.130435,0.17727
1,WWI Tannenberg: Eastern Front,PlayStation,Action,M2H,Blackmill Games,40,1,2020,7,4,0.568182,0.851064,0.3,0.061367
2,Dungeons 2,PlayStation,General,Kalypso,Kalypso,4,1,2016,5,3,0.568182,0.670213,0.111111,0.182523
3,WE ARE DOOMED,PlayStation,Action,Vertex Pop,Vertex Pop,1,0,2015,4,1,0.568182,0.595745,0.222222,0.124717
4,Sixty Second Shooter Prime,Xbox,Action,Happion Laboratories,Happion Laboratories,1,0,2014,6,1,0.568182,0.723404,0.0,0.124686


In [44]:
df["develeoper"] = df["develeoper"].astype('category')
df["publisher"] = df["publisher"].astype('category')

In [45]:
print(df[df["game_name"] == "Elden Ring"].mean())

max_players                     4.000000
online_game                     1.000000
release_year                 2022.000000
release_month                   2.000000
rating                          4.000000
meta_score                      0.958333
user_score                      0.758865
critic_positive_normelize       0.983333
user_positive_normelize         0.707012
dtype: float64


  print(df[df["game_name"] == "Elden Ring"].mean())


In [46]:
print(df[df["game_name"] == "Assassin's Creed Origins"].mean())

max_players                     1.000000
online_game                     0.000000
release_year                 2017.000000
release_month                  10.000000
rating                          4.000000
meta_score                      0.821970
user_score                      0.744681
critic_positive_normelize       0.872925
user_positive_normelize         0.656484
dtype: float64


  print(df[df["game_name"] == "Assassin's Creed Origins"].mean())


In [47]:
df[df["game_name"] == "Assassin's Creed Origins"]

Unnamed: 0,game_name,platform,main_genre,develeoper,publisher,max_players,online_game,release_year,release_month,rating,meta_score,user_score,critic_positive_normelize,user_positive_normelize
5769,Assassin's Creed Origins,Xbox,Action Adventure,Ubisoft,Ubisoft,1,0,2017,10,4,0.840909,0.787234,0.880952,0.743562
6447,Assassin's Creed Origins,PC,Action Adventure,Ubisoft,Ubisoft,1,0,2017,10,4,0.829545,0.702128,0.896552,0.584393
7995,Assassin's Creed Origins,PlayStation,Action Adventure,Ubisoft,Ubisoft,1,0,2017,10,4,0.795455,0.744681,0.84127,0.641498


In [50]:
duplicates = df[df["game_name"].duplicated(keep=False)].copy()
duplicates_names = list(duplicates["game_name"].unique())
df.reset_index(drop=True, inplace=True)

for duplicate_name in duplicates_names:
    duplicates_game = df[df["game_name"] == duplicate_name]
    
    meta_score_mean = duplicates_game["meta_score"].mean()
    user_score_mean = duplicates_game["user_score"].mean()
    critic_mean = duplicates_game["critic_positive_normelize"].mean()
    user_mean = duplicates_game["user_positive_normelize"].mean()
    
    row_index = df[df["game_name"] == duplicate_name].index[0]
    df.loc[row_index, "meta_score"] = meta_score_mean
    df.loc[row_index, "user_score"] = user_score_mean
    df.loc[row_index, "critic_positive_normelize"] = critic_mean
    df.loc[row_index, "user_positive_normelize"] = user_mean
    
    platforms = set()
    for val in duplicates_game["platform"]:
        platforms.add(val)
    platforms = sorted(list(platforms))
    df.loc[row_index, "platform"] = ", ".join(platforms)
    
    year_month_zip = zip(duplicates_game["release_year"], duplicates_game["release_month"])
    release_year = min(duplicates_game["release_year"])
    release_month = 12
    for year, month in year_month_zip:
        if(year == release_year and month < release_month):
            release_month = month
    df.loc[row_index, "release_year"] = release_year
    df.loc[row_index, "release_month"] = release_month

df.drop_duplicates(subset="game_name", inplace=True)
df.reset_index(drop=True, inplace=True)

In [51]:
exclusive_games_list = []
for platforms in df["platform"]:
    platforms_list = platforms.split(", ")
    if(len(platforms_list) != 1 or 'Old Platform' in platforms_list):
        exclusive_games_list.append(0)
    else:
        exclusive_games_list.append(1)
df['exclusive_game'] = exclusive_games_list

In [52]:
df[df["game_name"] == "Elden Ring"]

Unnamed: 0,game_name,platform,main_genre,develeoper,publisher,max_players,online_game,release_year,release_month,rating,meta_score,user_score,critic_positive_normelize,user_positive_normelize,exclusive_game
3114,Elden Ring,"PC, PlayStation, Xbox",Role-Playing,From Software,Bandai Namco Games,4,1,2022,2,4,0.958333,0.758865,0.983333,0.707012,0


In [53]:
df[df["game_name"] == "Assassin's Creed Origins"]

Unnamed: 0,game_name,platform,main_genre,develeoper,publisher,max_players,online_game,release_year,release_month,rating,meta_score,user_score,critic_positive_normelize,user_positive_normelize,exclusive_game
4296,Assassin's Creed Origins,"PC, PlayStation, Xbox",Action Adventure,Ubisoft,Ubisoft,1,0,2017,10,4,0.82197,0.744681,0.872925,0.656484,0


In [54]:
df[df["meta_score"] >= 0.9].head()

Unnamed: 0,game_name,platform,main_genre,develeoper,publisher,max_players,online_game,release_year,release_month,rating,meta_score,user_score,critic_positive_normelize,user_positive_normelize,exclusive_game
3088,The Legend of Zelda: Ocarina of Time,Nintendo,Action Adventure,Nintendo,Nintendo,1,0,1998,11,1,1.0,0.93617,1.0,0.894765,1
3089,Tony Hawk's Pro Skater 2,"Nintendo, Old Platform, PC, PlayStation",Sports,Neversoft Entertainment,Activision,2,0,2000,9,3,0.931818,0.76383,0.970909,0.716163,0
3090,Grand Theft Auto IV,"PC, PlayStation, Xbox",Action Adventure,Rockstar North,Rockstar Games,1,0,2008,4,4,0.958333,0.780142,0.983333,0.680995,0
3091,SoulCalibur,Old Platform,Action,Namco,Namco,2,0,1999,9,3,0.988636,0.861702,1.0,0.823383,0
3092,Super Mario Galaxy,Wii,Action,Nintendo,Nintendo,1,0,2007,11,1,0.977273,0.93617,1.0,0.910638,1


In [55]:
df["platform"].unique()

array(['Nintendo, PC', 'PlayStation', 'Xbox', 'PlayStation, Xbox',
       'Nintendo', 'Nintendo, PlayStation', 'PC, Xbox', 'PC',
       'Nintendo, PC, PlayStation, Xbox',
       'Old Platform, PlayStation, Xbox', 'PC, PlayStation, Xbox',
       'Nintendo, PlayStation, Wii, Xbox', 'PC, PlayStation',
       'Nintendo, Old Platform, PlayStation, Xbox', 'Nintendo, Xbox',
       'Old Platform', 'Nintendo, PC, PlayStation, Wii, Xbox',
       'Old Platform, PlayStation', 'Wii', 'Nintendo, PC, PlayStation',
       'Nintendo, PC, Xbox', 'Old Platform, PC', 'PC, Wii, Xbox',
       'PlayStation, Wii, Xbox', 'Nintendo, Old Platform',
       'Nintendo, Old Platform, PlayStation', 'PlayStation, Wii',
       'PC, PlayStation, Wii, Xbox', 'Nintendo, PlayStation, Xbox',
       'PC, PlayStation, Wii', 'Old Platform, PC, PlayStation, Xbox',
       'Old Platform, Xbox', 'Nintendo, Wii',
       'Old Platform, PlayStation, Wii', 'Nintendo, PlayStation, Wii',
       'Nintendo, Old Platform, PC, PlayStation, 

In [56]:
df["release_year"].unique()

array([2016, 2020, 2015, 2014, 2007, 2011, 2019, 2006, 2017, 2021, 2018,
       2013, 2010, 2009, 2008, 2004, 2002, 2001, 2005, 2022, 2003, 2000,
       2012, 1999, 1998, 1997, 1996, 1995], dtype=int64)

In [57]:
for string in df["platform"].unique():
    if "Nintendo" in string:
        temp = df[df["platform"] == string]["release_year"].unique()
        print(f"{string} --> {temp}")

Nintendo, PC --> [2016 2020 2002 2019 2004 2009 2005 2011 2017 2022 2018 2012 2021 2015
 2014 2010]
Nintendo --> [2011 2019 2004 2005 2021 2020 2002 2018 2009 2022 2007 2017 2012 2001
 2008 2015 2014 2006 2010 2003 2016 2013 2000 1998 1996 1999 1997]
Nintendo, PlayStation --> [2007 2021 2006 2005 2017 2018 2020 2022 2008 2011 2014 2019 2012 2009
 2000 2003 1999 1997 2016 2015 2013 2002]
Nintendo, PC, PlayStation, Xbox --> [2009 2017 2020 2002 2011 2006 2021 2019 2018 2022 2016 2012 2010 2015
 2007 2014]
Nintendo, PlayStation, Wii, Xbox --> [2011 2010 2008 2009 2007 2014 2013]
Nintendo, Old Platform, PlayStation, Xbox --> [2002 2006 2004 2005 2003]
Nintendo, Xbox --> [2017 2004 2008 2021 2020 2002 2006 2018 2022 2019]
Nintendo, PC, PlayStation, Wii, Xbox --> [2007 2008 2010 2014]
Nintendo, PC, PlayStation --> [2021 2018 2017 2002 2015 2019 2022 2016 2008 2014 2020]
Nintendo, PC, Xbox --> [2008 2016 2017 2003 2020 2021 2018 2022]
Nintendo, Old Platform --> [2005 2002 2000]
Nintendo, Old 

In [58]:
for string in df["platform"].unique():
    if "Old Platform" in string:
        temp = df[df["platform"] == string]["release_year"].unique()
        print(f"{string} --> {temp}")

Old Platform, PlayStation, Xbox --> [2004 2003 2002 2006 2005 2001 2000]
Nintendo, Old Platform, PlayStation, Xbox --> [2002 2006 2004 2005 2003]
Old Platform --> [2000 2001 2002 2003 2007 2005 1999 2004 2006]
Old Platform, PlayStation --> [2003 2002 2001 2004 2006 2005 2000 1999]
Old Platform, PC --> [2000 2001]
Nintendo, Old Platform --> [2005 2002 2000]
Nintendo, Old Platform, PlayStation --> [2005 2002]
Old Platform, PC, PlayStation, Xbox --> [2005 2003 2002 2000 2004]
Old Platform, Xbox --> [2001 2002 2004 2005 2003]
Old Platform, PlayStation, Wii --> [2006]
Nintendo, Old Platform, PC, PlayStation, Xbox --> [1998 2006 2001 2002]
Old Platform, Wii, Xbox --> [2002]
Old Platform, PC, PlayStation --> [2000 2005]
Old Platform, PC, PlayStation, Wii --> [2006]
Nintendo, Old Platform, Xbox --> [2006 2003]
Nintendo, Old Platform, PC, PlayStation --> [2000]
Old Platform, Wii --> [2006]


In [59]:
# platform
# main_genre
# develeoper
# publisher

In [60]:
# Labeling the 'publisher' and 'develeoper' columns
# df["publisher_labeled"] = LabelEncoder().fit_transform(df["publisher"])
# df["develeoper_labeled"] = LabelEncoder().fit_transform(df["develeoper"])
# df["main_genres_labeled"] = LabelEncoder().fit_transform(df["main_genres"])

# Labeling the 'platform' column
# df["platform_labeled"] = LabelEncoder().fit_transform(df["platform"])