## Data Preparation

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# load merged_df.pkl
with open('merged_df.pkl', 'rb') as f:
    merged_df = pickle.load(f)

In [3]:
merged_df.head()

Unnamed: 0,name,release_year,cooperative,platform,rating,user_rating,players,publisher,developer,summary,release_date,filename,genres
0,20th Century Video Almanac,1993.0,0.0,3DO Interactive Multiplayer,,6.559701,1.0,The Software Toolworks,The Software Toolworks,"In The Best of Our Century, we've taken multim...",NaT,,Education
1,3D Atlas,1994.0,0.0,3DO Interactive Multiplayer,,7.0,1.0,Electronic Arts,Electronic Arts,The World Isn't Flat. Why Should Your Atlas Be...,NaT,,Education
2,3DO Action Pak,1995.0,0.0,3DO Interactive Multiplayer,,7.413333,1.0,3DO,3DO,This is a four-game compilation pack that cont...,NaT,,Action
3,3DO de Shiru Miru Asobu Nakajima Miyuki,,0.0,3DO Interactive Multiplayer,,7.333333,1.0,Pony Canyon,,,NaT,,
4,3DO Demo Disc Program,,0.0,3DO Interactive Multiplayer,,6.745455,,,,A white binder with blue silk-screened art. Th...,NaT,,


check name-platform duplicates, should be none

In [4]:
duplicates = merged_df[merged_df.duplicated(subset=['name', 'platform'], keep=False)]
duplicates.sort_values(['name', 'platform'])
duplicates.head()

Unnamed: 0,name,release_year,cooperative,platform,rating,user_rating,players,publisher,developer,summary,release_date,filename,genres


extract year from date and fill missing years

In [5]:
def extract_year(date):
    if date is not None:
        return pd.to_datetime(date).year
    else:
        return None

In [6]:
merged_df['release_year'].fillna(merged_df['release_date'].apply(extract_year), inplace=True)

dealing with genres. some lists were comma separated and some used semicolons

In [7]:
# replace semicolons with commas
merged_df['genres'] = merged_df['genres'].str.replace(';', ',')

# to lowercase
merged_df['genres'] = merged_df['genres'].str.lower()

# Split the 'genres' column into lists
merged_df['genres'] = merged_df['genres'].str.split(',')


In [8]:
# Function to remove leading and trailing spaces and remove duplicates from a list
def clean_genre_list(genre_list):
    if genre_list is None:
        return None
    cleaned_list = [genre.strip() for genre in genre_list]
    cleaned_list = ['platformer' if genre == 'platform' else genre for genre in cleaned_list]
    return list(set(cleaned_list))

In [9]:
merged_df['genres'] = merged_df['genres'].apply(clean_genre_list)

miscellaneous preparations

In [10]:
merged_df.dtypes

name                    object
release_year           float64
cooperative             object
platform                object
rating                  object
user_rating             object
players                float64
publisher               object
developer               object
summary                 object
release_date    datetime64[ns]
filename                object
genres                  object
dtype: object

In [11]:
# convert columns to appropriate formats
merged_df['platform'] = merged_df['platform'].astype('category')

In [12]:
merged_df['rating'] = merged_df['rating'].astype('int')


TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [13]:
merged_df['user_rating'] = pd.to_numeric(merged_df['user_rating'], errors='coerce')

In [19]:
# replace all None with NaN
merged_df = merged_df.replace(np.nan, None)

In [20]:
def round_rating(rating):
    if rating is not None:
        return round(rating, 1)
    else:
        return None

In [23]:
# round user_rating to 1 decimal place
merged_df['user_rating'] = merged_df['user_rating'].apply(round_rating)

In [24]:
merged_df = merged_df.sort_values(['name', 'platform'])
merged_df.head()

Unnamed: 0,name,release_year,cooperative,platform,rating,user_rating,players,publisher,developer,summary,release_date,filename,genres
118507,! That Bastard Is Trying To Steal Our Gold !,,0.0,Windows,,,,WTFOMGames,WTFOMGames,Steal gold from the Lerpikon's dungeons! Get r...,NaT,,
117413,!!!Ants!!!,1979.0,0.0,Tandy TRS-80,,5.0,1.0,Synergistic Solar Inc.,Brian Rotolante,In Ants you can become an ant and join the ran...,NaT,,
118508,!AnyWay!,,0.0,Windows,,,,EYEFRONT,EYEFRONT,Do you like parkour? Dashing retro times? AnyW...,NaT,,
85692,"""300""",1975.0,0.0,Pinball,,7.0,4.0,Gottlieb,Gottlieb,"""300"" (the exact machine name includes the quo...",NaT,,[action]
85693,"""8 Ball""",1952.0,0.0,Pinball,,6.1,1.0,Williams,Williams,,NaT,,[action]
