# Universidad del Valle de Guatemala
# Inteligencia Artificial - CC3045
## Authors:
## Julio Herrera - 19402
## Juan Pablo Pineda - 19087
## Diego Crespo - 19541

### Dataset:
#### games_of_all_time.csv - Games of All Time from Metacritic - https://www.kaggle.com/datasets/xcherry/games-of-all-time-from-metacritic?resource=download
#### vgsales.csv - Video Game Sales - https://www.kaggle.com/datasets/gregorut/videogamesales

In [1]:
import pandas as pd
from ast import literal_eval
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import random

In [2]:
games_meta = pd.read_csv('games_of_all_time.csv')
games_sales = pd.read_csv('vgsales.csv')

# Juntando datasets

Removiendo variables que no se usarán

In [3]:
games_meta = games_meta.drop(columns=['description', 'url', 'type', 'rating'])
games_sales = games_sales.drop(columns=['Rank', 'Genre', 'Publisher'])

Renombrando variables para facilitar su lectura

In [4]:
games_meta.columns = ['name', 'meta_score', 'user_score', 'platform', 'developer', 'genre']
games_sales.columns = ['name', 'platform', 'year', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

Limpiando datos

In [5]:
# remove the 'HD' at the end of the name on every row
games_sales['name'] = games_sales['name'].str.replace(' HD', '')
games_sales['name'] = games_sales['name'].str.replace(' 3D', '')
games_meta['name'] = games_meta['name'].str.replace(' HD', '')
games_meta['name'] = games_meta['name'].str.replace(' 3D', '')

# create new rows by separating the platforms list
games_meta['platform'] = games_meta['platform'].apply(literal_eval)
games_meta = games_meta.explode('platform')

# drop columns with na value in genre and developer column
games_meta = games_meta.dropna(subset=['genre'])
games_meta = games_meta.dropna(subset=['developer'])

# create new rows by separating the genre list
games_meta['genre'] = games_meta['genre'].apply(literal_eval)
games_meta = games_meta.explode('genre')

# if platform is '3ds' change to '3DS' without using replace
games_meta['platform'] = games_meta['platform'].apply(lambda x: '3DS' if x == '3ds' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'N64' if x == 'nintendo-64' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'XB' if x == 'xbox' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'X360' if x == 'xbox-360' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'XOne' if x == 'xbox-one' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'PS' if x == 'playstation' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'PS2' if x == 'playstation-2' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'PS3' if x == 'playstation-3' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'PS4' if x == 'playstation-4' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'PSV' if x == 'playstation-vita' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'PSP' if x == 'psp' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'PC' if x == 'pc' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'Wii' if x == 'wii' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'WiiU' if x == 'wii-u' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'GC' if x == 'gamecube' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'DS' if x == 'ds' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'DC' if x == 'dreamcast' else x)
games_meta['platform'] = games_meta['platform'].apply(lambda x: 'GBA' if x == 'game-boy-advance' else x)

In [6]:
# Mergeando ambos datasets
games_meta_sales = pd.merge(games_meta, games_sales, on=['name', 'platform'])
games_meta_sales

Unnamed: 0,name,meta_score,user_score,platform,developer,genre,year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,The Legend of Zelda: Ocarina of Time,99.0,91.0,N64,Nintendo,Action Adventure,1998.0,4.10,1.89,1.45,0.16,7.60
1,The Legend of Zelda: Ocarina of Time,99.0,91.0,N64,Nintendo,Fantasy,1998.0,4.10,1.89,1.45,0.16,7.60
2,Super Mario Galaxy,97.0,91.0,Wii,Nintendo,Action,2007.0,6.16,3.40,1.20,0.76,11.52
3,Super Mario Galaxy,97.0,91.0,Wii,Nintendo,Platformer,2007.0,6.16,3.40,1.20,0.76,11.52
4,Super Mario Galaxy,97.0,91.0,Wii,Nintendo,3D,2007.0,6.16,3.40,1.20,0.76,11.52
...,...,...,...,...,...,...,...,...,...,...,...,...
15754,htoL#NiQ: The Firefly Diary,58.0,75.0,PSV,Nippon Ichi Software,General,2014.0,0.00,0.00,0.01,0.00,0.01
15755,htoL#NiQ: The Firefly Diary,58.0,75.0,PSV,Nippon Ichi Software,Platformer,2014.0,0.00,0.00,0.01,0.00,0.01
15756,htoL#NiQ: The Firefly Diary,58.0,75.0,PSV,Nippon Ichi Software,2D,2014.0,0.00,0.00,0.01,0.00,0.01
15757,uDraw Studio,71.0,71.0,Wii,THQ,Miscellaneous,2010.0,1.67,0.58,0.00,0.20,2.46


Categorizando

In [7]:
# Categorise values in platform column
platforms = games_meta_sales['platform'].values
platforms = np.unique(platforms)
platforms_dict = {}
for i in range(len(platforms)):
    platforms_dict[platforms[i]] = i
games_meta_sales['platform'] = games_meta_sales['platform'].apply(lambda x: platforms_dict[x])

In [8]:
# Categorise values in genre column
genres = games_meta_sales['genre'].values
genres = np.unique(genres)
genres_dict = {}
for i in range(len(genres)):
    genres_dict[genres[i]] = i
games_meta_sales['genre'] = games_meta_sales['genre'].apply(lambda x: genres_dict[x])

In [9]:
# Categorise values in developer column
developer = games_meta_sales['developer'].values
developer_dict = np.unique(developer)
developer_dict = {}
for i in range(len(developer)):
    developer_dict[developer[i]] = i
games_meta_sales['developer'] = games_meta_sales['developer'].apply(lambda x: developer_dict[x])
games_meta_sales

Unnamed: 0,name,meta_score,user_score,platform,developer,genre,year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,The Legend of Zelda: Ocarina of Time,99.0,91.0,5,15445,4,1998.0,4.10,1.89,1.45,0.16,7.60
1,The Legend of Zelda: Ocarina of Time,99.0,91.0,5,15445,40,1998.0,4.10,1.89,1.45,0.16,7.60
2,Super Mario Galaxy,97.0,91.0,13,15445,3,2007.0,6.16,3.40,1.20,0.76,11.52
3,Super Mario Galaxy,97.0,91.0,13,15445,96,2007.0,6.16,3.40,1.20,0.76,11.52
4,Super Mario Galaxy,97.0,91.0,13,15445,1,2007.0,6.16,3.40,1.20,0.76,11.52
...,...,...,...,...,...,...,...,...,...,...,...,...
15754,htoL#NiQ: The Firefly Diary,58.0,75.0,12,15756,51,2014.0,0.00,0.00,0.01,0.00,0.01
15755,htoL#NiQ: The Firefly Diary,58.0,75.0,12,15756,96,2014.0,0.00,0.00,0.01,0.00,0.01
15756,htoL#NiQ: The Firefly Diary,58.0,75.0,12,15756,0,2014.0,0.00,0.00,0.01,0.00,0.01
15757,uDraw Studio,71.0,71.0,13,15758,77,2010.0,1.67,0.58,0.00,0.20,2.46


# Analisis de variables

In [10]:
# histogram
printAnalisis = False
if printAnalisis:
    cuantitatives = ['meta_score', 'user_score', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
    for i in cuantitatives:
        plt.hist(games_meta_sales[i], bins=20)
        plt.title(i)
        plt.show()
        qqplot(games_meta_sales[i])
        plt.title(i)
        plt.show()
        print(i)
        print('skew: ', games_meta_sales[i].skew())
        print('kurtosis: ', games_meta_sales[i].kurtosis())
        print('\n')

In [11]:
# frequency bar plot
if printAnalisis:
    cualitative = ['platform', 'genre', 'developer', 'year']
    for i in cualitative:
        plt.figure(figsize=(20, 5))
        games_meta_sales[i].value_counts().plot(kind='bar')
        plt.title(i)
        plt.show()
        games_meta_sales[i].describe()
        if i == 'platform':
            print(platforms_dict)
        elif i == 'genre':
            print(genres_dict)
        elif i == 'developer':
            print(developer_dict)

# Aplicando modelo de RNA

Se añade una columna para referenciar al año en el que se calcularon las ventas del dataset (2020)

In [12]:
games_meta_sales['time_on_market'] = 2020 - games_meta_sales['year']

In [13]:
# Y is sales of different regions
Y = games_meta_sales[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']].values
Y

array([[4.100e+00, 1.890e+00, 1.450e+00, 1.600e-01, 7.600e+00],
       [4.100e+00, 1.890e+00, 1.450e+00, 1.600e-01, 7.600e+00],
       [6.160e+00, 3.400e+00, 1.200e+00, 7.600e-01, 1.152e+01],
       ...,
       [0.000e+00, 0.000e+00, 1.000e-02, 0.000e+00, 1.000e-02],
       [1.670e+00, 5.800e-01, 0.000e+00, 2.000e-01, 2.460e+00],
       [1.670e+00, 5.800e-01, 0.000e+00, 2.000e-01, 2.460e+00]])

In [14]:
X = games_meta_sales[['meta_score', 'user_score', 'time_on_market', 'platform', 'genre', 'developer']].values
X

array([[9.9000e+01, 9.1000e+01, 2.2000e+01, 5.0000e+00, 4.0000e+00,
        1.5445e+04],
       [9.9000e+01, 9.1000e+01, 2.2000e+01, 5.0000e+00, 4.0000e+01,
        1.5445e+04],
       [9.7000e+01, 9.1000e+01, 1.3000e+01, 1.3000e+01, 3.0000e+00,
        1.5445e+04],
       ...,
       [5.8000e+01, 7.5000e+01, 6.0000e+00, 1.2000e+01, 0.0000e+00,
        1.5756e+04],
       [7.1000e+01, 7.1000e+01, 1.0000e+01, 1.3000e+01, 7.7000e+01,
        1.5758e+04],
       [7.1000e+01, 7.1000e+01, 1.0000e+01, 1.3000e+01, 5.1000e+01,
        1.5758e+04]])

In [15]:
np.random.seed(0)
random.seed(0)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(11031, 6) (11031, 5)
(4728, 6) (4728, 5)


In [19]:
hidden_layers = (150, 100, 50)
activation = 'relu'
lri = 0.0001
max_iter = 10000

rna_model = MLPRegressor(hidden_layer_sizes=hidden_layers, activation=activation, learning_rate_init=lri, max_iter=max_iter, verbose=True, random_state=0)

# x_train values to integers
x_train = x_train.astype('int')
rna_model.fit(x_train, y_train)

Iteration 1, loss = 94060086803437.00000000
Iteration 2, loss = 83271085030.51507568
Iteration 3, loss = 449529751.21972388
Iteration 4, loss = 993929.59891288
Iteration 5, loss = 8700.15393985
Iteration 6, loss = 1456.92624406
Iteration 7, loss = 1368.15852498
Iteration 8, loss = 1589.48064559
Iteration 9, loss = 1457.78562290
Iteration 10, loss = 1534.50742905
Iteration 11, loss = 41995.88027450
Iteration 12, loss = 1726.28286503
Iteration 13, loss = 2988.33396122
Iteration 14, loss = 1741.27694339
Iteration 15, loss = 1527.34181378
Iteration 16, loss = 8409.59977880
Iteration 17, loss = 1612.46940610
Iteration 18, loss = 1626.70810306
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPRegressor(hidden_layer_sizes=(150, 100, 50), learning_rate_init=0.0001,
             max_iter=10000, random_state=0, verbose=True)

In [20]:
# x_train values to integers
x_test = x_test.astype('int')
y_pred = rna_model.predict(x_test)
y_pred

array([[ 0.36568159, -1.44848795,  0.06034415,  1.29214758,  2.67393224],
       [ 1.22585479, -0.11133228, -0.4282549 , -0.65651212, -2.12956564],
       [ 0.51177283, -0.45518231,  0.17877258,  0.45667894,  1.40218167],
       ...,
       [ 1.76428824, -1.1548213 , -0.23664896,  1.70803974,  2.20628479],
       [ 1.02636718, -0.1288084 ,  2.76433354,  1.22958713,  0.82441109],
       [ 1.65447961, -0.32816867, -0.18535578, -0.11013434, -0.89857897]])

In [21]:
accuracy = mean_squared_error(y_test, y_pred)
precision = r2_score(y_test, y_pred)
score = rna_model.score(x_test, y_test)
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Score: ', score)

Accuracy:  3312.952406602224
Precision:  -27796.269418432166
Score:  -27796.269418432166
