<a href="https://colab.research.google.com/github/lansotto/project4/blob/main/VideoGame_Optimization_2nd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preprocessing



In [1]:
!pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.3.5 kt-legacy-1.0.5


In [2]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from google.colab import files
import tensorflow as tf
import pandas as pd 

In [3]:
# Import and read the charity_data.csv.
videogame_df = pd.read_csv("video_game_sales.csv")
videogame_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [4]:
videogame_df_renamed_columns = videogame_df.rename(columns = {'NA_Sales': 'North America', 'EU_Sales':'EU', 'JP_Sales':'Japan', 'Other_Sales':'Other', 'Global_Sales':'Global'})
videogame_df_renamed_columns

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,North America,EU,Japan,Other,Global
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [5]:
# drop columns
videogame_df_dropped = videogame_df_renamed_columns.drop(columns=['Rank', 'Name', 'Global'])

In [6]:
# Unpivot Sales Columns
videogame_df_unpivoted = pd.melt(videogame_df_dropped, id_vars=['Platform','Year','Genre','Publisher'], var_name = 'Region', value_name = 'Sales')
videogame_df_unpivoted

Unnamed: 0,Platform,Year,Genre,Publisher,Region,Sales
0,Wii,2006.0,Sports,Nintendo,North America,41.49
1,NES,1985.0,Platform,Nintendo,North America,29.08
2,Wii,2008.0,Racing,Nintendo,North America,15.85
3,Wii,2009.0,Sports,Nintendo,North America,15.75
4,GB,1996.0,Role-Playing,Nintendo,North America,11.27
...,...,...,...,...,...,...
66387,GBA,2002.0,Platform,Kemco,Other,0.00
66388,GC,2003.0,Shooter,Infogrames,Other,0.00
66389,PS2,2008.0,Racing,Activision,Other,0.00
66390,DS,2010.0,Puzzle,7G//AMES,Other,0.00


In [7]:
# Determine the number of unique values in each column.
unique_cols = videogame_df_unpivoted.nunique()
unique_cols

Platform      31
Year          39
Genre         12
Publisher    578
Region         4
Sales        495
dtype: int64

In [8]:
# Look at publisher counts for binning
publisher_val_counts = videogame_df_unpivoted['Publisher'].value_counts()
publisher_val_counts

Electronic Arts                 5404
Activision                      3900
Namco Bandai Games              3728
Ubisoft                         3684
Konami Digital Entertainment    3328
                                ... 
Warp                               4
New                                4
Elite                              4
Evolution Games                    4
UIG Entertainment                  4
Name: Publisher, Length: 578, dtype: int64

In [9]:
# Choose a cutoff value and create a list of names_replace to be replaced
# use the variable name names_replace
publisher_replace = list(publisher_val_counts[publisher_val_counts<100].index)
publisher_replace

# Replace in dataframe
for name in publisher_replace:
    videogame_df_unpivoted['Publisher'] = videogame_df_unpivoted['Publisher'].replace(name,"Other")

# Check to make sure binning was successful
videogame_df_unpivoted['Publisher'].value_counts()

Other                 9308
Electronic Arts       5404
Activision            3900
Namco Bandai Games    3728
Ubisoft               3684
                      ... 
Prototype              108
Avanquest              104
Arc System Works       104
Little Orbit           104
Telltale Games         100
Name: Publisher, Length: 77, dtype: int64

In [10]:
videogame_df_unpivoted

Unnamed: 0,Platform,Year,Genre,Publisher,Region,Sales
0,Wii,2006.0,Sports,Nintendo,North America,41.49
1,NES,1985.0,Platform,Nintendo,North America,29.08
2,Wii,2008.0,Racing,Nintendo,North America,15.85
3,Wii,2009.0,Sports,Nintendo,North America,15.75
4,GB,1996.0,Role-Playing,Nintendo,North America,11.27
...,...,...,...,...,...,...
66387,GBA,2002.0,Platform,Other,Other,0.00
66388,GC,2003.0,Shooter,Infogrames,Other,0.00
66389,PS2,2008.0,Racing,Activision,Other,0.00
66390,DS,2010.0,Puzzle,Other,Other,0.00


In [11]:
# Look at APPLICATION TYPE counts for binning
sales_val_counts = videogame_df_unpivoted['Sales'].value_counts()
sales_val_counts

0.00    27161
0.01     6186
0.02     4129
0.03     2934
0.04     2336
        ...  
3.23        1
3.50        1
4.15        1
3.25        1
7.53        1
Name: Sales, Length: 495, dtype: int64

In [12]:
# New column that identifies if a game sales is greater than $50 000 in a given region
videogame_df_unpivoted.loc[videogame_df_unpivoted['Sales'] <= 0.02, 'equal_or_lower_than_20000'] = 0 
videogame_df_unpivoted.loc[videogame_df_unpivoted['Sales'] > 0.02, 'equal_or_lower_than_20000'] = 1
videogame_df_unpivoted

Unnamed: 0,Platform,Year,Genre,Publisher,Region,Sales,equal_or_lower_than_20000
0,Wii,2006.0,Sports,Nintendo,North America,41.49,1.0
1,NES,1985.0,Platform,Nintendo,North America,29.08,1.0
2,Wii,2008.0,Racing,Nintendo,North America,15.85,1.0
3,Wii,2009.0,Sports,Nintendo,North America,15.75,1.0
4,GB,1996.0,Role-Playing,Nintendo,North America,11.27,1.0
...,...,...,...,...,...,...,...
66387,GBA,2002.0,Platform,Other,Other,0.00,0.0
66388,GC,2003.0,Shooter,Infogrames,Other,0.00,0.0
66389,PS2,2008.0,Racing,Activision,Other,0.00,0.0
66390,DS,2010.0,Puzzle,Other,Other,0.00,0.0


In [13]:
# Convert categorical data to numeric with `pd.get_dummies`
final_videogame_df = pd.get_dummies(videogame_df_unpivoted)
final_videogame_df

Unnamed: 0,Year,Sales,equal_or_lower_than_20000,Platform_2600,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_GB,Platform_GBA,...,Publisher_Unknown,Publisher_Virgin Interactive,Publisher_Vivendi Games,Publisher_Warner Bros. Interactive Entertainment,Publisher_Zoo Digital Publishing,Publisher_Zoo Games,Region_EU,Region_Japan,Region_North America,Region_Other
0,2006.0,41.49,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1985.0,29.08,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2008.0,15.85,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2009.0,15.75,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1996.0,11.27,1.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66387,2002.0,0.00,0.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
66388,2003.0,0.00,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
66389,2008.0,0.00,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
66390,2010.0,0.00,0.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


## Compile, Train and Evaluate the Model

In [14]:
# Split our preprocessed data into our features and target arrays
X = final_videogame_df.drop(['equal_or_lower_than_20000'], axis=1)
y = final_videogame_df['equal_or_lower_than_20000']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [15]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# Obtain the training shape
X_train_scaled.shape

(49794, 126)

In [17]:
# Obtain the training input features
input_features = X_train_scaled.shape[1]

## Automated Neural Network Optimization

In [18]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model_op2 = tf.keras.models.Sequential()

    # Allow keras-tuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    
    # Allow keras-tuner to decide number of neurons in first layer
    nn_model_op2.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value= 90,
        step=5), activation=activation, input_dim=input_features))

    # Allow keras-tuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model_op2.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=30,
            step=5),
            activation=activation))
    
    nn_model_op2.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model_op2.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model_op2

In [19]:
# Import the keras-tuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=50,
    hyperband_iterations=2)

In [20]:
# Run the keras-tuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=50,validation_data=(X_test_scaled,y_test))

Trial 180 Complete [00h 04m 23s]
val_accuracy: 0.5604892373085022

Best val_accuracy So Far: 0.5604892373085022
Total elapsed time: 02h 52m 00s


In [21]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters()[0]
best_hyper.values

{'activation': 'sigmoid',
 'first_units': 66,
 'num_layers': 6,
 'units_0': 6,
 'tuner/epochs': 2,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 3,
 'tuner/round': 0,
 'units_1': 1,
 'units_2': 1,
 'units_3': 1,
 'units_4': 1,
 'units_5': 1}

In [22]:
# Evaluate best model against full test data
best_model = tuner.get_best_models()[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

519/519 - 2s - loss: nan - accuracy: 0.5605 - 2s/epoch - 3ms/step
Loss: nan, Accuracy: 0.5604892373085022


In [23]:
# Export and save our model to HDF5 file
best_model.save('/content/Videogame_Optimization_2.h5')
files.download('/content/Videogame_Optimization_2.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>