<a href="https://colab.research.google.com/github/lansotto/project4/blob/main/VideoGame_Optimization_50k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preprocessing



In [1]:
!pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.3.5 kt-legacy-1.0.5


In [2]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from google.colab import files
import tensorflow as tf
import pandas as pd 

In [3]:
# Import and read the charity_data.csv.
videogame_df = pd.read_csv("final_videoGameSales_df.csv")
videogame_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006-01-01,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985-01-01,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008-01-01,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009-01-01,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996-01-01,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [4]:
# drop columns
videogame_df_dropped = videogame_df.drop(columns=['Rank', 'Name', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'])
videogame_df_dropped

Unnamed: 0,Platform,Year,Genre,Publisher,Global_Sales
0,Wii,2006-01-01,Sports,Nintendo,82.74
1,NES,1985-01-01,Platform,Nintendo,40.24
2,Wii,2008-01-01,Racing,Nintendo,35.82
3,Wii,2009-01-01,Sports,Nintendo,33.00
4,GB,1996-01-01,Role-Playing,Nintendo,31.37
...,...,...,...,...,...
16286,GBA,2002-01-01,Platform,Kemco,0.01
16287,GC,2003-01-01,Shooter,Infogrames,0.01
16288,PS2,2008-01-01,Racing,Activision,0.01
16289,DS,2010-01-01,Puzzle,7G//AMES,0.01


In [5]:
# Determine the number of unique values in each column.
unique_cols = videogame_df_dropped.nunique()
unique_cols

Platform         31
Year             39
Genre            12
Publisher       576
Global_Sales    621
dtype: int64

In [6]:
# Look at publisher counts for binning
publisher_val_counts = videogame_df_dropped['Publisher'].value_counts()
publisher_val_counts

Electronic Arts                 1339
Activision                       966
Namco Bandai Games               928
Ubisoft                          918
Konami Digital Entertainment     823
                                ... 
Detn8 Games                        1
Pow                                1
Navarre Corp                       1
MediaQuest                         1
UIG Entertainment                  1
Name: Publisher, Length: 576, dtype: int64

In [7]:
# Choose a cutoff value and create a list of names_replace to be replaced
# use the variable name names_replace
publisher_replace = list(publisher_val_counts[publisher_val_counts<100].index)
publisher_replace

# Replace in dataframe
for name in publisher_replace:
    videogame_df_dropped['Publisher'] = videogame_df_dropped['Publisher'].replace(name,"Other")

# Check to make sure binning was successful
videogame_df_dropped['Publisher'].value_counts()

Other                                     4452
Electronic Arts                           1339
Activision                                 966
Namco Bandai Games                         928
Ubisoft                                    918
Konami Digital Entertainment               823
THQ                                        712
Nintendo                                   696
Sony Computer Entertainment                682
Sega                                       632
Take-Two Interactive                       412
Capcom                                     376
Atari                                      347
Tecmo Koei                                 338
Square Enix                                231
Warner Bros. Interactive Entertainment     217
Disney Interactive Studios                 214
Midway Games                               196
Eidos Interactive                          196
505 Games                                  192
Microsoft Game Studios                     189
Acclaim Enter

In [8]:
videogame_df_dropped

Unnamed: 0,Platform,Year,Genre,Publisher,Global_Sales
0,Wii,2006-01-01,Sports,Nintendo,82.74
1,NES,1985-01-01,Platform,Nintendo,40.24
2,Wii,2008-01-01,Racing,Nintendo,35.82
3,Wii,2009-01-01,Sports,Nintendo,33.00
4,GB,1996-01-01,Role-Playing,Nintendo,31.37
...,...,...,...,...,...
16286,GBA,2002-01-01,Platform,Other,0.01
16287,GC,2003-01-01,Shooter,Other,0.01
16288,PS2,2008-01-01,Racing,Activision,0.01
16289,DS,2010-01-01,Puzzle,Other,0.01


In [9]:
# Look at Sales counts for binning
sales_val_counts = videogame_df_dropped['Global_Sales'].value_counts()
sales_val_counts

0.02    1045
0.03     795
0.04     634
0.05     624
0.01     600
        ... 
4.98       1
5.01       1
5.05       1
5.07       1
3.16       1
Name: Global_Sales, Length: 621, dtype: int64

In [11]:
# New column that identifies if a game sales is greater than $50 000 in a given region
videogame_df_dropped.loc[videogame_df_dropped['Global_Sales'] <= 0.05, 'hit_or_miss'] = 0 
videogame_df_dropped.loc[videogame_df_dropped['Global_Sales'] > 0.05, 'hit_or_miss'] = 1
videogame_df_dropped

Unnamed: 0,Platform,Year,Genre,Publisher,Global_Sales,hit_or_miss
0,Wii,2006-01-01,Sports,Nintendo,82.74,1.0
1,NES,1985-01-01,Platform,Nintendo,40.24,1.0
2,Wii,2008-01-01,Racing,Nintendo,35.82,1.0
3,Wii,2009-01-01,Sports,Nintendo,33.00,1.0
4,GB,1996-01-01,Role-Playing,Nintendo,31.37,1.0
...,...,...,...,...,...,...
16286,GBA,2002-01-01,Platform,Other,0.01,0.0
16287,GC,2003-01-01,Shooter,Other,0.01,0.0
16288,PS2,2008-01-01,Racing,Activision,0.01,0.0
16289,DS,2010-01-01,Puzzle,Other,0.01,0.0


In [12]:
# Convert categorical data to numeric with `pd.get_dummies`
final_videogame_df = pd.get_dummies(videogame_df_dropped)
final_videogame_df

Unnamed: 0,Global_Sales,hit_or_miss,Platform_2600,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_GB,Platform_GBA,Platform_GC,...,Publisher_Sony Computer Entertainment,Publisher_Square Enix,Publisher_THQ,Publisher_Take-Two Interactive,Publisher_Tecmo Koei,Publisher_Ubisoft,Publisher_Unknown,Publisher_Vivendi Games,Publisher_Warner Bros. Interactive Entertainment,Publisher_Zoo Digital Publishing
0,82.74,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,40.24,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,35.82,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,33.00,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,31.37,1.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16286,0.01,0.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
16287,0.01,0.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
16288,0.01,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16289,0.01,0.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Compile, Train and Evaluate the Model

In [13]:
# Split our preprocessed data into our features and target arrays
X = final_videogame_df.drop(['hit_or_miss'], axis=1)
y = final_videogame_df['hit_or_miss']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

In [14]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
# Obtain the training shape
X_train_scaled.shape

(12218, 113)

In [16]:
# Obtain the training input features
input_features = X_train_scaled.shape[1]

## Automated Neural Network Optimization

In [17]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model_op2 = tf.keras.models.Sequential()

    # Allow keras-tuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    
    # Allow keras-tuner to decide number of neurons in first layer
    nn_model_op2.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value= 90,
        step=5), activation=activation, input_dim=input_features))

    # Allow keras-tuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 3)):
        nn_model_op2.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=30,
            step=5),
            activation=activation))
    
    nn_model_op2.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model_op2.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model_op2

In [18]:
# Import the keras-tuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=50,
    hyperband_iterations=2)

In [19]:
# Run the keras-tuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=50,validation_data=(X_test_scaled,y_test))

Trial 180 Complete [00h 01m 23s]
val_accuracy: 0.9467223286628723

Best val_accuracy So Far: 0.9543334245681763
Total elapsed time: 00h 58m 04s


In [20]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters()[0]
best_hyper.values

{'activation': 'sigmoid',
 'first_units': 11,
 'num_layers': 1,
 'units_0': 11,
 'units_1': 11,
 'units_2': 6,
 'tuner/epochs': 50,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

In [21]:
# Evaluate best model against full test data
best_model = tuner.get_best_models()[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

128/128 - 0s - loss: 0.1278 - accuracy: 0.9543 - 386ms/epoch - 3ms/step
Loss: 0.1278228610754013, Accuracy: 0.9543334245681763


In [22]:
# Export and save our model to HDF5 file
best_model.save('/content/Videogame_Optimization_50.h5')
files.download('/content/Videogame_Optimization_50.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>