In [1]:
# Import the modules
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle
import sqlite3


## 1. Split the Data into Training and Testing Sets

### 1.1 Read the preproccessed Pokemon data from the `Resources` folder into a Pandas DataFrame.

In [2]:

# Making a connection between sqlite3
# database and Python Program
cnx = sqlite3.connect('../Resources/pokemon.sqlite')
     

# Getting all tables from sqlite_master
sql_query = """SELECT name FROM sqlite_master
WHERE type='table';"""
 
# Creating cursor object using connection object
cursor = cnx.cursor()
     
# executing our sql query
cursor.execute(sql_query)
print("List of tables:")
     
# printing all tables list
print(cursor.fetchall())


List of tables:
[('pokemon',)]


In [3]:
#Turn the table into a pandas dataframe
df = pd.read_sql_query("SELECT * FROM pokemon", cnx)

# Review the DataFrame
df.head()

Unnamed: 0,_ID,Number_First,First_Name,Type_1_First,Type_2_First,HP_First,Attack_First,Defense_First,Sp_Atk_First,Sp_Def_First,...,Attack_Second,Defense_Second,Sp_Atk_Second,Sp_Def_Second,Speed_Second,Generation_Second,Legendary_Second,Tier_Second,Battle_number,Did_the_first_pokemon_win
0,65,372,Barboach,Water,Ground,50,48,43,46,41,...,49,49,65,65,45,1,0,LC,1,1
1,42104,1,Bulbasaur,Grass,Poison,45,49,49,65,65,...,48,43,46,41,60,3,0,LC,1,0
2,79,462,Mothim,Bug,Flying,70,94,50,94,50,...,49,49,65,65,45,1,0,LC,2,1
3,51724,1,Bulbasaur,Grass,Poison,45,49,49,65,65,...,94,50,94,50,66,4,0,Untiered,2,0
4,0,6,Charmeleon,Fire,,58,64,58,80,65,...,49,49,65,65,45,1,0,LC,3,1


In [4]:
#Close the connection to the sqlite db
cnx.close()

### 1.2 Clean the data frame by dropping columns we don't need for machine learning

In [5]:
#print columns
df.columns

Index(['_ID', 'Number_First', 'First_Name', 'Type_1_First', 'Type_2_First',
       'HP_First', 'Attack_First', 'Defense_First', 'Sp_Atk_First',
       'Sp_Def_First', 'Speed_First', 'Generation_First', 'Legendary_First',
       'Tier_First', 'Number_Second', 'Second_Name', 'Type_1_Second',
       'Type_2_Second', 'HP_Second', 'Attack_Second', 'Defense_Second',
       'Sp_Atk_Second', 'Sp_Def_Second', 'Speed_Second', 'Generation_Second',
       'Legendary_Second', 'Tier_Second', 'Battle_number',
       'Did_the_first_pokemon_win'],
      dtype='object')

In [6]:
#drop the columns that are not useful for the ML mdoel
df.drop(['_ID', 'Number_First', 'First_Name', 'Number_Second', 'Second_Name', 'Battle_number'], axis=1, inplace=True)

df.head()

Unnamed: 0,Type_1_First,Type_2_First,HP_First,Attack_First,Defense_First,Sp_Atk_First,Sp_Def_First,Speed_First,Generation_First,Legendary_First,...,HP_Second,Attack_Second,Defense_Second,Sp_Atk_Second,Sp_Def_Second,Speed_Second,Generation_Second,Legendary_Second,Tier_Second,Did_the_first_pokemon_win
0,Water,Ground,50,48,43,46,41,60,3,0,...,45,49,49,65,65,45,1,0,LC,1
1,Grass,Poison,45,49,49,65,65,45,1,0,...,50,48,43,46,41,60,3,0,LC,0
2,Bug,Flying,70,94,50,94,50,66,4,0,...,45,49,49,65,65,45,1,0,LC,1
3,Grass,Poison,45,49,49,65,65,45,1,0,...,70,94,50,94,50,66,4,0,Untiered,0
4,Fire,,58,64,58,80,65,80,1,0,...,45,49,49,65,65,45,1,0,LC,1


### 1.3 Create dummies for non-numeric columns
We use drop_first to avoid multicollinearity.

In [7]:
#Create dummies
df = pd.get_dummies(data=df, columns=[
    'Type_1_First','Type_2_First','Generation_First','Legendary_First','Tier_First',
    'Type_1_Second','Type_2_Second','Generation_Second','Legendary_Second', 'Tier_Second'], drop_first=True)

# Display sample data
df.head()

Unnamed: 0,HP_First,Attack_First,Defense_First,Sp_Atk_First,Sp_Def_First,Speed_First,HP_Second,Attack_Second,Defense_Second,Sp_Atk_Second,...,Tier_Second_NUBL,Tier_Second_OU,Tier_Second_PU,Tier_Second_PUBL,Tier_Second_RU,Tier_Second_RUBL,Tier_Second_UU,Tier_Second_UUBL,Tier_Second_Uber,Tier_Second_Untiered
0,50,48,43,46,41,60,45,49,49,65,...,0,0,0,0,0,0,0,0,0,0
1,45,49,49,65,65,45,50,48,43,46,...,0,0,0,0,0,0,0,0,0,0
2,70,94,50,94,50,66,45,49,49,65,...,0,0,0,0,0,0,0,0,0,0
3,45,49,49,65,65,45,70,94,50,94,...,0,0,0,0,0,0,0,0,0,1
4,58,64,58,80,65,80,45,49,49,65,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#print out all the columns to verify which ones exist
df.columns.values.tolist()

#The original columns for the dummies have been dropped automatically

['HP_First',
 'Attack_First',
 'Defense_First',
 'Sp_Atk_First',
 'Sp_Def_First',
 'Speed_First',
 'HP_Second',
 'Attack_Second',
 'Defense_Second',
 'Sp_Atk_Second',
 'Sp_Def_Second',
 'Speed_Second',
 'Did_the_first_pokemon_win',
 'Type_1_First_Dark',
 'Type_1_First_Dragon',
 'Type_1_First_Electric',
 'Type_1_First_Fairy',
 'Type_1_First_Fighting',
 'Type_1_First_Fire',
 'Type_1_First_Flying',
 'Type_1_First_Ghost',
 'Type_1_First_Grass',
 'Type_1_First_Ground',
 'Type_1_First_Ice',
 'Type_1_First_Normal',
 'Type_1_First_Poison',
 'Type_1_First_Psychic',
 'Type_1_First_Rock',
 'Type_1_First_Steel',
 'Type_1_First_Water',
 'Type_2_First_Dark',
 'Type_2_First_Dragon',
 'Type_2_First_Electric',
 'Type_2_First_Fairy',
 'Type_2_First_Fighting',
 'Type_2_First_Fire',
 'Type_2_First_Flying',
 'Type_2_First_Ghost',
 'Type_2_First_Grass',
 'Type_2_First_Ground',
 'Type_2_First_Ice',
 'Type_2_First_Normal',
 'Type_2_First_Poison',
 'Type_2_First_Psychic',
 'Type_2_First_Rock',
 'Type_2_First_S

### 1.4 Create Target and Features
- Create the target (`y`)  from the “Did_the_first_pokemon_win?” column
- create the features (`X`) DataFrame from the remaining columns.

In [9]:
# Separate the y variable, the target
y = df['Did_the_first_pokemon_win']

# Separate the X variable, the features
X = df.copy()
X.drop('Did_the_first_pokemon_win', axis=1, inplace=True)

In [10]:
#verify the features
X.columns.values.tolist()


['HP_First',
 'Attack_First',
 'Defense_First',
 'Sp_Atk_First',
 'Sp_Def_First',
 'Speed_First',
 'HP_Second',
 'Attack_Second',
 'Defense_Second',
 'Sp_Atk_Second',
 'Sp_Def_Second',
 'Speed_Second',
 'Type_1_First_Dark',
 'Type_1_First_Dragon',
 'Type_1_First_Electric',
 'Type_1_First_Fairy',
 'Type_1_First_Fighting',
 'Type_1_First_Fire',
 'Type_1_First_Flying',
 'Type_1_First_Ghost',
 'Type_1_First_Grass',
 'Type_1_First_Ground',
 'Type_1_First_Ice',
 'Type_1_First_Normal',
 'Type_1_First_Poison',
 'Type_1_First_Psychic',
 'Type_1_First_Rock',
 'Type_1_First_Steel',
 'Type_1_First_Water',
 'Type_2_First_Dark',
 'Type_2_First_Dragon',
 'Type_2_First_Electric',
 'Type_2_First_Fairy',
 'Type_2_First_Fighting',
 'Type_2_First_Fire',
 'Type_2_First_Flying',
 'Type_2_First_Ghost',
 'Type_2_First_Grass',
 'Type_2_First_Ground',
 'Type_2_First_Ice',
 'Type_2_First_Normal',
 'Type_2_First_Poison',
 'Type_2_First_Psychic',
 'Type_2_First_Rock',
 'Type_2_First_Steel',
 'Type_2_First_Water',


In [11]:
#verify y
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Did_the_first_pokemon_win, dtype: int64

In [12]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,HP_First,Attack_First,Defense_First,Sp_Atk_First,Sp_Def_First,Speed_First,HP_Second,Attack_Second,Defense_Second,Sp_Atk_Second,...,Tier_Second_NUBL,Tier_Second_OU,Tier_Second_PU,Tier_Second_PUBL,Tier_Second_RU,Tier_Second_RUBL,Tier_Second_UU,Tier_Second_UUBL,Tier_Second_Uber,Tier_Second_Untiered
0,50,48,43,46,41,60,45,49,49,65,...,0,0,0,0,0,0,0,0,0,0
1,45,49,49,65,65,45,50,48,43,46,...,0,0,0,0,0,0,0,0,0,0
2,70,94,50,94,50,66,45,49,49,65,...,0,0,0,0,0,0,0,0,0,0
3,45,49,49,65,65,45,70,94,50,94,...,0,0,0,0,0,0,0,0,0,1
4,58,64,58,80,65,80,45,49,49,65,...,0,0,0,0,0,0,0,0,0,0


### 1.5 Check the balance of the target variable (`y`) by using the `value_counts` function.

In [13]:
# Check the balance of our target values
y.value_counts()

#This is evenly balanced, as expected
#we do not need to over- or underfit

1    43362
0    43362
Name: Did_the_first_pokemon_win, dtype: int64

### 1.6 Split the data into training and testing datasets by using `train_test_split`.


In [14]:
# Split the data using train_test_split
# Assign a random_state of 78 to the function

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

### 1.7 Scale the data

Use the `StandardScaler` to scale the features data, remember that only `X_train` and `X_testing` DataFrames should be scaled.

In [15]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [16]:
#view format of scaler results
print('Scaler results:', scaler.transform(X_train)[:1])


Scaler results: [[ 1.26478336  0.70079508  0.85235583  0.96112984  1.06876072  1.1732114
   0.06482748  1.33969138  0.84675935  2.41917616  1.24783046 -1.64260449
  -0.20838894 -0.20013907 -0.22484397 -0.14815375 -0.18876998 -0.26179837
  -0.05223698 -0.17449772 -0.30819079 -0.19834569 -0.1849809  -0.37643879
  -0.19674932  3.76177397 -0.25710682 -0.1921077  -0.41641899 -0.16608506
  -0.14744775 -0.09616605 -0.177377   -0.17829707 -0.11911869 -0.37176581
  -0.11918505  6.75468774 -0.21912491 -0.1234263  -0.07173588 -0.21162125
  -0.20693889 -0.13906461 -0.1676425  -0.1388922   2.45873659 -0.5150776
  -0.41265014 -0.50466573 -0.31687445 -0.24752396 -0.68210149 -0.31971586
  -0.28612625 -0.03804326 -0.26410414 -0.30446764 -0.10354763 -0.26330383
  -0.16258198  3.34643275 -0.153065   -0.18968528 -0.37002246 -0.20710042
  -0.20067858 -0.22679925 -0.14755656 -0.1903801   3.85390022 -0.05194022
  -0.17686917 -0.30977318 -0.19708631 -0.18426852 -0.37906942 -0.19531159
  -0.26755169 -0.2545827

In [17]:
#Save the scaler to an h5 file for Flask to pick up
filename = '../Resources/X_scaler.h5'
with open (filename, 'wb') as file:
    pickle.dump(X_scaler, file)

In [18]:
#save the X_train columns as a variable list and output to h5
X_train_cols = [X_train.columns]

filename = '../Resources/X_train_cols.h5'
with open (filename, 'wb') as file:
    pickle.dump(X_train_cols, file)

## 2. Create a Random Forest Model with the Original Data

###  2.1 Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [19]:
# Create a random forest classifier
model = RandomForestClassifier(n_estimators=50, random_state=78)

# Fit the model using training data
classifier = model.fit(X_train_scaled, y_train)


In [20]:
#Validate the model w/ training data
#Use method 'score' to return the mean accuracy on the given test data and targets.
classifier.score(X_train, y_train)



0.5551096966622081

In [21]:
#Validate the model w/ testing data
#Use method 'score' to return the mean accuracy on the given test data and targets.
classifier.score(X_test, y_test)



0.5466537521332042

In [22]:
filename = '../Resources/model.h5'
with open (filename, 'wb') as file:
    pickle.dump(model, file)

### 2.2 Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [23]:
# Make a prediction using the testing data
test_predictions = classifier.predict(X_test_scaled)

#put predictions into a dataframe
test_predictions_df = pd.DataFrame({'Predictions': test_predictions, 'Actual': y_test})
test_predictions_df.head(10)

Unnamed: 0,Predictions,Actual
54147,0,0
28780,0,0
54237,1,1
85589,0,0
85781,0,0
14257,0,0
75003,0,0
13080,0,0
55206,0,0
70629,0,0


### 2.3 valuate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [24]:
# Print the balanced_accuracy score of the model
acc_score = balanced_accuracy_score(y_test, test_predictions)
acc_score

0.9320319897754277

In [25]:
# Generate a confusion matrix for the model
confuse_matrix = confusion_matrix(y_test, test_predictions)

confuse_matrix_df = pd.DataFrame(confuse_matrix, index=['actual 0', 'actual 1'], columns=['predicted 0', 'predicted 1'])
confuse_matrix_df

Unnamed: 0,predicted 0,predicted 1
actual 0,10111,686
actual 1,788,10096


In [26]:
# Print the classification report for the model
print(f"Confusion Matrix:\n{confuse_matrix_df}\n\n\nAccuracy Score:\n{acc_score}\n\n\nClassification Report:\n{classification_report(y_test, test_predictions)}")


Confusion Matrix:
          predicted 0  predicted 1
actual 0        10111          686
actual 1          788        10096


Accuracy Score:
0.9320319897754277


Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.93     10797
           1       0.94      0.93      0.93     10884

    accuracy                           0.93     21681
   macro avg       0.93      0.93      0.93     21681
weighted avg       0.93      0.93      0.93     21681

