In [2]:
# Import the modules
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pickle
import sqlite3


## 1. Split the Data into Training and Testing Sets

### 1.1 Read the preprocessed Pokemon data from the `Resources` folder into a Pandas DataFrame.

In [20]:

# Open a connection from this Python program to the SQLite database file
cnx = sqlite3.connect('Resources/pokemon.sqlite')

# Query against sqlite_master that returns the name of every table
sql_query = """SELECT name FROM sqlite_master
WHERE type='table';"""

# Create a cursor on the connection and run the query through it
cursor = cnx.cursor()
cursor.execute(sql_query)

# Print a heading followed by the list of table names
print("List of tables:")
print(cursor.fetchall())


List of tables:
[('pokemon',)]


In [21]:
# Load every row of the `pokemon` table into a pandas DataFrame,
# reusing the open SQLite connection `cnx`
df = pd.read_sql_query(
    sql="SELECT * FROM pokemon",
    con=cnx,
)

# Preview the first rows of the DataFrame
df.head()

Unnamed: 0,_ID,Number_First,First_Name,Type_1_First,Type_2_First,HP_First,Attack_First,Defense_First,Sp_Atk_First,Sp_Def_First,...,Attack_Second,Defense_Second,Sp_Atk_Second,Sp_Def_Second,Speed_Second,Generation_Second,Legendary_Second,Tier_Second,Battle_number,Did_the_first_pokemon_win
0,65,372,Barboach,Water,Ground,50,48,43,46,41,...,49,49,65,65,45,1,0,LC,1,1
1,42104,1,Bulbasaur,Grass,Poison,45,49,49,65,65,...,48,43,46,41,60,3,0,LC,1,0
2,79,462,Mothim,Bug,Flying,70,94,50,94,50,...,49,49,65,65,45,1,0,LC,2,1
3,51724,1,Bulbasaur,Grass,Poison,45,49,49,65,65,...,94,50,94,50,66,4,0,Untiered,2,0
4,0,6,Charmeleon,Fire,,58,64,58,80,65,...,49,49,65,65,45,1,0,LC,3,1


In [22]:
# Close the connection to the SQLite database; the `pokemon` table has
# already been read into the `df` DataFrame
cnx.close()

In [23]:
# Display the column labels of the DataFrame
df.columns

Index(['_ID', 'Number_First', 'First_Name', 'Type_1_First', 'Type_2_First',
       'HP_First', 'Attack_First', 'Defense_First', 'Sp_Atk_First',
       'Sp_Def_First', 'Speed_First', 'Generation_First', 'Legendary_First',
       'Tier_First', 'Number_Second', 'Second_Name', 'Type_1_Second',
       'Type_2_Second', 'HP_Second', 'Attack_Second', 'Defense_Second',
       'Sp_Atk_Second', 'Sp_Def_Second', 'Speed_Second', 'Generation_Second',
       'Legendary_Second', 'Tier_Second', 'Battle_number',
       'Did_the_first_pokemon_win'],
      dtype='object')

In [24]:
# Drop the columns that are not useful for the ML model.
# NOTE: the drop is in place, so re-running this cell without reloading `df`
# raises a KeyError because the columns are already gone.
columns_to_drop = [
    '_ID',
    'Number_First',
    'First_Name',
    'Number_Second',
    'Second_Name',
    'Battle_number',
]
df.drop(columns=columns_to_drop, inplace=True)

# Preview the remaining columns
df.head()

Unnamed: 0,Type_1_First,Type_2_First,HP_First,Attack_First,Defense_First,Sp_Atk_First,Sp_Def_First,Speed_First,Generation_First,Legendary_First,...,HP_Second,Attack_Second,Defense_Second,Sp_Atk_Second,Sp_Def_Second,Speed_Second,Generation_Second,Legendary_Second,Tier_Second,Did_the_first_pokemon_win
0,Water,Ground,50,48,43,46,41,60,3,0,...,45,49,49,65,65,45,1,0,LC,1
1,Grass,Poison,45,49,49,65,65,45,1,0,...,50,48,43,46,41,60,3,0,LC,0
2,Bug,Flying,70,94,50,94,50,66,4,0,...,45,49,49,65,65,45,1,0,LC,1
3,Grass,Poison,45,49,49,65,65,45,1,0,...,70,94,50,94,50,66,4,0,Untiered,0
4,Fire,,58,64,58,80,65,80,1,0,...,45,49,49,65,65,45,1,0,LC,1


In [25]:
# One-hot encode the categorical columns of both Pokemon.
# drop_first=True leaves out the first level of every encoded column.
first_pokemon_categoricals = [
    'Type_1_First', 'Type_2_First', 'Generation_First', 'Legendary_First', 'Tier_First',
]
second_pokemon_categoricals = [
    'Type_1_Second', 'Type_2_Second', 'Generation_Second', 'Legendary_Second', 'Tier_Second',
]
df = pd.get_dummies(
    data=df,
    columns=first_pokemon_categoricals + second_pokemon_categoricals,
    drop_first=True,
)

# Display sample data
df.head()

Unnamed: 0,HP_First,Attack_First,Defense_First,Sp_Atk_First,Sp_Def_First,Speed_First,HP_Second,Attack_Second,Defense_Second,Sp_Atk_Second,...,Tier_Second_NUBL,Tier_Second_OU,Tier_Second_PU,Tier_Second_PUBL,Tier_Second_RU,Tier_Second_RUBL,Tier_Second_UU,Tier_Second_UUBL,Tier_Second_Uber,Tier_Second_Untiered
0,50,48,43,46,41,60,45,49,49,65,...,0,0,0,0,0,0,0,0,0,0
1,45,49,49,65,65,45,50,48,43,46,...,0,0,0,0,0,0,0,0,0,0
2,70,94,50,94,50,66,45,49,49,65,...,0,0,0,0,0,0,0,0,0,0
3,45,49,49,65,65,45,70,94,50,94,...,0,0,0,0,0,0,0,0,0,1
4,58,64,58,80,65,80,45,49,49,65,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# List every column label after encoding; the original categorical columns
# were replaced by their dummy columns automatically
df.columns.tolist()

['HP_First',
 'Attack_First',
 'Defense_First',
 'Sp_Atk_First',
 'Sp_Def_First',
 'Speed_First',
 'HP_Second',
 'Attack_Second',
 'Defense_Second',
 'Sp_Atk_Second',
 'Sp_Def_Second',
 'Speed_Second',
 'Did_the_first_pokemon_win',
 'Type_1_First_Dark',
 'Type_1_First_Dragon',
 'Type_1_First_Electric',
 'Type_1_First_Fairy',
 'Type_1_First_Fighting',
 'Type_1_First_Fire',
 'Type_1_First_Flying',
 'Type_1_First_Ghost',
 'Type_1_First_Grass',
 'Type_1_First_Ground',
 'Type_1_First_Ice',
 'Type_1_First_Normal',
 'Type_1_First_Poison',
 'Type_1_First_Psychic',
 'Type_1_First_Rock',
 'Type_1_First_Steel',
 'Type_1_First_Water',
 'Type_2_First_Dark',
 'Type_2_First_Dragon',
 'Type_2_First_Electric',
 'Type_2_First_Fairy',
 'Type_2_First_Fighting',
 'Type_2_First_Fire',
 'Type_2_First_Flying',
 'Type_2_First_Ghost',
 'Type_2_First_Grass',
 'Type_2_First_Ground',
 'Type_2_First_Ice',
 'Type_2_First_Normal',
 'Type_2_First_Poison',
 'Type_2_First_Psychic',
 'Type_2_First_Rock',
 'Type_2_First_S

### 1.2 Create Target and Features
- Create the target (`y`) from the `Did_the_first_pokemon_win` column.
- Create the features (`X`) DataFrame from the remaining columns.

In [11]:
# Target (y): the `Did_the_first_pokemon_win` column
y = df['Did_the_first_pokemon_win']

# Features (X): every other column. `drop` returns a new DataFrame,
# so `df` itself is left unchanged.
X = df.drop(columns='Did_the_first_pokemon_win')

In [13]:
# Verify the target by previewing its first rows
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Did_the_first_pokemon_win, dtype: int64

In [12]:
# Review the features DataFrame by previewing its first rows
X.head()

Unnamed: 0,HP_First,Attack_First,Defense_First,Sp_Atk_First,Sp_Def_First,Speed_First,HP_Second,Attack_Second,Defense_Second,Sp_Atk_Second,...,Tier_Second_NUBL,Tier_Second_OU,Tier_Second_PU,Tier_Second_PUBL,Tier_Second_RU,Tier_Second_RUBL,Tier_Second_UU,Tier_Second_UUBL,Tier_Second_Uber,Tier_Second_Untiered
0,50,48,43,46,41,60,45,49,49,65,...,0,0,0,0,0,0,0,0,0,0
1,45,49,49,65,65,45,50,48,43,46,...,0,0,0,0,0,0,0,0,0,0
2,70,94,50,94,50,66,45,49,49,65,...,0,0,0,0,0,0,0,0,0,0
3,45,49,49,65,65,45,70,94,50,94,...,0,0,0,0,0,0,0,0,0,1
4,58,64,58,80,65,80,45,49,49,65,...,0,0,0,0,0,0,0,0,0,0


### 1.3 Check the balance of the target variable (`y`) by using the `value_counts` function.

In [14]:
# Check the balance of our target values
y.value_counts()

# The two classes are evenly balanced, as expected,
# so we do not need to over- or undersample

1    43362
0    43362
Name: Did_the_first_pokemon_win, dtype: int64

### 1.4 Split the data into training and testing datasets by using `train_test_split`.


In [17]:
# Split features and target into training and testing sets.
# random_state=1 makes the split reproducible; stratify=y keeps the class
# proportions of y the same in both resulting sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

### 1.5 Scale the data

Use the `StandardScaler` to scale the features data; remember that only the `X_train` and `X_test` DataFrames should be scaled.

In [18]:
# Instantiate the scaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [None]:
#TODO: save the fitted scaler to a file (e.g. with pickle) for Flask to pick up

## 2. Create a Logistic Regression Model with the Original Data

###  2.1 Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [None]:
# Build the logistic regression model; random_state=1 keeps runs reproducible
classifier = LogisticRegression(
    solver='lbfgs',
    random_state=1,
)

# Train the model on the training split.
# NOTE(review): the fit uses the unscaled X_train; the X_train_scaled /
# X_test_scaled arrays computed in section 1.5 are not used here — confirm
# this is intended.
classifier.fit(X_train, y_train)

In [None]:
# Validate the model with the training data.
# `score` returns the mean accuracy on the data and targets passed in
# (here: the training split).

classifier.score(X_train, y_train)

In [None]:
# Validate the model with the testing data.
# `score` returns the mean accuracy on the data and targets passed in
# (here: the held-out testing split).
classifier.score(X_test, y_test)

### 2.2 Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [None]:
# TODO: predict labels for X_test with the fitted classifier and save them

### 2.3 Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# TODO: compute the accuracy score, confusion matrix, and classification report

## 3. Save and Reload the Model using Pickle

In [None]:
# Reference
# https://www.geeksforgeeks.org/saving-a-machine-learning-model/

# Save the fitted model with pickle.
# `classifier` is the LogisticRegression model fitted in section 2.1 (the
# previous code referenced an undefined name `model`).
# pickle.dumps returns the serialized object as bytes and does not take a
# file argument, so the bytes are written to disk explicitly; the `with`
# block guarantees the file is flushed and closed.
saved_model = pickle.dumps(classifier)
with open('model.pkl', 'wb') as model_file:
    model_file.write(saved_model)

In [None]:
# Load the pickled model back from disk.
# pickle.load reads from an open binary file (pickle.loads expects a bytes
# object, not a file); the `with` block closes the file afterwards.
# Only unpickle files you trust: unpickling can execute arbitrary code.
with open('model.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

# Use the reloaded model to make predictions on the testing features
pickled_model.predict(X_test)