<a href="https://colab.research.google.com/github/jorguzz/pokemon-speed-regression/blob/main/Linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading in data and EDA

In [1]:
!pip install opendatasets



In [2]:
pip install legacy-cgi



In [3]:
import pandas as pd
import opendatasets as od
import os
import numpy as np

In [4]:
od.download('https://www.kaggle.com/datasets/abcsds/pokemon')

Skipping, found downloaded files in "./pokemon" (use force=True to force download)


In [5]:
os.listdir('pokemon')

['Pokemon.csv']

In [6]:
df = pd.read_csv('pokemon/Pokemon.csv')

In [7]:
df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [8]:
df.isnull().sum()

Unnamed: 0,0
#,0
Name,0
Type 1,0
Type 2,386
Total,0
HP,0
Attack,0
Defense,0
Sp. Atk,0
Sp. Def,0


# Building our Linear Regression functions

In [9]:
def compute_cost(X, y, w, b):
  '''
  Computes the cost given our parameters w, b
  Args:
        X (ndarray): Shape (m,) Input to the model
        y (ndarray): Shape (m,) Label
        w, b (scalar): Parameters of the model

    Returns
        cost_function (float): The cost of using w,b as the parameters for linear regression
               to fit the data points in x and y
  '''
  m = X.shape[0]
  total_cost = 0
  for i in range(m):
    f_wb_i = np.dot(X[i], w) + b
    cost_i = (f_wb_i - y[i])**2
    total_cost += cost_i
  cost_function = total_cost/2 * m
  return cost_function

In [32]:
def compute_gradient(X, y, w, b):
  '''
  Computes the gradients of w,b (How much the cost changes with respect to w,b)
  Args:
        X (ndarray): Shape (m, n) Input to model
        y (ndarray): Shape(m, ) Label
        w, b (scalar): Parameters of the model
  Returns
      dj_dw, dj_db (float): The derivative with respect to w and b (How much the cost changes with respect to w,b)
  '''
  m, n = X.shape
  #We create an array of zeros of shape n because we need to find a gradient for each feature
  dj_dw = np.zeros(n)
  dj_db = 0
  for i in range(m):
    f_wb_i = np.dot(X[i], w) + b
    error_i = f_wb_i - y[i]
    for j in range(n):
      dj_dw[j] += error_i * X[i, j]
    dj_db += error_i

  dj_dw = dj_dw / m
  dj_db = dj_db / m
  return dj_dw, dj_db

In [23]:
def gradient_descent(X, y, w_init, b_init, num_iters, alpha):
  '''
  Performs gradient descent given an initial w,b
  Args:
        X (ndarray): Shape (m, n) Input to model
        y (ndarray): Shape(m, ) Label
        w, b (scalar): Parameters of the model
        num_iters (int): How many times gradient descent is run
        alpha (float): Learning rate
  Returns:
    w, b, cost_history
  '''
  w = w_init.copy()
  b = b_init
  cost_history = []

  for i in range(num_iters):
      dj_dw, dj_db = compute_gradient(X.values, y.values.flatten(), w, b)
      w -= alpha * dj_dw
      b -= alpha * dj_db

      if i % 100 == 0:
        cost = compute_cost(X.values, y.values.flatten(), w, b)
        cost_history.append(cost)
        print(f"Iteration {i:4}: Cost {cost:.4f}, w: {w}, b: {b:.4f}")

  return w, b, cost_history

In [24]:
def predict(X, w, b):
  y_hat = np.dot(X, w) + b
  return y_hat

# Using functions to predict a Pokemon's Speed

In [25]:
X = df.iloc[:, 5:9]

In [26]:
Y = df[['Speed']]

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
w_in = np.zeros(X_train.shape[1])

In [52]:
w_final, b_final, cost_history = gradient_descent(X_train, y_train, w_in, 7, 2000, 0.0000008)

Iteration    0: Cost 901902864.4859, w: [0.00351526 0.00412247 0.00357791 0.0039036 ], b: 7.0000
Iteration  100: Cost 173339375.4880, w: [0.15667863 0.18798975 0.15189072 0.18325618], b: 7.0023
Iteration  200: Cost 155541261.6669, w: [0.17687958 0.2186637  0.16040957 0.2205958 ], b: 7.0028
Iteration  300: Cost 153196654.3489, w: [0.17765936 0.22681832 0.1492647  0.23648433], b: 7.0030
Iteration  400: Cost 151403385.2241, w: [0.17562681 0.23156796 0.13597891 0.24859667], b: 7.0032
Iteration  500: Cost 149817324.9261, w: [0.17321337 0.2357106  0.12307082 0.25955719], b: 7.0033
Iteration  600: Cost 148408711.4218, w: [0.17077671 0.23965791 0.11086725 0.26978663], b: 7.0035
Iteration  700: Cost 147157114.8245, w: [0.16837085 0.24347254 0.099377   0.27938112], b: 7.0037
Iteration  800: Cost 146044605.7121, w: [0.1660052  0.24716612 0.08856472 0.28838722], b: 7.0039
Iteration  900: Cost 145055338.6452, w: [0.1636824  0.25074279 0.0783908  0.29684219], b: 7.0041
Iteration 1000: Cost 144175304

In [53]:
print(w_final, b_final)

[ 0.14121388  0.28316998 -0.00078795  0.36179254] 7.0061061822213535


In [54]:
y_predict = predict(X_test, w_final, b_final)

In [55]:
print(y_predict)

[ 42.67033718  54.9440841   44.98845608 104.93402098  40.83679057
  46.90144206  95.89546182  37.59602134  80.12156334  45.66736001
  73.93555314  47.09019956  46.08324129  97.71049252  69.69240041
  65.86690309  45.06467255  54.76253761  48.97292344  49.02290412
  66.35253673  50.59267348  74.27285144  71.57848376  48.94423235
  40.69894724  48.47852049  63.87241688  62.21709232  31.94004395
  56.61863187  71.40105096  78.06119955  26.93453267  75.30875178
  93.35588875  53.9007814   53.79675813  40.82871343  42.47342398
  53.77031194  76.40863396  56.76044017  84.21432334  63.14337752
  65.08787109  55.76537205  85.54495158  57.22261739  59.88478847
  50.63129011  75.02383061  56.63091323  85.07075067  63.78807491
  71.47314113  62.45917213  67.21024764  84.75851906  89.66957052
  51.9389285   68.43292319  69.03080095  51.30345787  78.44486023
  66.64683846  45.58105008  47.77245109  45.85044091  69.8371825
  53.55583225  80.87074125  54.69418322 100.32265112  47.60887288
  74.919380