## Nearest Neighbor
We will project our data onto a space and then use the nearest neighbor algorithm to see if we can classify distinct clusters as certain types.

In [23]:
## code
import pandas as pd
import math

In [2]:
data = pd.read_csv('pokemon_final.csv')
data.head()

Unnamed: 0,id,Name,Type 1,Type 2,Generation,height,weight,base_experience,hp,attack,defense,speed,special-attack,special-defense
0,1,bulbasaur,grass,poison,1,0.7,6.9,64,45,49,49,45,65,65
1,2,ivysaur,grass,poison,1,1.0,13.0,142,60,62,63,60,80,80
2,3,venusaur,grass,poison,1,2.0,100.0,236,80,82,83,80,100,100
3,4,charmander,fire,,1,0.6,8.5,62,39,52,43,65,60,50
4,5,charmeleon,fire,,1,1.1,19.0,142,58,64,58,80,80,65


In [3]:
types = []
for pType in data['Type 1']:
    try:
        index = types.index(pType)
    except ValueError:
        types.append(pType)

for pType in data['Type 2']:
    try:
        index = types.index(pType)
    except ValueError:
        types.append(pType)
        

print(types)

['grass', 'fire', 'water', 'bug', 'normal', 'poison', 'electric', 'ground', 'fairy', 'fighting', 'psychic', 'rock', 'ghost', 'ice', 'dragon', 'dark', 'steel', 'flying', nan]


In [4]:
gen = []
for eachGen in data['Generation']:
    try:
        index = gen.index(eachGen)
    except ValueError:
        gen.append(eachGen)
        
print(gen)

[1, 2, 3, 4, 5, 6, 7]


## Normalizaton

In [39]:
newData = data.copy()
newData.head()

Unnamed: 0,id,Name,Type 1,Type 2,Generation,height,weight,base_experience,hp,attack,defense,speed,special-attack,special-defense
0,1,bulbasaur,grass,poison,1,0.7,6.9,64,45,49,49,45,65,65
1,2,ivysaur,grass,poison,1,1.0,13.0,142,60,62,63,60,80,80
2,3,venusaur,grass,poison,1,2.0,100.0,236,80,82,83,80,100,100
3,4,charmander,fire,,1,0.6,8.5,62,39,52,43,65,60,50
4,5,charmeleon,fire,,1,1.1,19.0,142,58,64,58,80,80,65


### ColumnAverage

In [6]:
variables = ['Generation', 'height', 'weight', 'base_experience', 
             'hp', 'attack', 'defense', 'speed', 'special-attack', 'special-defense']


In [40]:
for column in variables:
    colAvg = 0
    newData[column] = data[column].astype('float64')

    for value in newData[column]:
        colAvg += value
    
    colAvg = colAvg / newData.shape[0]
    print(colAvg)
    for i, row in newData.iterrows():
        newData.at[i, column] = newData.at[i, column] / colAvg

newData.head()

3.714993804213135
1.1624535315985127
61.771127633209396
144.84882280049567
68.74845105328377
76.08674101610904
71.72614622057002
65.83023543990086
69.48698884758365
70.01363073110285


Unnamed: 0,id,Name,Type 1,Type 2,Generation,height,weight,base_experience,hp,attack,defense,speed,special-attack,special-defense
0,1,bulbasaur,grass,poison,0.269179,0.602175,0.111703,0.44184,0.65456,0.644002,0.683154,0.683576,0.935427,0.928391
1,2,ivysaur,grass,poison,0.269179,0.860249,0.210454,0.980332,0.872747,0.814859,0.878341,0.911435,1.151295,1.142635
2,3,venusaur,grass,poison,0.269179,1.720499,1.618879,1.629285,1.163663,1.077717,1.157179,1.215247,1.439118,1.428293
3,4,charmander,fire,,0.269179,0.51615,0.137605,0.428032,0.567286,0.683431,0.599502,0.987388,0.863471,0.714147
4,5,charmeleon,fire,,0.269179,0.946274,0.307587,0.980332,0.843655,0.841145,0.808631,1.215247,1.151295,0.928391


## Euclidean Distance Function

In [53]:
def calcEuclid(row1, row2):
    eDist = 0
    d = []
    for i in range(4, 14):
        d.append(pow(row1[i] - row2[i], 2))
    
    for num in d:
        eDist += num
    return math.sqrt(eDist)

In [80]:
# Find k closest rows to the given row via euclidean dist.
def kNN(row, k):
    # Find closest row and append it to list of k rows
    #  remove row from the table
    #  repeat until we have top k closest rows
    testTable = fake.copy()
    kNear = []
    while (len(kNear) < k):
        currMin = 100.0
        minIndex = 0
        for i, currRow in testTable.iterrows():
            eDist = calcEuclid(row, currRow)
            if (eDist < currMin):
                currMin = eDist
                minIndex = i
        # Drop row and append to kNear
        kNear.append(testTable.iloc[minIndex])
        testTable = testTable.drop(testTable.index[minIndex])
        
    print(kNear)
    

test = kNN(newData.iloc[0], 3)

[id                       44
Name                  gloom
Type 1                grass
Type 2               poison
Generation         0.269179
height               0.6882
weight             0.139224
base_experience    0.952717
hp                 0.872747
attack             0.854288
defense            0.975934
speed              0.607624
special-attack      1.22325
special-defense     1.07122
Name: 43, dtype: object, id                        45
Name               vileplume
Type 1                 grass
Type 2                poison
Generation          0.269179
height                1.0323
weight              0.301112
base_experience      1.52573
hp                   1.09093
attack               1.05143
defense              1.18506
speed               0.759529
special-attack       1.58303
special-defense      1.28546
Name: 44, dtype: object, id                        46
Name                   paras
Type 1                   bug
Type 2                 grass
Generation          0.269179
height

In [75]:
fake = newData.copy()
fake = fake.drop([0])
fake.head()

Unnamed: 0,id,Name,Type 1,Type 2,Generation,height,weight,base_experience,hp,attack,defense,speed,special-attack,special-defense
1,2,ivysaur,grass,poison,0.269179,0.860249,0.210454,0.980332,0.872747,0.814859,0.878341,0.911435,1.151295,1.142635
2,3,venusaur,grass,poison,0.269179,1.720499,1.618879,1.629285,1.163663,1.077717,1.157179,1.215247,1.439118,1.428293
3,4,charmander,fire,,0.269179,0.51615,0.137605,0.428032,0.567286,0.683431,0.599502,0.987388,0.863471,0.714147
4,5,charmeleon,fire,,0.269179,0.946274,0.307587,0.980332,0.843655,0.841145,0.808631,1.215247,1.151295,0.928391
5,6,charizard,fire,flying,0.269179,1.462424,1.465086,1.6569,1.134571,1.104003,1.08747,1.519059,1.568639,1.214049
