## Nearest Neighbor
We will project our data onto a feature space and then use the nearest-neighbor algorithm to see whether distinct clusters can be classified as particular types.

In [1]:
## code
import pandas as pd
import math

In [2]:
# Load the Pokemon table; the path is relative to the notebook's working directory.
data = pd.read_csv('pokemon_final.csv')
# Preview the first rows to confirm the columns loaded as expected.
data.head()

Unnamed: 0,id,Name,Type 1,Type 2,Generation,height,weight,base_experience,hp,attack,defense,speed,special-attack,special-defense
0,1,bulbasaur,grass,poison,1,0.7,6.9,64,45,49,49,45,65,65
1,2,ivysaur,grass,poison,1,1.0,13.0,142,60,62,63,60,80,80
2,3,venusaur,grass,poison,1,2.0,100.0,236,80,82,83,80,100,100
3,4,charmander,fire,,1,0.6,8.5,62,39,52,43,65,60,50
4,5,charmeleon,fire,,1,1.1,19.0,142,58,64,58,80,80,65


In [3]:
# Collect the distinct primary types in order of first appearance.
# dict.fromkeys de-duplicates while keeping insertion order, so there is no
# need to probe the list with index() and catch ValueError for every row.
types = list(dict.fromkeys(data['Type 1']))

# 'Type 2' is not scanned here: single-type rows hold NaN in that column,
# and scanning it would add NaN to the list as if it were a type.

print(types)
print(len(types))

['grass', 'fire', 'water', 'bug', 'normal', 'poison', 'electric', 'ground', 'fairy', 'fighting', 'psychic', 'rock', 'ghost', 'ice', 'dragon', 'dark', 'steel', 'flying']
18


In [4]:
# Distinct generation numbers in order of first appearance.
# dict.fromkeys de-duplicates while keeping insertion order, so there is no
# need to probe the list with index() and catch ValueError for every row.
gen = list(dict.fromkeys(data['Generation']))

print(gen)

[1, 2, 3, 4, 5, 6, 7]


## Normalization

In [5]:
# Work on a copy so the raw values in `data` stay available for comparison
# and so the normalisation cell can be re-run from the original numbers.
newData = data.copy()
newData.head()

Unnamed: 0,id,Name,Type 1,Type 2,Generation,height,weight,base_experience,hp,attack,defense,speed,special-attack,special-defense
0,1,bulbasaur,grass,poison,1,0.7,6.9,64,45,49,49,45,65,65
1,2,ivysaur,grass,poison,1,1.0,13.0,142,60,62,63,60,80,80
2,3,venusaur,grass,poison,1,2.0,100.0,236,80,82,83,80,100,100
3,4,charmander,fire,,1,0.6,8.5,62,39,52,43,65,60,50
4,5,charmeleon,fire,,1,1.1,19.0,142,58,64,58,80,80,65


### Column Average

In [6]:
# Numeric feature columns that get normalised below.
variables = ['Generation', 'height', 'weight', 'base_experience', 
             'hp', 'attack', 'defense', 'speed', 'special-attack', 'special-defense']


In [7]:
# Divide every feature by its column mean so each column averages 1.0 and
# features measured on very different scales contribute comparably to distances.
for column in variables:
    # Always start from the untouched `data` column, so re-running this cell
    # never normalises values that were already normalised.
    asFloat = data[column].astype('float64')
    # Vectorised replacement for summing the column by hand and then
    # rewriting each cell through iterrows()/.at one row at a time.
    # Note: Series.mean() skips missing values when computing the average.
    newData[column] = asFloat / asFloat.mean()

newData.head()

Unnamed: 0,id,Name,Type 1,Type 2,Generation,height,weight,base_experience,hp,attack,defense,speed,special-attack,special-defense
0,1,bulbasaur,grass,poison,0.269179,0.602175,0.111703,0.44184,0.65456,0.644002,0.683154,0.683576,0.935427,0.928391
1,2,ivysaur,grass,poison,0.269179,0.860249,0.210454,0.980332,0.872747,0.814859,0.878341,0.911435,1.151295,1.142635
2,3,venusaur,grass,poison,0.269179,1.720499,1.618879,1.629285,1.163663,1.077717,1.157179,1.215247,1.439118,1.428293
3,4,charmander,fire,,0.269179,0.51615,0.137605,0.428032,0.567286,0.683431,0.599502,0.987388,0.863471,0.714147
4,5,charmeleon,fire,,0.269179,0.946274,0.307587,0.980332,0.843655,0.841145,0.808631,1.215247,1.151295,0.928391


## Euclidean Distance Function

In [8]:
def calcEuclid(row1, row2, columns=(5, 6)):
    """Return the Euclidean distance between two rows over selected positions.

    Parameters
    ----------
    row1, row2 : pandas.Series or any integer-indexable sequence
        The two rows to compare. Values at the chosen positions must be numeric.
    columns : iterable of int, optional
        Zero-based positions to include. The default (5, 6) corresponds to
        'height' and 'weight' in this notebook's table.

    Returns
    -------
    The square root of the summed squared differences. The numeric type is
    whatever the arithmetic on the inputs produces (e.g. a numpy float for
    numeric pandas data).
    """
    # A label-indexed Series must be read through .iloc to get positional
    # access; plain row[5] relies on a deprecated fallback in pandas.
    # Lists and tuples have no .iloc and are indexed directly.
    vals1 = getattr(row1, 'iloc', row1)
    vals2 = getattr(row2, 'iloc', row2)

    squaredSum = 0
    for pos in columns:
        squaredSum += pow(vals1[pos] - vals2[pos], 2)
    return pow(squaredSum, 1/2)

# Sanity check: distance between the first row and row 547, then show row 547 for reference.
print(calcEuclid(newData.iloc[0], newData.iloc[547]))
print(newData.iloc[547])

0.1721184212024331
id                      548
Name                petilil
Type 1                grass
Type 2                  NaN
Generation           1.3459
height             0.430125
weight             0.106846
base_experience     0.38661
hp                  0.65456
attack             0.460001
defense            0.697096
speed              0.455718
special-attack      1.00738
special-defense    0.714147
Name: 547, dtype: object


## Manhattan Distance Function

In [9]:
def calcMan(row1, row2, columns=range(4, 14)):
    """Return the Manhattan (L1) distance between two rows over selected positions.

    Parameters
    ----------
    row1, row2 : pandas.Series or any integer-indexable sequence
        The two rows to compare. Values at the chosen positions must be numeric.
    columns : iterable of int, optional
        Zero-based positions to include. The default, positions 4 through 13,
        spans 'Generation' through 'special-defense' in this notebook's table.

    Returns
    -------
    The sum of absolute differences over the chosen positions.
    """
    # A label-indexed Series must be read through .iloc to get positional
    # access; plain row[i] relies on a deprecated fallback in pandas.
    # Lists and tuples have no .iloc and are indexed directly.
    vals1 = getattr(row1, 'iloc', row1)
    vals2 = getattr(row2, 'iloc', row2)

    man = 0
    for i in columns:
        man += abs(vals1[i] - vals2[i])
    return man

## KNN Finder

In [25]:
# Find the k rows closest to the given row.
def kNN(row, removeIndex, k, table=None, distance=None):
    """Return the k nearest neighbours of `row`, nearest first.

    Parameters
    ----------
    row : pandas.Series
        The query row.
    removeIndex : index label
        Label of the row to exclude from the search (normally the query row
        itself). A KeyError is raised if the label is not in the table.
    k : int
        Number of neighbours wanted. Fewer are returned when the table does
        not hold k other rows; k <= 0 yields an empty list.
    table : pandas.DataFrame, optional
        Rows to search. Defaults to the notebook-level `newData`.
    distance : callable(row, otherRow) -> number, optional
        Distance function. Defaults to `calcEuclid`.

    Returns
    -------
    list of (pokemon, distance) tuples
        result[i][0] is the neighbouring row, result[i][1] its distance.
    """
    # Defaults are resolved at call time so the function picks up whichever
    # table and distance function are currently defined in the notebook.
    if table is None:
        table = newData
    if distance is None:
        distance = calcEuclid

    # drop() returns a new frame, so the caller's table is left untouched.
    candidates = table.drop([removeIndex])

    # Score every candidate once and keep the row object handed out by
    # iterrows(); this avoids looking rows up again by label or position,
    # which no longer line up once rows have been dropped.
    scored = []
    for _, candidate in candidates.iterrows():
        dist = distance(row, candidate)
        # NaN distances cannot be ordered; leave those rows out.
        if dist == dist:
            scored.append((candidate, dist))

    # list.sort is stable, so rows at equal distance keep their table order.
    scored.sort(key=lambda pair: pair[1])
    return scored[:max(k, 0)]
    

# test = kNN(newData.iloc[0], 5)
# print(newData.iloc[0])
# # for n in test:
# #     print(n)

## Classification Calculation

In [44]:
def typeClass(row, guess):
    """Return True when `guess` equals the row's 'Type 1' or 'Type 2', else False."""
    # 'Type 1' is checked first; 'Type 2' is only read if the first check fails.
    return any(row[slot] == guess for slot in ('Type 1', 'Type 2'))

In [11]:
# Score a prediction taken from the single closest neighbour's Type 1.
def classifyNearestNeighbor(row, neighbors):
    """Return 1 if the nearest neighbour's 'Type 1' matches either type of `row`, else 0.

    `neighbors` is a list of (pokemon, distance) tuples; only the first
    entry is consulted.
    """
    predicted = neighbors[0][0]['Type 1']
    for slot in ('Type 1', 'Type 2'):
        if predicted == row[slot]:
            return 1
    return 0
    
# kNN takes (row, removeIndex, k): pass the query row's own index label (0)
# so the row is excluded from its own neighbour search.
accuracy = classifyNearestNeighbor(newData.iloc[0], kNN(newData.iloc[0], 0, 3))
print(accuracy)

1


In [41]:
# Predict a type by majority vote over the neighbours' types.
def classifyPopularType(row, neighbors):
    """Return the type that occurs most often among the neighbours.

    Parameters
    ----------
    row : unused
        Kept so the signature matches the other classifiers.
    neighbors : list of (pokemon, distance) tuples
        Each pokemon must expose 'Type 1' and 'Type 2'.

    Returns
    -------
    The most frequent type across both type slots of all neighbours. Every
    occurrence counts equally; distances are not used as weights. On a tie
    the type that was encountered first wins. Returns None when no neighbour
    contributes a type.
    """
    votes = {}
    for pokemon in neighbors:
        for slot in ('Type 1', 'Type 2'):
            kind = pokemon[0][slot]
            # Single-type Pokemon have a missing second type (None or NaN,
            # and NaN != NaN); a missing value must not receive votes.
            if kind is None or kind != kind:
                continue
            votes[kind] = votes.get(kind, 0) + 1

    if not votes:
        return None
    # max() keeps the first maximal key in insertion order, which gives the
    # "first encountered wins" tie-break.
    return max(votes, key=votes.get)


In [46]:
# End-to-end check for one Pokemon: find its testK nearest neighbours,
# predict a type from them, and report whether the prediction matches
# either of that Pokemon's own types.
testIndex = 0
testK = 5
neighbors = kNN(newData.iloc[testIndex], testIndex, testK)
classType = classifyPopularType(newData.iloc[testIndex], neighbors)
print(typeClass(newData.iloc[testIndex], classType))

True
