## Similar SkiBoard Finder
---
Find Skis or Snowboards that are similar to another in a specified size
- provide skiboard + size
- request X similar skiboards
- return results as list

In [1008]:
import json
import os, re, math
import statistics
import pymysql
import numpy as np
import pandas as pd
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import euclidean

In [1009]:
# Database Setup
# --------------------------------------------------
def setupdb():
    f = open('../app/config/localdb_config.json')
    dbconfig = json.loads(f.read())
    db = pymysql.connect(host=dbconfig['localhost'], user=dbconfig['username'], password=dbconfig['password'], database=dbconfig['database'])
    f.close()
    return db

### Collect Skiboards from DB
---

In [1010]:
def get_skiboards():
    db = setupdb()
    cursor = db.cursor()

    sql = 'SELECT * FROM Sizes LEFT JOIN SkiBoards on SkiBoards.skiboard_id = Sizes.skiboard_id'

    # Search for SkiBoards with 
    try:
        cursor.execute(sql)
        response = cursor.fetchall()
        skiboards = []

        
        for r in response:
            #print(f'{r}\n\n')
            skiboards.append({
                'id': r[0],
                'name': f'{r[15]} {r[16]}',
                'year': r[17],
                'type': r[20],
                'size': r[1],
                'stiffness': r[23],
                'camber_profile': r[26],
                'shape': r[24],
                'nose_width': r[2],
                'waist_width': r[3],
                'tail_width': r[4],
                'sidecut': r[5],
                'effective_edge': r[9]
            })
    except Exception as e:
        print(f'ERROR: {e}')
    
    return pd.DataFrame.from_records(skiboards)

### Manage Sidecut Abnormalities
---

In [1011]:
def condense_sidecuts(skiboards):
    for idx, row in skiboards.iterrows():
        
        try:
            # extract list of numbers from 'sidecut' value
            radius_list = re.findall(r'\d+\.?\d*', str(row['sidecut']))
    
            # convert from str to float val
            radius_list = [float(val) for val in radius_list]
    
            # calc the median radius and weight the average calc with this value
            median_radius = statistics.median(radius_list)
            radius_list.append(median_radius)
    
            # calc a weighted average of the sidecut
            radius_av = round(statistics.mean(radius_list), 1)
    
        except Exception as e:
            print(f'ERROR: {e}')
            print(f'{row}\n')
    
        
        # update the value of sidecut with the weighted average
        skiboards.loc[idx, 'sidecut'] = radius_av

    return skiboards

### Calculate Taper 
---

In [1012]:
def calc_taper(skiboards):
    for idx, row in skiboards.iterrows():
        if not row['nose_width']:
            skiboards.loc[idx, 'nose_width'] = None

        if not row['tail_width']:
            skiboards.loc[idx, 'tail_width'] = None

        if row['nose_width'] and row['tail_width']:
            skiboards.loc[idx, 'taper'] = float(row['nose_width']) - float(row['tail_width'])
            
    return skiboards

### Weigting of normalized values
---

In [1059]:
def normalize(skiboards, keys):

    # normalize dataset
    scaler = MaxAbsScaler()
    norm_data = scaler.fit_transform(skiboards[keys])

    return norm_data

## Find Similar SkiBoards
---

In [1183]:
def find_similar(my_skiboard, skiboards):
    
    # List of parameters with their respective similarity weights
    weights = {
        'nose_width': 1,
        'waist_width': 10,
        'tail_width': 1.5,
        'sidecut': 3,
        'effective_edge': 2,
        'taper': 5
    }
    
    # Normalize sidecut inconsistencies
    # ---------------------------------
    skiboards = condense_sidecuts(skiboards)

    
    # Calculate taper
    # ---------------
    skiboards = calc_taper(skiboards)

    
    # Find the full set index of the comparitor SkiBoard
    # --------------------------------------------------
    my_skiboard_idx = skiboards[(skiboards['id'] == my_skiboard['id'].values[0]) & (skiboards['size'] == my_skiboard['size'].values[0])].index[0]

    
    # Normalize dataset and apply weights
    # -----------------------------------
    norm_data = normalize(skiboards, weights.keys())

    
    # Calculate similarity scores for each SkiBoard
    # ---------------------------------------------
    for i, s in skiboards.iterrows():
        try:
            distance = 0 
            for j, param in enumerate(list(weights.keys())):
                distance = distance + ( weights[param] * ((norm_data[my_skiboard_idx][j] - norm_data[i][j])**2) )
            if distance:
                distance = math.sqrt(distance)
                
            skiboards.loc[i, 'similarity'] = distance
        except Exception as e:
            print(f"{i} / ERROR: {e}")
            skiboards.loc[i, 'similarity'] = None
            
        


    # Normalize Camber dominance of Skiboards
    # ---------------------------------------
    my_camber = str(my_skiboard.camber_profile).lower()
    if my_camber:
        if 'flat' in my_camber:
            my_camber = 'flat'
        elif 'rocker' in my_camber:
            my_camber = 'rocker'
        else:
            my_camber = 'camber'

    '''
    # Normalize shape profile of SkiBoards
    # ------------------------------------
    my_shape = str(my_skiboard.shape).lower()
    if my_shape:
        if 'twin' in my_shape:
            my_shape = 'twinish' # includes directional twin or twintip etc
        else:
            my_shape = 'directional'
    '''
    
    # Adjust normalized similarity scores for each SkiBoard
    # -----------------------------------------------------
    median_similarity = skiboards['similarity'].median()
    
    for i, s in skiboards.iterrows():
        # Adjust for stiffness
        if (float(s.stiffness) < float(my_skiboard.stiffness.values[0]) - 1.5) or (float(s.stiffness) > float(my_skiboard.stiffness.values[0]) + 1.5):
            if skiboards.loc[i, 'similarity'] == 0:
                skiboards.loc[i, 'similarity'] = 0.01
            skiboards.loc[i, 'similarity'] = skiboards.loc[i, 'similarity'] + (0.1 * median_similarity)
        
        # Adjust for camber profile
        # -------------------------
        if my_camber not in str(s.camber_profile).lower():
            if skiboards.loc[i, 'similarity'] == 0:
                skiboards.loc[i, 'similarity'] = 0.01
            skiboards.loc[i, 'similarity'] = skiboards.loc[i, 'similarity'] + (0.2 * median_similarity)

        # Do we need this when we have all other measurements??
        # Maybe this is more useful for skis than boards??
        '''
        # Generalise shape
        # ----------------
        if 'twin' in str(s.shape).lower():
            this_shape = 'twinish'
        else:
            this_shape = 'directional'

        
        # Adjust for shape
        if my_shape != this_shape:
            if skiboards.loc[i, 'similarity'] == 0:
                skiboards.loc[i, 'similarity'] = 0.01
            skiboards.loc[i, 'similarity'] = skiboards.loc[i, 'similarity'] * 1.2
        '''
    
        


    # Using a nultiplier [ * 0.9 ] won't effect exact matches
    # Burton Custom X should not have a 100% similarity with a Burton Custom Flying V
    

    return skiboards  

## Find Versions
---
Finds each version of the exact same SkiBoard (different years of the same thing)

In [1184]:
def find_versions(my_skiboard, skiboards, show_original=False, only_newest=False, remove_newest=False):

    # collect all versions of the same skibaord
    versions = skiboards.loc[skiboards['name'] == my_skiboard['name'].values[0]]
    versions = versions.sort_values(['year', 'size'], ascending=[False, True])

    # filter results to include / exclude newest model
    if only_newest:
        newest_id = versions.iloc[0].id
        versions = versions[versions['id'] == newest_id]
    elif remove_newest:
        newest_id = versions.iloc[0].id
        versions = versions[versions['id'] != newest_id]

    # if the selected skiboard is the newest, an empty dataframe will be returned
    if not show_original:
        versions = versions[versions['id'] != my_skiboard.id.values[0]]
        
    return versions

## Filter SkiBoard Type
---
Remove skiboards of a different type from comparisons (skis should not be compared to splitboards etc.)

In [1185]:
def filter_by_type(my_skiboard, skiboards):
    my_type = my_skiboard['type'].values[0]
    
    return skiboards[skiboards['type'] == my_type]

# Main
---

In [1186]:
skiboards = get_skiboards()

In [1187]:
skiboards.head()

Unnamed: 0,id,name,year,type,size,stiffness,camber_profile,shape,nose_width,waist_width,tail_width,sidecut,effective_edge
0,1,Burton Custom,2020,Snowboard,150,6.0,Full Camber,Directional Twin,288.1,248.0,288.1,7.4,1135.0
1,1,Burton Custom,2020,Snowboard,154,6.0,Full Camber,Directional Twin,291.5,250.0,291.5,7.6,1175.0
2,1,Burton Custom,2020,Snowboard,154W,6.0,Full Camber,Directional Twin,299.5,258.0,299.5,7.6,1175.0
3,1,Burton Custom,2020,Snowboard,156,6.0,Full Camber,Directional Twin,294.3,252.0,294.3,7.8,1195.0
4,1,Burton Custom,2020,Snowboard,158,6.0,Full Camber,Directional Twin,297.0,254.0,297.0,7.9,1215.0


In [1188]:
name = 'Burton Custom'
year = 2020
size = '154W'

my_skiboard = skiboards.loc[(skiboards['name'] == name) & (skiboards['year'] == str(year)) & (skiboards['size'] == size)]
my_skiboard.head()

Unnamed: 0,id,name,year,type,size,stiffness,camber_profile,shape,nose_width,waist_width,tail_width,sidecut,effective_edge
2,1,Burton Custom,2020,Snowboard,154W,6.0,Full Camber,Directional Twin,299.5,258.0,299.5,7.6,1175.0


In [1189]:
skiboards = find_similar(my_skiboard, skiboards)
versions = find_versions(my_skiboard, skiboards, remove_newest=True, show_original=True)

#### Remove exact matches of skiboard from results list

In [1190]:
skiboards = skiboards[~skiboards['id'].isin(versions['id'].tolist())]

In [1191]:
filter_by_type(my_skiboard, skiboards)

Unnamed: 0,id,name,year,type,size,stiffness,camber_profile,shape,nose_width,waist_width,tail_width,sidecut,effective_edge,taper,similarity
0,1,Burton Custom,2020,Snowboard,150,6.0,Full Camber,Directional Twin,288.1,248.0,288.1,7.4,1135.0,0.0,0.129013
1,1,Burton Custom,2020,Snowboard,154,6.0,Full Camber,Directional Twin,291.5,250.0,291.5,7.6,1175.0,0.0,0.094531
2,1,Burton Custom,2020,Snowboard,154W,6.0,Full Camber,Directional Twin,299.5,258.0,299.5,7.6,1175.0,0.0,0.000000
3,1,Burton Custom,2020,Snowboard,156,6.0,Full Camber,Directional Twin,294.3,252.0,294.3,7.8,1195.0,0.0,0.075155
4,1,Burton Custom,2020,Snowboard,158,6.0,Full Camber,Directional Twin,297.0,254.0,297.0,7.9,1215.0,0.0,0.067526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
816,98,Ride Manic,2020,Snowboard,163,3.0,Directional Hybrid Camber,Twin,305.0,257.0,305.0,8.3,1241.0,0.0,0.122743
817,99,Ride Heartbreaker,2020,Snowboard,139,3.0,Hybrid Camber,Twin,274.0,234.0,274.0,8.7,1076.0,0.0,0.344668
818,99,Ride Heartbreaker,2020,Snowboard,143,3.0,Hybrid Camber,Twin,277.0,236.0,277.0,8.8,1101.0,0.0,0.319163
819,99,Ride Heartbreaker,2020,Snowboard,147,3.0,Hybrid Camber,Twin,281.0,238.0,281.0,8.9,1127.0,0.0,0.294658


In [1192]:
skiboards = skiboards.sort_values(['similarity'], ascending=True)
skiboards.head(20)

Unnamed: 0,id,name,year,type,size,stiffness,camber_profile,shape,nose_width,waist_width,tail_width,sidecut,effective_edge,taper,similarity
2,1,Burton Custom,2020,Snowboard,154W,6.0,Full Camber,Directional Twin,299.5,258.0,299.5,7.6,1175.0,0.0,0.0
548,41,Burton Free Thinker,2020,Snowboard,157w,6.5,Full Camber,Twin,304.3,260.0,304.3,7.5,1195.0,0.0,0.038561
521,35,Burton Kilroy 3D,2020,Snowboard,154,5.0,Hybrid Camber,Twin,296.2,254.0,296.2,7.6,1180.0,0.0,0.04631
741,78,CAPiTA Outerspace Living,2020,Snowboard,155w,5.0,Hybrid Camber,Twin,304.0,260.0,304.0,7.9,1158.0,0.0,0.046528
522,35,Burton Kilroy 3D,2020,Snowboard,158,5.0,Hybrid Camber,Twin,301.7,258.0,301.7,7.8,1220.0,0.0,0.050417
574,45,Burton Custom (Flying V),2020,Snowboard,154w,6.0,Hybrid Rocker,Directional Twin,299.5,258.0,299.5,7.6,1175.0,0.0,0.052291
45,105,Ride Burnout,2020,Snowboard,157w,7.0,Hybrid Camber,Twin,303.0,258.0,303.0,7.8,1221.0,0.0,0.052893
697,70,CAPiTA Super DOA,2020,Snowboard,155w,6.0,Hybrid Camber,Twin,302.0,258.0,302.0,8.1,1166.0,0.0,0.053641
665,63,CAPiTA DOA,2020,Snowboard,155w,5.5,Hybrid Camber,Twin,302.0,258.0,302.0,8.1,1166.0,0.0,0.053641
757,80,CAPiTA Indoor Survival,2020,Snowboard,160,5.0,Hybrid Camber,Twin,302.0,257.0,302.0,8.1,1194.0,0.0,0.057204


# TO DO...
---
- add this file to branch and deploy to master

## Improvements
---
- Factor in stiffness rating
- Consider camber profile? Do we simplify this to camber / rocker dominent
- Scale variables by significance

- Remove duplicate results for all other skiboards
- Weight variables based on significacnce during similarity calc
- How do we handle similarity score for skiboards with missing params?
    -- Do we weight a penalty based on which param is missing?

In [396]:
mystr = "abcb"

In [397]:
def recurring_checker(my_str):
    for idx, c in enumerate(mystr):
        if c in list(mystr)[idx:]:
            return c
    return None

In [398]:
print(recurring_checker(mystr))

a


In [480]:
weights = {
        'nose_width': 1,
        'waist_width': 1,
        'tail_wdith': 1,
        'sidecut': 1,
        'effective_edge': 1,
        'taper': 1,
    }

In [483]:
for w in weights.keys():
    print(w)

nose_width
waist_width
tail_wdith
sidecut
effective_edge
taper
