# Import Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

# Import the datasets

In [25]:
peopleFilepath = './Data/pickledPeople.pkl'
voteFilepath = './Data/pickledVotes_unpacked.pkl'
routeFilepath = './Data/pickledRoutes_unpacked.pkl'

people = pd.read_pickle(peopleFilepath)
votes = pd.read_pickle(voteFilepath)
routes = pd.read_pickle(routeFilepath)

## Function to find common people for an area

In [10]:
# Function to find all common people between N most-popular routes
def findCommonPeople(areaUrl, breadcrumbTier, nRoutes=10, routes=routes, people=people):
    areaRoutes = routes.loc[routes[breadcrumbTier]==areaUrl]
    areaRoutes = areaRoutes.sort_values('numQualityVotes', ascending=False)
    areaRoutes = areaRoutes[:nRoutes]
    
    subPeople = set([])
    
    # Filter the people dataset to only include those with at least nRoutes starRatings
    people = people.loc[people['numStarRatings']>=nRoutes*5]
            
    for i in range(len(areaRoutes)):
        tempSet=[]
        print("Parsing peopleDF for route", i+1, "of", nRoutes)
        for j in range(len(people)):
            if areaRoutes.index[i] in people.iloc[j]['starRatings']:
                tempSet.append(people.index[j])
                print(j/len(people), end='\r')        
        if i==0:
            subPeople=set(tempSet)
        else:
            subPeople = subPeople.intersection(set(tempSet))
            
    print(len(subPeople),"have climbed the", nRoutes, "most-rated climbs at", areaUrl) 

    return subPeople

## Isolate the People to be used for classification

In [54]:
eldoUrl = 'https://www.mountainproject.com/area/105744246/eldorado-canyon-sp'
subpeople = findCommonPeople(eldoUrl, 'bc3')


Parsing peopleDF for route 1 of 10


0.00025833118057349520.00051666236114699040.00077499354172048570.00090415913200723320.00193748385430121420.00206664944458796180.00219581503487470930.00245414621544820480.0030999741668819430.00413329888917592350.00452079566003616650.00490829243089640960.0060707827434771380.0065874451046241280.0077499354172048570.0078791010074916040.0080082665977783510.00813743218806510.0083957633686385940.0100749160423663130.0104624128132265560.0105915784035133030.01084990958408680.0109790751743735460.011366571945233790.0114957375355205370.0116249031258072850.0125290622578145190.0126582278481012660.0131748902092482560.0134332213898217520.0136915525703952460.0154998708344097140.015629036424696460.0158873676052699550.01627486437613020.016791526737277190.0171790235081374330.017954017049857920.0180831826401446660.0188581761818651520.0197623353138723850.0214414879876001050.0218289847584603460.0226039783001808320.0235081374321880650.0236373030224748120.0237664686

Parsing peopleDF for route 2 of 10
Parsing peopleDF for route 3 of 10
Parsing peopleDF for route 4 of 10
Parsing peopleDF for route 5 of 10
Parsing peopleDF for route 6 of 10
Parsing peopleDF for route 7 of 10
Parsing peopleDF for route 8 of 10
Parsing peopleDF for route 9 of 10
Parsing peopleDF for route 10 of 10
95 have climbed the 10 most-rated climbs at https://www.mountainproject.com/area/105744246/eldorado-canyon-sp


## Isolate the Routes to be used for classification

In [55]:
eldo = routes.loc[routes['bc3']=='https://www.mountainproject.com/area/105744246/eldorado-canyon-sp']
eldo = eldo.sort_values(by='numQualityVotes', ascending=False)
eldo = eldo[:10]

## Separate the Training and Testing dataset

In [138]:
# Make the training dataset
train = people.loc[subpeople[0]]
for i in range(1, int(len(subpeople)/2)):
    train = pd.concat([train, people.loc[subpeople[i]]], axis=1)
train = train.transpose()

# Make the test dataset
test = people.loc[subpeople[int(len(subpeople)/2)]]
for i in range(int(len(subpeople)/2)+1, len(subpeople)):
    test = pd.concat([test, people.loc[subpeople[i]]], axis=1)
test = test.transpose()

# All together, just in case it is needed
total = pd.concat([test, train])

### Visualization to see general star distribution

In [140]:
# Visualization to see general star distribution
d = {0:0, 1:0, 2:0, 3:0, 4:0}
r = list(eldo.index)
for i in range(len(total)):
    for j in range(len(eldo)):
        print(total.iloc[i]['starRatings'][r[j]], end=' ')
        d[total.iloc[i]['starRatings'][r[j]]] = d[total.iloc[i]['starRatings'][r[j]]]+1
    print()
    
for i in d:
    print(i, ":", d[i])

4 4 3 4 2 4 4 3 4 4 
4 4 4 3 4 3 4 4 3 4 
4 3 4 3 3 4 3 3 3 3 
3 4 4 3 3 3 4 4 2 4 
4 4 4 4 3 3 3 4 4 4 
3 3 4 2 2 3 2 2 2 3 
4 4 4 3 3 4 4 4 4 2 
3 3 4 4 2 3 3 3 3 4 
2 3 4 4 3 4 4 3 3 4 
3 3 2 3 2 3 3 2 2 3 
4 3 3 4 3 4 4 4 3 3 
4 3 4 4 3 3 4 4 4 4 
4 3 4 4 2 3 4 3 3 3 
3 4 3 3 3 3 4 4 3 4 
4 4 4 4 4 4 4 4 4 4 
3 3 4 3 3 3 4 4 3 3 
2 4 4 3 3 3 4 3 4 2 
4 4 4 4 3 3 4 4 3 3 
4 4 4 3 3 4 4 4 3 3 
3 4 4 2 3 4 4 4 4 4 
2 4 4 3 3 2 4 4 4 3 
4 4 4 4 3 3 4 4 3 4 
4 3 4 4 3 3 4 3 3 2 
3 3 4 2 3 3 4 4 3 4 
4 3 4 3 3 3 4 4 3 3 
3 3 4 2 3 3 4 3 3 3 
3 4 4 3 3 4 4 4 4 4 
4 4 4 3 3 4 4 4 4 4 
4 4 2 3 3 4 4 3 3 4 
4 4 4 4 3 3 4 4 4 4 
4 3 4 4 4 4 4 4 4 4 
4 4 4 3 3 3 3 3 3 3 
4 4 4 3 3 3 4 3 4 3 
3 3 4 3 3 3 3 3 3 3 
3 4 4 3 3 4 4 4 4 4 
4 4 4 2 2 3 4 3 2 3 
4 4 4 3 2 4 4 4 4 4 
3 4 3 2 3 3 4 4 2 3 
4 4 4 4 4 3 4 4 4 4 
4 4 4 3 4 4 4 3 4 4 
4 3 4 2 3 3 4 2 2 3 
4 4 4 3 3 4 4 4 4 4 
4 3 2 4 3 3 3 4 4 3 
2 3 3 2 3 3 4 3 4 4 
4 4 4 3 4 3 4 4 2 4 
4 3 4 3 2 3 4 3 4 3 
4 4 4 3 3 4 3 4 3 4 
4 3 4 4 3 4 4

In [133]:
test.iloc[0]['starRatings']['https://www.mountainproject.com/route/105748490/the-bastille-crack']

4