In [1]:
import pandas as pd
import numpy as np
from lib.plot_jzar import *
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [2]:
# Notebook display settings: show every column, but cap long frames at 10 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 10

### We have 3 types of data:

- Independent features (clustered)
- `How good the route is (the higher the better)` => <b>In this notebook we will work on this one</b>
- Grade, location, sex, height (defined by user)


# Routes rating

Objective
- We want to convert all the "rating" columns into a single one

In [3]:
# Load the clustered routes table; the first CSV column is used as the index.
routes = pd.read_csv(
    '../data/routes_clustered.csv',
    index_col=0,
    low_memory=False,
)
print(routes.shape)
routes.tail(3)

FileNotFoundError: [Errno 2] No such file or directory: '../data/routes_clustered.csv'

## Prepare the dataframe

We will use the following values
- rating_mean -> the mean value of the route rating
- repeat_rate -> the number of times the route has been repeated by the same person, divided by ascents
- recommend_rate -> the number of times the route has been recommended, divided by ascents
- sentiment_rate -> sentiment count divided by comment count

Note: the code below only uses `rating_mean`, `recommend_rate` and `sentiment_mean`; `repeat_rate` is not computed in this notebook.

In [None]:
# Build the frame of rating signals that will be combined into a single score.
# recommend_rate = recommendations per ascent. A zero ascents_count yields
# +/-inf (or NaN for 0/0) here; those values must be cleaned before modelling.
rating_columns = ['rating_mean', 'recommend_rate', 'sentiment_mean']
routes_rate = routes.assign(
    recommend_rate=routes.recommend_sum / routes.ascents_count
)[rating_columns].copy()
# The explicit .copy() makes routes_rate an independent frame, so assigning to
# its columns later does not raise SettingWithCopyWarning.
print(routes_rate.shape)
routes_rate.tail(3)

Replace NaN values (`rating_mean` by its mean, `sentiment_mean` by 0)

In [None]:
# Number of missing values in each rating column.
missing_per_column = routes_rate.isna().sum()
missing_per_column

In [None]:
# Impute missing ratings with the column mean (NaN values are skipped when averaging).
rating_mean_fill = routes_rate['rating_mean'].mean()
routes_rate['rating_mean'] = routes_rate['rating_mean'].fillna(rating_mean_fill)

In [None]:
# Distribution of the imputed mean rating.
routes_rate['rating_mean'].hist()

In [None]:
# Turn +/-inf (e.g. from dividing by a zero ascents_count) into NaN so that it
# can be imputed like any other missing value. Rebinding instead of
# inplace=True keeps the cell free of in-place mutation side effects.
routes_rate = routes_rate.replace([np.inf, -np.inf], np.nan)
# Fill the remaining gaps with 0. recommend_rate must be filled as well:
# scikit-learn's PCA rejects input containing NaN, and the inf -> NaN
# replacement above can introduce NaN into that column.
routes_rate['sentiment_mean'] = routes_rate['sentiment_mean'].fillna(0)
routes_rate['recommend_rate'] = routes_rate['recommend_rate'].fillna(0)
routes_rate.sentiment_mean.hist()

In [None]:
# Frequency of each distinct sentiment value.
sentiment_counts = routes_rate['sentiment_mean'].value_counts()
sentiment_counts

In [None]:
# Distribution of the recommendation rate.
routes_rate['recommend_rate'].hist()

 #### Scale them

In [None]:
# Min-max scale the rating features and persist the fitted scaler for reuse.
X = routes_rate.copy()
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

filename = "../transformer/scaler_rating.pickle"  # Path with filename
with open(filename, "wb") as file:
    pickle.dump(scaler, file)

# Wrap the scaled array back into a frame with the original column names.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
print(X_scaled_df.shape)
display(X_scaled_df.head(3))

## Group the 3 features into one

### Check the PCA

In [None]:
# Fit a PCA that keeps every component and project the scaled features onto it.
pca = PCA()
principalComponents = pca.fit_transform(X_scaled_df)
principalDf = pd.DataFrame(principalComponents)
# Share of the total variance carried by each component.
pca.explained_variance_ratio_

The first principal component explains about 70 percent of the variance, so we take that component as the overall rating.

#### Check the distribution

In [None]:
%matplotlib notebook
plot3D(x = principalDf[0],y = principalDf[1],z = principalDf[2],color = principalDf[0], fraction = 0.1)

In [None]:
# Use the first principal component (column 0) as the single rating score: PCA
# orders components by explained variance, so column 0 is the dominant one
# described in the text above (column 1 is only the second-largest).
# Assign positionally through a NumPy array: principalDf has a fresh RangeIndex,
# while routes keeps the index read from the CSV, so a label-aligned assignment
# could mismatch rows or introduce NaN.
# NOTE(review): the sign of a principal component is arbitrary — verify that a
# higher rating_tot really corresponds to a better route.
routes['rating_tot'] = principalDf[0].to_numpy()

In [None]:
# Remove the raw columns listed below from the routes table.
columns_to_drop = [
    'repeat_sum',
    'rating_mean',
    'ascents_count',
    'sentiment_mean',
    'recommend_sum',
    'yellow_id_mean',
    'first_ascent_mean',
]
routes = routes.drop(columns_to_drop, axis=1)
routes.head()

In [None]:
# Persist the routes table (including rating_tot) for later use.
rated_csv_path = '../data/routes_rated.csv'
routes.to_csv(rated_csv_path)

In [None]:
# Count how many routes ended up with a positive / negative combined rating.
positive_count = (routes.rating_tot > 0).sum()
negative_count = (routes.rating_tot < 0).sum()
print("Positive rating: ", positive_count)
print("Negative rating: ", negative_count)