In [1]:
import numpy as np 
import pandas as pd

from imblearn.over_sampling import SMOTE, SMOTENC

import sys
import os 

sys.path.append('..')
from src import config

# Choose the config-provided root for this platform and make it the working
# directory before any data path is read.
path = config.LINUX_PATH if sys.platform == 'linux' else config.OS_PATH
os.chdir(path)

# Load the Strava training CSV; its first column becomes the DataFrame index.
data_path = config.STRAVA_TRAIN_PATH
data = pd.read_csv(data_path, index_col=0)

# Plain-text list of every column (head() truncates wide frames), then a preview.
print(data.columns.tolist())
data.head()

['name', 'distance', 'moving_time', 'total_elevation_gain', 'workout_type', 'timezone', 'achievement_count', 'kudos_count', 'manual', 'max_speed', 'average_heartrate', 'max_heartrate', 'pr_count', 'total_photo_count', 'suffer_score', 'has_photo', 'is_named', 'GMT_date', 'GMT_time', 'local_date', 'local_time', 'run_per_day', 'max_run', 'datetime', 'year', 'weekofyear', 'month', 'dayofweek', 'weekend', 'hour', 'is_uk_awake', 'run_area', 'latlng_cluster', 'city', 'average_speed_mpk']


Unnamed: 0,name,distance,moving_time,total_elevation_gain,workout_type,timezone,achievement_count,kudos_count,manual,max_speed,...,weekofyear,month,dayofweek,weekend,hour,is_uk_awake,run_area,latlng_cluster,city,average_speed_mpk
284,🐅,10.4163,45.566667,18.0,0.0,America/Boise,1,29,False,5.0,...,11,3,2,0,15,1,5.428935,1,Boise,4.374278
285,🦏,16.1663,70.25,20.0,0.0,America/Boise,5,36,False,5.3,...,11,3,2,0,8,1,16.859057,1,Boise,4.345763
286,💩,1.6812,8.166667,8.0,0.0,America/Boise,0,14,False,5.7,...,11,3,1,0,15,1,0.017541,1,Boise,4.857476
287,🦧,12.8514,42.183333,21.0,3.0,America/Boise,0,56,False,6.5,...,11,3,1,0,14,1,0.535336,1,Boise,3.282001
288,🆙,3.3194,15.3,3.0,0.0,America/Boise,0,8,False,6.9,...,11,3,1,0,14,1,0.008149,1,Boise,4.60896


In [64]:
# SMOTE demo on purely numeric features: lift the rarest workout_type to 30 rows.
# dropna() without inplace returns a new frame, so nothing is written back into a
# slice of `data` (the inplace form raised SettingWithCopyWarning).
df_example = data[
    ['distance', 'moving_time', 'total_elevation_gain', 'workout_type', 'kudos_count']
].dropna()

X = df_example.drop(['workout_type'], axis=1)
y = df_example.workout_type

print("before: \n", y.value_counts())

# Start from the observed class counts and raise only the rarest class.
# The previous hand-written dict listed key 1 twice (1: 38, 1: 30), so class 2
# was never part of the strategy, and its hard-coded counts would go stale
# whenever the data changed.
sampling_strategy = y.value_counts().to_dict()
rarest_class = min(sampling_strategy, key=sampling_strategy.get)
# max() keeps the request valid if the rarest class already has 30+ rows
# (an over-sampler cannot be asked for fewer rows than a class already has).
sampling_strategy[rarest_class] = max(sampling_strategy[rarest_class], 30)

smote = SMOTE(random_state=42, sampling_strategy=sampling_strategy)
X_over, y_over = smote.fit_resample(X, y)
print("after: \n", y_over.value_counts())
print(X_over.columns)

# # X_over

before: 
 0.0    655
3.0     93
2.0     38
1.0     12
Name: workout_type, dtype: int64
after: 
 0.0    655
3.0     93
2.0     38
1.0     30
Name: workout_type, dtype: int64
Index(['distance', 'moving_time', 'total_elevation_gain', 'kudos_count'], dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [84]:
# SMOTENC demo on mixed numeric / categorical features, resampling on workout_type.
num_cols = [
    "distance",
    "moving_time",
    "total_elevation_gain",
    "max_speed",
    "average_heartrate",
    "max_heartrate",
    "suffer_score",
    "run_area",
    "average_speed_mpk",
]

cat_cols = [
    "workout_type",
    "timezone",
    "manual",
    "dayofweek",
    "weekend",
    "is_uk_awake",
    "latlng_cluster",
    "city",
    "has_photo",
    "run_per_day",
    "max_run",
    "is_named",
]

ordinal_cols = ["hour", "pr_count"]

# Every column used in this cell; workout_type is split off below as the
# label that SMOTENC balances on.
features = num_cols + cat_cols + ordinal_cols

df = data[features]
df = df.dropna()
X = df.drop('workout_type', axis=1)
y = df.workout_type

print("before: \n", y.value_counts())

# Keep every class at its current size except the rarest, which is raised to 30.
# max() guards against asking the over-sampler for fewer rows than already exist.
class_mapping = y.value_counts().to_dict()
rarest_class = min(class_mapping, key=class_mapping.get)
class_mapping[rarest_class] = max(class_mapping[rarest_class], 30)

# Positions of the categorical/ordinal columns inside X, looked up by name so
# they stay correct if the column lists above are edited.
categorical_idx = [
    X.columns.get_loc(col)
    for col in cat_cols + ordinal_cols
    if col != 'workout_type'
]

smotenc = SMOTENC(random_state=42, sampling_strategy=class_mapping, categorical_features=categorical_idx)
X_over, y_over = smotenc.fit_resample(X, y)
print("after: \n", y_over.value_counts())


before: 
 0.0    639
3.0     86
2.0     35
1.0     10
Name: workout_type, dtype: int64
after: 
 0.0    639
3.0     86
2.0     35
1.0     30
Name: workout_type, dtype: int64


In [11]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle

from sklearn import metrics
from sklearn import preprocessing
from sklearn import impute
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack, vstack
from imblearn.over_sampling import SMOTENC

def _oversample_workout_type(df_train, num_cols, categorical_cols, minority_count, random_state=42):
    """Grow the rarest ``workout_type`` class of a training fold with SMOTENC.

    Args:
        df_train: training rows; must contain ``num_cols``, ``categorical_cols``,
            ``kudos_count`` and ``workout_type``.
        num_cols: numeric feature names (median-imputed before resampling).
        categorical_cols: categorical/ordinal feature names, excluding
            ``workout_type`` itself.
        minority_count: row count the rarest class is raised to; left untouched
            when the class is already at least that large.
        random_state: seed forwarded to SMOTENC.

    Returns:
        A new DataFrame with the resampled features, ``kudos_count`` and a
        ``workout_type`` column holding the resampled labels.
    """
    # Numeric columns are median-imputed up front; the raw frame is not modified.
    imputed_num = pd.DataFrame(
        impute.SimpleImputer(strategy="median").fit_transform(df_train[num_cols]),
        columns=num_cols,
        index=df_train.index,
    )
    # kudos_count travels with the features so synthetic rows receive a target too.
    X = pd.concat([imputed_num, df_train[categorical_cols + ["kudos_count"]]], axis=1)
    y = df_train["workout_type"]

    # Keep every class at its current size except the rarest one.
    class_mapping = y.value_counts().to_dict()
    rarest_class = min(class_mapping, key=class_mapping.get)
    class_mapping[rarest_class] = max(class_mapping[rarest_class], minority_count)

    # Resolve categorical positions by name instead of a hard-coded index list.
    categorical_idx = [X.columns.get_loc(col) for col in categorical_cols]

    smotenc = SMOTENC(
        random_state=random_state,
        sampling_strategy=class_mapping,
        categorical_features=categorical_idx,
    )
    X_over, y_over = smotenc.fit_resample(X, y)
    return X_over.assign(workout_type=y_over)


def run(fold, oversample=False, minority_count=100):
    """Train and score an XGBoost kudos_count regressor on one k-fold split.

    Args:
        fold: value of the ``kfold`` column held out for validation.
        oversample: when True, the training rows are resampled with SMOTENC so
            the rarest ``workout_type`` class has ``minority_count`` rows.
        minority_count: target size of the rarest class when oversampling.

    Returns:
        Tuple ``(rmse, model, data)`` where ``rmse`` is the validation RMSE,
        ``model`` the fitted regressor and ``data`` the list
        ``[x_train, y_train, x_valid, y_valid]``.
    """
    # read training data with folds
    df = pd.read_csv(config.STRAVA_TRAIN_KFOLD_PATH)

    # list all numeric features
    num_cols = [
        "distance",
        "moving_time",
        "total_elevation_gain",
        "max_speed",
        "average_heartrate",
        "max_heartrate",
        "suffer_score",
        "run_area",
        "average_speed_mpk",
    ]

    cat_cols = [
        "workout_type",
        "timezone",
        "manual",
        "dayofweek",
        "weekend",
        "is_uk_awake",
        "latlng_cluster",
        "city",
        "has_photo",
        "run_per_day",
        "max_run",
        "is_named",
    ]

    ordinal_cols = ["hour", "pr_count"]

    # Cast categorical/ordinal columns to strings, labelling missing values "NONE".
    # The mask comes from the original column: astype(str) turns NaN into the
    # literal "nan", so a fillna() applied afterwards would never match anything.
    for col in cat_cols + ordinal_cols:
        df[col] = df[col].astype(str).where(df[col].notna(), "NONE")

    # training data is where kfold is not equal to fold
    df_train = df[df.kfold != fold].reset_index(drop=True)
    y_train = df_train.kudos_count.values

    # validation data is where kfold = fold
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    y_valid = df_valid.kudos_count.values

    if oversample:
        smote_categoricals = [c for c in cat_cols if c != "workout_type"] + ordinal_cols
        df_train = _oversample_workout_type(
            df_train, num_cols, smote_categoricals, minority_count
        )
        # targets must be re-read: the resampled frame has extra synthetic rows
        y_train = df_train.kudos_count.values

    # pipelines for model transformation
    num_pipeline = Pipeline([("imputer", impute.SimpleImputer(strategy="median"))])

    cat_pipeline = Pipeline(
        [("cat", preprocessing.OneHotEncoder(handle_unknown="ignore"))]
    )

    # transforms columns and drops columns not specified
    x_train_num = num_pipeline.fit_transform(df_train[num_cols])
    x_train_cat = cat_pipeline.fit_transform(df_train[cat_cols + ordinal_cols])
    x_valid_num = num_pipeline.transform(df_valid[num_cols])
    x_valid_cat = cat_pipeline.transform(df_valid[cat_cols + ordinal_cols])

    # check shapes are the same
    assert (
        x_train_num.shape[0] == y_train.shape[0]
    ), "training data (numeric) and label dimension are not equal"

    assert (
        x_train_cat.shape[0] == y_train.shape[0]
    ), "training data (categorical) and label dimension are not equal"

    assert (
        x_valid_num.shape[0] == y_valid.shape[0]
    ), "validation data (numeric) and label dimension are not equal"

    assert (
        x_valid_cat.shape[0] == y_valid.shape[0]
    ), "validation data (categorical) and label dimension are not equal"

    # join numeric data and categorical data
    x_train = hstack((x_train_num, x_train_cat), format="csr")
    x_valid = hstack((x_valid_num, x_valid_cat), format="csr")

    # initialize xgboost model
    model = xgb.XGBRegressor(n_jobs=-1)

    # fit model on training data; the validation fold doubles as the
    # early-stopping set, so the reported score is somewhat optimistic.
    # NOTE(review): newer xgboost releases expect early_stopping_rounds and
    # eval_metric on the constructor rather than fit() - confirm against the
    # installed version before upgrading.
    eval_set = [(x_valid, y_valid)]
    model.fit(
        x_train,
        y_train,
        early_stopping_rounds=10,
        eval_metric="rmse",
        eval_set=eval_set,
        verbose=False,
    )

    # predict on validation data
    valid_preds = model.predict(x_valid)

    # get rmse and max error
    # NOTE(review): `squared=False` is deprecated in recent scikit-learn in
    # favour of root_mean_squared_error - confirm against the installed version.
    rmse = metrics.mean_squared_error(y_valid, valid_preds, squared=False)
    max_error = metrics.max_error(y_valid, valid_preds)
    print(f"\nFold = {fold}, rmse = {rmse}, max error = {max_error}")

    data = [x_train, y_train, x_valid, y_valid]

    return rmse, model, data

# Baseline: score each of the three folds without oversampling, then average.
scores = []
for fold_ in range(3):
    rmse = run(fold_)[0]  # only the RMSE is needed; model and matrices are discarded
    scores.append(rmse)
average_rmse = sum(scores) / len(scores)
print(f"\nAverage rmse = {average_rmse}")



Fold = 0, rmse = 10.10264849691973, max error = 57.32171630859375

Fold = 1, rmse = 10.027582425651662, max error = 67.44133377075195

Fold = 2, rmse = 10.872814989844134, max error = 60.55420684814453

Average rmse = 10.334348637471843


In [12]:
# Same three-fold evaluation, this time with SMOTENC oversampling of the training rows.
scores = []
for fold_ in range(3):
    rmse = run(fold_, oversample=True)[0]  # keep the RMSE, drop model and matrices
    scores.append(rmse)
average_rmse = sum(scores) / len(scores)
print(f"\nAverage rmse = {average_rmse}")


Fold = 0, rmse = 9.959862184726507, max error = 69.08308410644531

Fold = 1, rmse = 10.99699723694694, max error = 71.66761779785156

Fold = 2, rmse = 12.102644236194953, max error = 95.45096588134766

Average rmse = 11.019834552622802
