In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the two Excel workbooks (player table, then metrics table) in one pass.
jogadores, metricas_nba = (
    pd.read_excel(workbook)
    for workbook in ("JOGADORES_NBA.xlsx", "METRICAS_NBA.xlsx")
)

In [95]:
# useful functions, keep them all here

def quantile_replacement(dataset, identifier, lower, upper, method):
    """Return a copy of ``dataset`` with out-of-range values of one column replaced.

    Values of ``dataset[identifier]`` lying strictly between the ``lower`` and
    ``upper`` quantiles are treated as inliers. Every other row -- values at
    or beyond either quantile, and missing values, which fail both
    comparisons -- is overwritten with a single statistic computed over the
    inliers only.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; it is deep-copied and never modified.
    identifier : column label
        The column to process.
    lower, upper : float
        Quantile levels forwarded to ``Series.quantile``.
    method : str
        Name of a no-argument ``Series`` reduction that yields a scalar,
        such as ``"mean"`` or ``"median"``.

    Returns
    -------
    pandas.DataFrame
        The modified copy. When no value lies strictly between the two
        quantiles (e.g. a constant or empty column) the copy is returned
        unchanged, because there is nothing to derive a replacement from.

    Raises
    ------
    ValueError
        If ``method`` does not name a callable ``Series`` attribute, or if
        calling it does not produce a scalar.
    """
    dataset = dataset.copy(deep=True)
    column = dataset[identifier]

    lower_quantile = column.quantile(lower)
    upper_quantile = column.quantile(upper)

    # Bounds are exclusive: a value equal to either quantile is not an inlier.
    inlier_mask = (column > lower_quantile) & (column < upper_quantile)
    inliers = column[inlier_mask]

    # Validate the reducer up front so an unknown name or a non-callable
    # attribute (e.g. "size") is reported uniformly as ValueError.
    reducer = getattr(inliers, method, None)
    if not callable(reducer):
        raise ValueError("Invalid method: {}".format(method))

    # Without inliers a reduction such as mean() yields NaN, which would
    # wipe out the whole column; leave the data untouched instead.
    if inliers.empty:
        return dataset

    replacement = reducer()
    # A non-scalar result (e.g. from "mode") would be index-aligned on
    # assignment and silently corrupt the column.
    if np.ndim(replacement) != 0:
        raise ValueError("Invalid method: {}".format(method))

    dataset.loc[~inlier_mask, identifier] = replacement
    return dataset

In [96]:
# Pre-processing: join the tables, tame extreme values, label missing text, bin ages.

# Default (inner) join of the two tables on their shared "Player" column.
dataset = jogadores.merge(metricas_nba, on="Player")

# Replace values outside the 0.3%-99.7% quantile band with the mean of the
# remaining values, then report min / max / mean of each processed column.
for extremised_feature in ("height", "weight", "TS%", "FTr"):
    dataset = quantile_replacement(
        dataset, extremised_feature, 0.003, 0.997, method="mean"
    )
    feature_values = dataset[extremised_feature]
    print(
        extremised_feature,
        feature_values.min(),
        feature_values.max(),
        feature_values.mean(),
    )

# Mark missing entries in these columns with the literal string "N/A" and
# report how many rows now carry that marker.
for missing_data_feature in ("birth_city", "birth_state", "collage"):
    missing_rows = dataset[missing_data_feature].isnull()
    dataset.loc[missing_rows, missing_data_feature] = "N/A"
    marked_rows = dataset[dataset[missing_data_feature] == "N/A"]
    message = "Set {} missing entries from column '{}' as N/A."
    print(message.format(len(marked_rows), missing_data_feature))

# Bin "Age" into three equal-frequency groups and overwrite the column.
age_labels = ["Junior", "Middle", "Senior"]
dataset["Age"] = pd.qcut(dataset["Age"], q=3, labels=age_labels)

height 173.0 226.0 199.43174187208254
weight 63.0 133.0 95.45299145299145
TS% 0.046 0.85 0.4956298891315996
FTr 0.01 1083.0 1.4394350268170109
Set 1646 missing entries from column 'birth_city' as N/A.
Set 1689 missing entries from column 'birth_state' as N/A.
Set 2223 missing entries from column 'collage' as N/A.
