# Setup inicial

In [None]:
# Download the zipped dataset (-q: quiet, --show-progress: keep the progress bar, -O: output file name).
!wget -q --show-progress http://cin.ufpe.br/~llm2/preprocessedAnime.csv.zip -O preprocessedAnime.zip
# Extract the archive into the working directory (-o: overwrite existing files without prompting).
!unzip -o preprocessedAnime.zip

In [None]:
!pip install wandb -qq

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

import wandb

# Load e separação dos dados 

In [2]:
# Load the preprocessed anime table from the working directory into a DataFrame.
animeList = pd.read_csv("./preprocessedAnime.csv")

In [3]:
animeList

Unnamed: 0,score,members,favorites,episodes,favorite_per_member,Action,Adventure,Cars,Comedy,Dementia,...,source_Manga,source_Music,source_Novel,source_Original,source_Other,source_Picture book,source_Radio,source_Unknown,source_Visual novel,source_Web manga
0,8.78,0.310238,0.073638,25.0,-1.763062,0,0,0,1,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8.78,0.552702,0.383489,22.0,-1.297178,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8.77,0.333283,0.163512,13.0,-1.447741,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,9.22,0.870886,1.000000,64.0,-1.078380,1,1,0,1,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8.82,0.121661,0.024933,1.0,-1.826967,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9863,7.42,0.016731,0.000679,1.0,-2.528595,1,0,0,1,0,...,,,,,,,,,,
9864,7.44,0.032753,0.002098,12.0,-2.331827,0,0,0,1,0,...,,,,,,,,,,
9865,7.53,0.032597,0.000911,12.0,-2.690232,0,0,0,1,0,...,,,,,,,,,,
9866,7.51,0.020093,0.000819,1.0,-2.526598,1,0,0,0,0,...,,,,,,,,,,


In [4]:
# Target: a single-column DataFrame holding the anime score.
y = animeList[['score']].copy()

# Features: every remaining column, with missing values replaced by 0.
# NOTE(review): X keeps the 'Unnamed: 0' column visible in the preview above —
# it looks like a saved row index; confirm it is meant to be used as a feature.
X = animeList.drop(columns=['score']).fillna(0)

Dividir 60% para treino, 20% para testes e 20% para validação

In [5]:
# First cut: 60% of the rows go to training, the remaining 40% are held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=2020
)
# Second cut: the held-out rows are halved into test and validation (20% each).
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=2020
)

In [None]:
# Row counts of the train, test and validation partitions, one per line.
print(len(X_train), len(X_test), len(X_val), sep="\n")

In [None]:
# wandb.init(entity="ds-gtsa-llm2", project="sklearn")

In [None]:
# reg = GradientBoostingRegressor(n_estimators=100, random_state=2020)
# reg.fit(X_train, y_train.values.ravel())
# y_pred = reg.predict(X_test)

# # Visualize all regression plots
# wandb.sklearn.plot_regressor(reg, X_train, X_test, y_train.values.ravel(), y_test.values.ravel(), 'Gradient Boosting')

# Modelos

Para cada modelo foi criado um dicionário que representa a configuração de um sweep do Weights & Biases (wandb). Cada sweep foi executado pelo agente do wandb, com aproximadamente 150 runs cada um.

O MSE foi utilizado como métrica de comparação entre os modelos.

O projeto do wandb pode ser encontrado neste [link](https://wandb.ai/ds-gtsa-llm2/trending-mal/)

## Random Forest

In [None]:
def train_rforest():
    """Train one Random Forest for a Weights & Biases run and log its validation MSE.

    Hyperparameters (``n_estimators``, ``max_depth``, ``min_samples_leaf``) are
    read from ``wandb.config`` after the run is initialised with the defaults
    below. The model is fitted on the notebook-level ``X_train``/``y_train``,
    evaluated on ``X_val``/``y_val``, and the resulting MSE is logged under the
    key ``"loss"``.
    """
    wandb.init(
        config={"n_estimators": 100, "max_depth": 3, "min_samples_leaf": 1},
        magic=True,
    )
    hyperparams = wandb.config

    # scikit-learn and the wandb plots expect 1-D target arrays.
    train_target = y_train.values.ravel()
    val_target = y_val.values.ravel()

    forest = RandomForestRegressor(
        n_estimators=hyperparams.n_estimators,
        max_depth=hyperparams.max_depth,
        min_samples_leaf=hyperparams.min_samples_leaf,
        random_state=2020,
    )
    forest.fit(X_train, train_target)
    val_predictions = forest.predict(X_val)

    wandb.sklearn.plot_regressor(forest, X_train, X_val, train_target, val_target, 'Random Forest')

    # "loss" is the metric name the sweep configuration minimises.
    wandb.log({"loss": mean_squared_error(y_val, val_predictions)})

In [None]:
# Integer search ranges for the Random Forest hyperparameters.
rforest_search_space = {
    "n_estimators": {"distribution": "int_uniform", "max": 300, "min": 100},
    "max_depth": {"distribution": "int_uniform", "max": 30, "min": 3},
    "min_samples_leaf": {"distribution": "int_uniform", "max": 20, "min": 1},
}

# Bayesian sweep that minimises the "loss" value logged by train_rforest.
sweep_config = {
    "name": "Random Forest",
    "method": "bayes",
    "metric": {"goal": "minimize", "name": "loss"},
    "early_terminate": {"type": "hyperband", "max_iter": 150},
    "parameters": rforest_search_space,
}
sweep_id_rforest = wandb.sweep(
    sweep_config,
    entity="ds-gtsa-llm2",
    project="trending-mal",
)

In [None]:
# A "bayes" sweep never finishes on its own, so cap the agent at the ~150 runs
# reported for each model; without `count` this cell blocks until interrupted.
wandb.agent(sweep_id_rforest, function=train_rforest, count=150)

## Gradient Boosting

In [None]:
def train_grad():
    """Train one Gradient Boosting model for a Weights & Biases run and log its validation MSE.

    Hyperparameters (``n_estimators``, ``max_depth``, ``learning_rate``) are
    read from ``wandb.config`` after the run is initialised with the defaults
    below. The model is fitted on the notebook-level ``X_train``/``y_train``,
    evaluated on ``X_val``/``y_val``, and the resulting MSE is logged under the
    key ``"loss"``.
    """
    wandb.init(
        config={"n_estimators": 100, "max_depth": 3, "learning_rate": 0.1},
        magic=True,
    )
    hyperparams = wandb.config

    # scikit-learn and the wandb plots expect 1-D target arrays.
    train_target = y_train.values.ravel()
    val_target = y_val.values.ravel()

    booster = GradientBoostingRegressor(
        n_estimators=hyperparams.n_estimators,
        max_depth=hyperparams.max_depth,
        learning_rate=hyperparams.learning_rate,
        random_state=2020,
    )
    booster.fit(X_train, train_target)
    val_predictions = booster.predict(X_val)

    wandb.sklearn.plot_regressor(booster, X_train, X_val, train_target, val_target, 'Gradient Boosting')

    # "loss" is the metric name the sweep configuration minimises.
    wandb.log({"loss": mean_squared_error(y_val, val_predictions)})

In [None]:
# Search space for Gradient Boosting: two integer ranges plus a fixed list of
# candidate learning rates.
grad_search_space = {
    "n_estimators": {"distribution": "int_uniform", "max": 200, "min": 100},
    "max_depth": {"distribution": "int_uniform", "max": 15, "min": 3},
    "learning_rate": {"values": [0.01, 0.03, 0.1]},
}

# Bayesian sweep that minimises the "loss" value logged by train_grad.
sweep_config = {
    "name": "Gradient boost sweep",
    "method": "bayes",
    "metric": {"goal": "minimize", "name": "loss"},
    "early_terminate": {"type": "hyperband", "min_iter": 150},
    "parameters": grad_search_space,
}

# NOTE(review): this sweep is registered under "trending-mal-test", while the
# Random Forest and SVM sweeps use "trending-mal" — confirm this is intentional.
sweep_id_grad = wandb.sweep(
    sweep_config,
    entity="ds-gtsa-llm2",
    project="trending-mal-test",
)

In [None]:
# A "bayes" sweep never finishes on its own, so cap the agent at the ~150 runs
# reported for each model; without `count` this cell blocks until interrupted.
wandb.agent(sweep_id_grad, function=train_grad, count=150)

## Support Vector Machine

In [None]:
def train_svm():
    """Train one linear SVR for a Weights & Biases run and log its validation MSE.

    Hyperparameters (``max_iter``, ``C``, ``loss``) are read from
    ``wandb.config`` after the run is initialised with the defaults below. The
    model is fitted on the notebook-level ``X_train``/``y_train``, evaluated on
    ``X_val``/``y_val``, and the resulting MSE is logged under the key
    ``"loss"``.
    """
    config_default = {
        "max_iter":1000,
        "C":1,
        "loss": "epsilon_insensitive"
    }
    wandb.init(config=config_default,magic=True)
    #Train model
    reg = LinearSVR(
        max_iter=wandb.config.max_iter,
        C=wandb.config.C,
        loss=wandb.config.loss,
        # Fixed seed, matching the other models in this notebook, so that two
        # runs with the same hyperparameters produce the same model.
        random_state=2020
    )

    reg.fit(X_train, y_train.values.ravel())
    y_pred = reg.predict(X_val)

    wandb.sklearn.plot_regressor(reg, X_train, X_val, y_train.values.ravel(), y_val.values.ravel(), 'SVM')

    # "loss" is the metric name the sweep configuration minimises.
    mse = mean_squared_error(y_val, y_pred)
    wandb.log({"loss": mse})

In [None]:
# Search space for the linear SVR.
# NOTE(review): "C" has a lower bound of 0 and no explicit distribution —
# confirm the sampler cannot yield 0, since LinearSVR expects a positive C.
svm_search_space = {
    "max_iter": {"distribution": "int_uniform", "max": 200, "min": 75},
    "C": {"max": 2.1, "min": 0},
    "loss": {"values": ["epsilon_insensitive", "squared_epsilon_insensitive"]},
}

# Bayesian sweep that minimises the "loss" value logged by train_svm.
sweep_config = {
    "name": "SVM sweep",
    "method": "bayes",
    "metric": {"goal": "minimize", "name": "loss"},
    "early_terminate": {"type": "hyperband", "min_iter": 150},
    "parameters": svm_search_space,
}
sweep_id_svm = wandb.sweep(
    sweep_config,
    entity="ds-gtsa-llm2",
    project="trending-mal",
)

In [None]:
# A "bayes" sweep never finishes on its own, so cap the agent at the ~150 runs
# reported for each model; without `count` this cell blocks until interrupted.
wandb.agent(sweep_id_svm, function=train_svm, count=150)

## MLP

In [None]:
def train_mlp():
    """Train one MLP regressor for a Weights & Biases run and log its validation MSE.

    Hyperparameters are read from ``wandb.config`` after the run is initialised
    with the defaults below: ``hidden_layer_sizes`` is a single integer used as
    the width of both hidden layers, alongside ``batch_size`` and
    ``learning_rate``. The model is fitted on the notebook-level
    ``X_train``/``y_train``, evaluated on ``X_val``/``y_val``, and the resulting
    MSE is logged under the key ``"loss"``.
    """
    # The default width must be a plain int: it is duplicated below to build the
    # two-layer tuple, so a tuple default would end up as a nested tuple that
    # MLPRegressor cannot use when the function runs outside a sweep.
    config_default = {
        "hidden_layer_sizes": 100,
        "batch_size": 100,
        "learning_rate": "constant"
    }
    wandb.init(config=config_default, magic=True)

    # Train model, get predictions
    layer_width = wandb.config.hidden_layer_sizes
    reg = MLPRegressor(hidden_layer_sizes=(layer_width, layer_width),
                       batch_size=wandb.config.batch_size,
                       learning_rate=wandb.config.learning_rate,
                       max_iter=1000,
                       random_state=2020)
    reg.fit(X_train, y_train.values.ravel())
    y_pred = reg.predict(X_val)

    wandb.sklearn.plot_regressor(reg, X_train, X_val, y_train.values.ravel(), y_val.values.ravel(), 'MLP')

    # "loss" is the metric name the sweep configuration minimises.
    mse = mean_squared_error(y_val, y_pred)
    wandb.log({"loss": mse})

In [None]:
# Search space for the MLP. "hidden_layer_sizes" is a single integer; train_mlp
# uses it as the width of both hidden layers.
mlp_search_space = {
    "hidden_layer_sizes": {"distribution": "int_uniform", "max": 200, "min": 75},
    "batch_size": {"distribution": "int_uniform", "max": 250, "min": 100},
    "learning_rate": {"values": ["constant", "invscaling", "adaptive"]},
}

# Bayesian sweep that minimises the "loss" value logged by train_mlp.
sweep_config = {
    "name": "MLP sweep",
    "method": "bayes",
    "metric": {"goal": "minimize", "name": "loss"},
    "early_terminate": {"type": "hyperband", "min_iter": 150},
    "parameters": mlp_search_space,
}

# NOTE(review): this sweep is registered under "trending-mal-test", while the
# Random Forest and SVM sweeps use "trending-mal" — confirm this is intentional.
sweep_id_mlp = wandb.sweep(
    sweep_config,
    entity="ds-gtsa-llm2",
    project="trending-mal-test",
)

In [None]:
# A "bayes" sweep never finishes on its own, so cap the agent at the ~150 runs
# reported for each model; without `count` this cell blocks until interrupted.
wandb.agent(sweep_id_mlp, function=train_mlp, count=150)

# Performance

Depois de rodados todos os sweeps, cada modelo gerou uma tabela com as performances, na qual são mostradas as primeiras runs ordenadas pelo MSE.

Também será mostrada a importância de cada hiperparâmetro alterado nas configs acima.

(Obs: o notebook mostrado no GitHub pode não carregar as imagens, porém rodando localmente elas são mostradas)

## Random Forest

![RF Table](assets/rf_table.png)

![RF importance](assets/rf_importance.png)

A random forest que obteve o melhor resultado teve uma loss de 0,2210, com os hiperparâmetros:

* min_samples_leaf = 1
* n_estimators = 123
* max_depth = 19

## Gradient Boosting

![Gradient table](assets/gradient_table.png)

![Gradient importance](assets/gradient_importance.png)

O gradient boosting que obteve o melhor resultado teve uma loss de 0,2177, com os hiperparâmetros:


*   learning_rate = 0.1
*   n_estimators = 130
*   max_depth = 5



## SVM

![SVM Table](assets/svm_table.png)

![SVM importance](assets/svm_importance.png)

O SVM que obteve o melhor resultado teve uma loss de 0,4036, com os hiperparâmetros:


*   loss = squared_epsilon_insensitive
*   C = 0.3076
*   max_iter = 184



## MLP

![MLP Table](assets/mlp_table.png)

![MLP importance](assets/mlp_importance.png)

A MLP que obteve o melhor resultado teve uma loss de 0,3846, com os hiperparâmetros:


*   learning_rate = adaptive
*   batch_size = 132
*   hidden_layer_sizes = (91, 91)



## Melhor resultado

Comparando os resultados acima, o modelo com o menor MSE foi o gradient boosting

In [7]:
def print_model(model):
    """Fit ``model`` on the training split and print its MSE on each split.

    Uses the notebook-level ``X_train``/``y_train``, ``X_val``/``y_val`` and
    ``X_test``/``y_test``. The model is fitted in place; nothing is returned.
    """
    model.fit(X_train, y_train.values.ravel())

    # (message template, features, targets) in the order they are reported.
    reports = (
        ("MSE treino: {}", X_train, y_train),
        ("MSE validacao: {}", X_val, y_val),
        ("MSE teste: {}", X_test, y_test),
    )
    for template, features, targets in reports:
        print(template.format(mean_squared_error(targets, model.predict(features))))

In [8]:
# Hyperparameters of the best Gradient Boosting run reported above.
best_gb_params = {
    "n_estimators": 130,
    "max_depth": 5,
    "learning_rate": 0.1,
    "random_state": 2020,
}
reg = GradientBoostingRegressor(**best_gb_params)

print_model(reg)

MSE treino: 0.14947984621604074
MSE validacao: 0.2176462912405008
MSE teste: 0.254575940062399


### Comparação com o Baseline

Para o baseline, foi utilizado o auto-sklearn. Os resultados se encontram neste [link](https://github.com/lionliu/trending-mal/blob/master/auto-sklearn.ipynb)

Ele obteve os seguintes MSE's:


*   Treino: 0.1056
*   Validação: 0.2127
*   Teste: 0.2482

Tomando o MSE do conjunto de treinamento do auto-sklearn como o menor MSE possivel para um modelo, podemos concluir que:
* A diferença do MSE de treino do gradient boosting e do auto-sklearn: 0.1494 - 0.1056 = 0.0438
    * A diferença não é tão grande, logo possui um bias baixo, ou seja, não está sofrendo underfitting
* A diferença do MSE de treino e validação do gradient boosting: 0.2176 - 0.1494 = 0.0682
    * Mesmo que a diferença (variância) não seja tão grande, é um pouco maior que a bias, podendo concluir que está sofrendo um pouco de overfit

Para enfatizar que o modelo está sofrendo overfitting, segue o gráfico que o wandb disponibiliza para alguns valores preditos no eixo y, e sua diferença ao valor original no eixo x.

![Residuals](assets/residuals.png)

É possivel observar que ainda há muitos pontos longe da diferença igual a 0, enfatizando a variância.

## Melhorando o Modelo

Visto acima que o modelo está sofrendo um pouco de overfitting, será feito o seguinte para tentar melhorá-lo:
* Utilizar outros hiperparâmetros que estão no melhor gradient boosting do auto-sklearn e que não foram alterados no nosso modelo
* Early stopping, mantendo ainda os hiperparâmetros achados acima
* Aplicar normalização nas colunas de favorite_per_member e episodes
* Aumentar a proporcao do conjunto de treinamento

### Outros hiperparâmetros

In [9]:
# Best sweep hyperparameters plus the extra settings (max_leaf_nodes,
# min_samples_leaf, tol) being tried in this section.
extended_gb_params = {
    "n_estimators": 130,
    "max_depth": 5,
    "learning_rate": 0.1,
    "max_leaf_nodes": 32,
    "min_samples_leaf": 14,
    "tol": 1e-07,
    "random_state": 2020,
}
regTest = GradientBoostingRegressor(**extended_gb_params)

print_model(regTest)

MSE treino: 0.1759966229255667
MSE validacao: 0.21905285094283677
MSE teste: 0.25028364311513196


### Early stopping

Procurar os hiperparâmetros de early stopping que dão o menor MSE para o conjunto de validação

In [7]:
def train_grad2():
    """Train the tuned Gradient Boosting model with early stopping for one W&B run.

    The tree hyperparameters are fixed (130 estimators, depth 5, learning rate
    0.1); only ``n_iter_no_change`` and ``validation_fraction`` are read from
    ``wandb.config``. The model is fitted on the notebook-level
    ``X_train``/``y_train``, evaluated on ``X_val``/``y_val``, and the resulting
    MSE is logged under the key ``"loss"``.
    """
    wandb.init(
        config={"n_iter_no_change": 4, "validation_fraction": 0.1},
        magic=True,
    )
    early_stop = wandb.config

    # scikit-learn and the wandb plots expect 1-D target arrays.
    train_target = y_train.values.ravel()
    val_target = y_val.values.ravel()

    booster = GradientBoostingRegressor(
        n_estimators=130,
        max_depth=5,
        learning_rate=0.1,
        random_state=2020,
        n_iter_no_change=early_stop.n_iter_no_change,
        validation_fraction=early_stop.validation_fraction,
    )
    booster.fit(X_train, train_target)
    val_predictions = booster.predict(X_val)

    wandb.sklearn.plot_regressor(booster, X_train, X_val, train_target, val_target, 'Gradient Boosting')

    # "loss" is the metric name the sweep configuration minimises.
    wandb.log({"loss": mean_squared_error(y_val, val_predictions)})

In [8]:
# Grid over the two early-stopping settings: 5 x 3 = 15 combinations.
early_stop_grid = {
    "n_iter_no_change": {"values": [4,5,6,7,8]},
    "validation_fraction": {"values": [0.1, 0.2, 0.3]},
}

# Grid sweep reporting the "loss" value logged by train_grad2.
sweep_config = {
    "name": "Gradient boosting early stop sweep",
    "method": "grid",
    "metric": {"goal": "minimize", "name": "loss"},
    "parameters": early_stop_grid,
}

sweep_id_grad2 = wandb.sweep(
    sweep_config,
    entity="ds-gtsa-llm2",
    project="trending-mal",
)

Create sweep with ID: 8i10mec7
Sweep URL: https://wandb.ai/ds-gtsa-llm2/trending-mal/sweeps/8i10mec7


In [None]:
# Run train_grad2 for the early-stopping sweep; the sweep uses the "grid" method
# over 5 x 3 hyperparameter values (presumably ending once all are tried).
wandb.agent(sweep_id_grad2, function=train_grad2)

O melhor resultado foi um MSE de 0.2218, com os hiperparâmetros:
* n_iter_no_change: 5
* validation_fraction: 0.1

In [10]:
# Best sweep hyperparameters combined with the early-stopping settings that
# gave the lowest validation MSE in the grid sweep.
early_stopping_gb_params = {
    "n_estimators": 130,
    "max_depth": 5,
    "learning_rate": 0.1,
    "random_state": 2020,
    "n_iter_no_change": 5,
    "validation_fraction": 0.1,
}
reg_early_stopping = GradientBoostingRegressor(**early_stopping_gb_params)
print_model(reg_early_stopping)

MSE treino: 0.1648793310832751
MSE validacao: 0.22184051336704796
MSE teste: 0.25575017567210884


### Normalização

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Single scaler instance with default settings, used by the normalization cell below.
scaler = MinMaxScaler()

In [12]:
tempAnime = animeList.copy()

# Positions of the rows that end up in the 60% training partition. Using the
# same number of rows, test_size and random_state as the split below yields the
# same shuffle, so these are exactly the rows of X_train.
train_positions, _ = train_test_split(
    np.arange(len(tempAnime)), test_size=0.4, random_state=2020
)

# Fit the min-max range on the training rows only and then apply it to every
# row, so validation/test values do not leak into the scaling parameters.
# Held-out values may therefore fall slightly outside [0, 1].
for column in ['episodes', 'favorite_per_member']:
    scaler.fit(tempAnime.iloc[train_positions][[column]])
    tempAnime[column] = scaler.transform(tempAnime[[column]])

y = pd.DataFrame()
y["score"] = tempAnime['score']
X = tempAnime.drop(['score'],axis=1)
X = X.fillna(0)

# Same 60/20/20 split as before, now on the normalized features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=2020)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=2020)

In [13]:
# Refit the tuned gradient boosting model on the normalized split and print its MSEs.
print_model(reg)

MSE treino: 0.14947984621604074
MSE validacao: 0.21752816480804607
MSE teste: 0.254575940062399


### Maior proporção de treino

In [14]:
# Rebuild target and features from the normalized table.
y = tempAnime[['score']].copy()
X = tempAnime.drop(columns=['score']).fillna(0)

# 80% of the rows go to training; the remaining 20% are halved into test and
# validation (10% each).
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=2020
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=2020
)

In [15]:
# Refit the tuned gradient boosting model on the larger training split and print its MSEs.
print_model(reg)

MSE treino: 0.16377432598725722
MSE validacao: 0.2207831156418122
MSE teste: 0.21323205794618894


## Conclusão

* Usar os hiperparâmetros do auto-sklearn apenas melhorou um pouco do MSE do teste, porém não houve melhoras no de validação e de treino.
* O early stopping não serviu para melhorar o modelo em nenhum dos conjuntos
* A normalização manteve quase a mesma performance
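* Um conjunto de treinamento maior ajudou a diminuir o erro no conjunto de teste; porém, o MSE de validação ficou ligeiramente maior e, como os conjuntos de validação e teste passaram a ter 10% dos dados cada, a comparação com os resultados anteriores não é direta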

No final, não foi possivel melhorar o modelo em relação ao overfitting. Uma opção para melhorar seria coletar mais dados para aumentar o tamanho do conjunto de treinamento, mantendo o mesmo tamanho para o conjunto de validação e teste, porém o dataset já contém boa parte dos dados do MyAnimeList, o que não seria possivel.

Também foi realizado um clustering dos dados, que se encontra no [projeto anterior](https://github.com/lionliu/trending-mal/blob/master/myAnimeListAnalysis.ipynb)