# 1. First training models (benchmark and tree models)

In this notebook we train the models using linear regression, a decision tree and a random forest.

In [20]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import time
import config

In [21]:

# Load the pre-split training and test sets from their CSV files.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (r'data/train4.csv', r'data/test4.csv')
)

In [22]:
# Distribution of the target ratings in the training set.
X_train['rating_y'].value_counts()

8.0     1481647
7.0     1237440
9.0     1128834
10.0     860222
6.0      573818
5.0      254537
4.0       93895
3.0       37271
2.0       20883
1.0       14968
Name: rating_y, dtype: int64

In [23]:
# Separate the target column (rating_y) from the feature matrices.
y_train, y_test = X_train['rating_y'], X_test['rating_y']
X_train = X_train.drop(columns=['rating_y'])
X_test = X_test.drop(columns=['rating_y'])

In [24]:
# Remember the feature names now: the scaling step below replaces the
# DataFrames with plain NumPy arrays, which carry no column labels.
final_features = X_train.columns
X_train.head(5)

Unnamed: 0,anime_id,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,rating_x,members,episodes,user_id
0,437,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,8.23,117565,1,28669
1,4181,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,9.06,456749,24,52535
2,10080,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,8.12,194300,12,48664
3,9936,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,6.69,104182,12,32860
4,329,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,8.38,105044,26,46146


In [25]:
# Standardize the data: the scaler is fitted on the training set only and
# the same transformation is then applied to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [26]:
# Inspect the standardized test matrix (the array returned by scaler.transform).
X_test

array([[ 1.46100111, -0.85165055, -0.52400848, ..., -0.8233018 ,
        -0.50431488, -1.34037454],
       [-0.90677923, -0.85165055, -0.52400848, ...,  1.27810014,
        -0.13499714,  0.74082814],
       [ 1.6240323 , -0.85165055, -0.52400848, ..., -0.64763193,
        -0.1634062 , -1.25319378],
       ...,
       [ 1.46122629,  1.17419052,  1.90836606, ...,  1.85072692,
         0.14909343, -1.29411924],
       [ 0.32158423,  1.17419052, -0.52400848, ...,  0.73186208,
         0.14909343, -0.82875863],
       [-0.93819201, -0.85165055, -0.52400848, ..., -0.63953409,
        -0.50431488, -0.00648989]])

# Linear Regression (Benchmark model)
### Linear regression is the proposed benchmark model. It has no hyperparameters that must be tuned by the analyst, so it is a good model to start with. Three metrics are reported below for this and the rest of the models: MSE, RMSE and MAE. The MAE is the one we will use later to evaluate the models, given its simplicity.

In [27]:
# Linear regression (benchmark model).
# Only the fit is timed, so the reported duration is comparable with the
# decision tree and random forest cells, which also time just the training.
start = time.time()
reg = LinearRegression().fit(X_train, y_train)
end = time.time()

# In-sample predictions, used by the training metrics printed in the next cell.
pred = reg.predict(X_train)

In [28]:
# Training-set error metrics for the benchmark model, plus the elapsed time
# measured between the `start` and `end` timestamps.
train_mse = mean_squared_error(y_train, pred)
elapsed = end - start

print('train mse: {}'.format(train_mse))
print('train rmse: {}'.format(np.sqrt(train_mse)))
print('train mae: {}'.format(mean_absolute_error(y_train, pred)))
print('time is: {} seconds or {} minutes'.format(
    round(elapsed), round(elapsed / 60)))
print()

train mse: 2.0545182182380306
train rmse: 1.4333590681465795
train mae: 1.1105435107937744
time is: 38 seconds or 1 minutes



In [29]:
# Out-of-sample evaluation of the benchmark model.
pred = reg.predict(X_test)
test_mse = mean_squared_error(y_test, pred)

print('test mse: {}'.format(test_mse))
print('test rmse: {}'.format(np.sqrt(test_mse)))
print('test mae: {}'.format(mean_absolute_error(y_test, pred)))
print()

test mse: 2.0519569536783964
test rmse: 1.4324653411787653
test mae: 1.1096444857564762



# Decision Tree

In [30]:
# Train a depth-limited regression tree, recording timestamps around the
# fit; the elapsed time is printed together with the training metrics.
start = time.time()
dt = DecisionTreeRegressor(max_depth=10, random_state=42).fit(X_train, y_train)
end = time.time()

In [31]:
# In-sample predictions of the decision tree. `pred` is reassigned by later
# cells, so run this cell right before the training-metrics cell below.
pred=dt.predict(X_train)

In [32]:
# Training-set error metrics for the decision tree, plus the fit duration.
train_mse = mean_squared_error(y_train, pred)
elapsed = end - start

print('train mse: {}'.format(train_mse))
print('train rmse: {}'.format(np.sqrt(train_mse)))
print('train mae: {}'.format(mean_absolute_error(y_train, pred)))
print('time is: {} seconds or {} minutes'.format(
    round(elapsed), round(elapsed / 60)))
print()

train mse: 2.04868706883833
train rmse: 1.4313235374429953
train mae: 1.1089502103258633
time is: 159 seconds or 3 minutes



In [33]:
# Out-of-sample evaluation of the decision tree.
pred = dt.predict(X_test)
test_mse = mean_squared_error(y_test, pred)

print('test mse: {}'.format(test_mse))
print('test rmse: {}'.format(np.sqrt(test_mse)))
print('test mae: {}'.format(mean_absolute_error(y_test, pred)))
print()

test mse: 2.0499921835928254
test rmse: 1.431779376717246
test mae: 1.1090337509019268



# Random Forest

In [34]:

# Train a random forest of 20 depth-limited trees with n_jobs=-1, recording
# timestamps around the fit for the duration reported with the metrics.
start = time.time()
rf = RandomForestRegressor(
    n_estimators=20,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
end = time.time()

In [35]:
# In-sample predictions of the random forest. `pred` is reassigned by later
# cells, so run this cell right before the training-metrics cell below.
pred=rf.predict(X_train)

In [36]:
# Training-set error metrics for the random forest, plus the fit duration.
train_mse = mean_squared_error(y_train, pred)
elapsed = end - start

print('train mse: {}'.format(train_mse))
print('train rmse: {}'.format(np.sqrt(train_mse)))
print('train mae: {}'.format(mean_absolute_error(y_train, pred)))
print('time is: {} seconds or {} minutes'.format(
    round(elapsed), round(elapsed / 60)))
print()

train mse: 2.0459348930162427
train rmse: 1.4303618049347664
train mae: 1.1083751716553152
time is: 703 seconds or 12 minutes



In [37]:
# Out-of-sample predictions of the random forest, consumed by the test-metrics
# cell that follows.
pred = rf.predict(X_test)

In [38]:
# Test-set error metrics for the random forest.
test_mse = mean_squared_error(y_test, pred)

print('test mse: {}'.format(test_mse))
print('test rmse: {}'.format(np.sqrt(test_mse)))
print('test mae: {}'.format(mean_absolute_error(y_test, pred)))
print()

test mse: 2.0473905420604557
test rmse: 1.4308705539148032
test mae: 1.1084647611047809



## Feature Importance

In [39]:
# Print each feature name next to its random-forest importance (4 decimals),
# in the original column order.
importances = rf.feature_importances_
for feature_name, importance in zip(final_features, importances):
    print(feature_name, round(importance, 4))

anime_id 0.0015
Action 0.0001
Adventure 0.0001
Cars 0.0
Comedy 0.0001
Dementia 0.0001
Demons 0.0001
Drama 0.0001
Ecchi 0.0001
Fantasy 0.0001
Game 0.0001
Harem 0.0001
Hentai 0.0001
Historical 0.0
Horror 0.0001
Josei 0.0
Kids 0.0001
Magic 0.0
MartialArts 0.0
Mecha 0.0001
Military 0.0001
Music 0.0
Mystery 0.0
Parody 0.0001
Police 0.0
Psychological 0.0
Romance 0.0001
Samurai 0.0
School 0.0001
Sci-Fi 0.0001
Seinen 0.0
Shoujo 0.0
ShoujoAi 0.0
Shounen 0.0001
ShounenAi 0.0
SliceofLife 0.0
Space 0.0
Sports 0.0
SuperPower 0.0
Supernatural 0.0001
Thriller 0.0
Vampire 0.0
Yaoi 0.0
Yuri 0.0
type_Movie 0.0001
type_Music 0.0
type_ONA 0.0
type_OVA 0.0001
type_Special 0.0001
type_TV 0.0001
rating_x 0.9806
members 0.0022
episodes 0.0013
user_id 0.012
