Tudo da vida segue uma distribuição gaussiana.
-V.C.A Marcelo

### Libraries

In [None]:
import pandas as pd
import numpy as np
from custom_metrics import regression_metrics

### Functions

In [None]:
def order_by_x(x, y):
    """Reorder two paired sequences so that the ``x`` values are ascending.

    The i-th element of ``x`` stays paired with the i-th element of ``y``.
    Pairs with equal ``x`` keep their original relative order, and ``y``
    values are never compared with each other.

    Parameters
    ----------
    x : iterable
        Values to sort by.
    y : iterable
        Values carried along with ``x``. If the lengths differ, the extra
        elements of the longer input are ignored (``zip`` semantics).

    Returns
    -------
    tuple of (list, list)
        ``(x_sorted, y_reordered)``; two empty lists for empty input.
    """
    # developed by segfault
    # Sort the arguments themselves (not notebook globals), keyed on x only.
    pairs = sorted(zip(x, y), key=lambda pair: pair[0])
    if not pairs:
        # zip(*[]) yields nothing, so the unpacking below would fail.
        return ([], [])

    xt, yt = zip(*pairs)
    return (list(xt), list(yt))

### Reading dataframes

In [None]:
# Read the CSV and replace its 'date' column with parsed datetimes in one chain.
df_gold = (
    pd.read_csv('../data/d_kilo_gold_price_in_currency.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], infer_datetime_format=True))
)

### Configuration

In [None]:
from matplotlib.pyplot import figure

# Colours shared by every model section: observed test values vs. predictions.
COLOR_DATA_TEST = "blue"
COLOR_DATA_PRED = "red"

# Open a 16x12 inch figure at 80 dpi with a white face and black edge.
figure(figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')

In [None]:
from sklearn.model_selection import train_test_split

# Feature: the row position as a single-column matrix. Target: the 'China' column.
data_x = np.arange(len(df_gold)).reshape(-1, 1)
data_y = np.array(df_gold['China'])

# The data must not be shuffled, otherwise it stops being a time series;
# the test split is 20% of the rows.
x_train, x_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=0.20, shuffle=False
)

print("Train length:", len(x_train))
print("Test  length:", len(x_test))

### Visualization


In [None]:
import matplotlib.pyplot as plt

# Plot the whole target series against the row index so the figure is
# readable on its own: labelled axes, a title, and no Line2D repr as output.
plt.plot(data_x, data_y)
plt.xlabel('Observation index')
plt.ylabel('China')
plt.title("Full 'China' series")
plt.show()

## Linear Regression

[source](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#sphx-glr-auto-examples-linear-model-plot-ols-py)

In [None]:
from sklearn import linear_model
import matplotlib.pyplot as plt

# Fit on the 1-D target, exactly like the other model sections do, so that
# y_pred has the same shape as y_test when both are handed to
# regression_metrics (an (n, 1) prediction against an (n,) target is a
# broadcasting hazard).
model = linear_model.LinearRegression()
model.fit(x_train, y_train)

# With a single 1-D target, coef_ holds one entry per feature; x has one column.
print("Coefficient: ", model.coef_[0])

y_pred = model.predict(x_test)

plt.scatter(x_test, y_test, color=COLOR_DATA_TEST, label='test data')
plt.scatter(x_test, y_pred, color=COLOR_DATA_PRED, label='prediction')
plt.title('Linear Regression')
plt.legend()
plt.show()

regression_metrics(y_test, y_pred, verbose=True)

## Polynomial Regression

[source](https://towardsdatascience.com/polynomial-regression-with-scikit-learn-what-you-should-know-bed9d3296f2)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# NOTE(review): the degree is selected by scoring on the test split, so the
# metrics reported at the end are optimistic. Consider selecting the degree
# on a validation slice taken from the training data instead.

best_degree = None
best_r2 = float('-inf')

# Try every polynomial degree from 2 to 15 and remember the best-scoring one.
for degree in range(2, 16):

    poly_reg = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    poly_reg.fit(x_train, y_train)
    y_pred = poly_reg.predict(x_test)

    # Assumes regression_metrics returns the R^2 score when called without
    # verbose -- TODO confirm against custom_metrics.
    r2 = regression_metrics(y_test, y_pred)

    if r2 > best_r2:
        best_r2 = r2
        best_degree = degree
        print('Best r2 so far = ', best_r2, '\t@ degree = ', best_degree)

# If no score ever compared greater than -inf (e.g. every score was NaN),
# fail with a clear message instead of passing degree=None to the pipeline.
if best_degree is None:
    raise ValueError('No polynomial degree produced a comparable r2 score')

# Refit with the selected degree and predict the test split.
poly_reg = make_pipeline(PolynomialFeatures(best_degree), LinearRegression())
poly_reg.fit(x_train, y_train)
y_pred = poly_reg.predict(x_test)

plt.figure()
plt.scatter(x_test, y_test, color=COLOR_DATA_TEST)
plt.scatter(x_test, y_pred, color=COLOR_DATA_PRED)
# The title must be a single string; a tuple would be rendered as its repr.
plt.title(f'Polynomial Regression (degree {best_degree})')
plt.show()

regression_metrics(y_test, y_pred, verbose=True)

## Support Vector Regression

[source](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html)

In [None]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 14 log-spaced values (1e-12 .. 1e1) for C and for
# epsilon, two gamma strategies and two kernels.
parameters = { 'C'       : np.logspace(-12, 1, num=14, base=10.0), 
               'epsilon' : np.logspace(-12, 1, num=14, base=10.0),
               'gamma'   : ['scale', 'auto'],
               'kernel'  : ['rbf', 'sigmoid'] }

# Scale the feature, then run a 5-fold grid search scored by r2 as the final
# pipeline step; refit=True retrains the best candidate on all training data.
# https://stackoverflow.com/a/43366811/2679529
# NOTE(review): cv=5 is not time-ordered; for a time series consider
# sklearn.model_selection.TimeSeriesSplit -- confirm intent.
clf = make_pipeline(StandardScaler(), 
                    GridSearchCV(SVR(),
                                 param_grid=parameters,
                                 cv=5,
                                 n_jobs=4,
                                 refit=True,
                                 scoring='r2',
                                 verbose=1))
clf.fit(x_train, y_train)

# Read the winning combination from the fitted search object.
# get_params()['gridsearchcv__estimator__*'] would only return the settings of
# the unfitted template SVR(), i.e. its defaults, not the selected values.
best_params = clf.named_steps['gridsearchcv'].best_params_
best_c = best_params['C']
best_epsilon = best_params['epsilon']
best_gamma = best_params['gamma']
best_kernel = best_params['kernel']
print('C =', best_c, 'epsilon =', best_epsilon, 'gamma = ', best_gamma, 'kernel =', best_kernel)

y_pred = clf.predict(x_test)

r2 = regression_metrics(y_test, y_pred)

plt.figure()
plt.scatter(x_test, y_test, color=COLOR_DATA_TEST)
plt.scatter(x_test, y_pred, color=COLOR_DATA_PRED)
# The title must be a single string; a tuple would be rendered as its repr.
plt.title(f'Support Vector Regression (C={best_c}, epsilon={best_epsilon}, '
          f'gamma={best_gamma}, kernel={best_kernel})')

plt.show()

regression_metrics(y_test, y_pred, verbose=True)

## Multi-layer Perceptron Regressor

[source](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor)

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: four activations, 19 two-layer sizes from (100, 100)
# to (1000, 1000), and three learning-rate schedules.
# NOTE(review): per the scikit-learn docs `learning_rate` is only used with
# solver='sgd'; the estimator below keeps the default solver, so this axis
# probably only multiplies the number of fits -- confirm.
parameters = { 'activation'         : ['identity', 'logistic', 'tanh', 'relu'], 
               'hidden_layer_sizes' : list((i,i) for i in np.linspace(100,1000,19,dtype='int64')),
               'learning_rate'      : ['constant', 'invscaling', 'adaptive'] }

# Scale the feature, then run a 5-fold grid search scored by r2 as the final
# pipeline step. A fixed random_state makes weight initialisation, and thus
# the selected model, repeatable between runs.
# https://stackoverflow.com/a/43366811/2679529
clf = make_pipeline(StandardScaler(), 
                    GridSearchCV(MLPRegressor(random_state=0),
                                 param_grid=parameters,
                                 cv=5,
                                 n_jobs=4,
                                 refit=True,
                                 scoring='r2',
                                 verbose=1))

clf.fit(x_train, y_train)

# Read the winning combination from the fitted search object.
# get_params()['gridsearchcv__estimator__*'] would only return the settings of
# the unfitted template estimator, not the values selected by the search.
best_params = clf.named_steps['gridsearchcv'].best_params_
best_activation = best_params['activation']
best_hidden_layer_sizes = best_params['hidden_layer_sizes']
best_learning_rate = best_params['learning_rate']
print('activation =', best_activation, 
      'hidden_layer_sizes =', best_hidden_layer_sizes, 
      'learning_rate =', best_learning_rate)

y_pred = clf.predict(x_test)

r2 = regression_metrics(y_test, y_pred)

plt.scatter(x_test, y_test, color=COLOR_DATA_TEST)
plt.scatter(x_test, y_pred, color=COLOR_DATA_PRED)
# The title must be a single string; a tuple would be rendered as its repr.
plt.title(f'Multi-layer Perceptron Regressor (activation={best_activation}, '
          f'hidden_layer_sizes={best_hidden_layer_sizes}, '
          f'learning_rate={best_learning_rate})')
plt.show()

regression_metrics(y_test, y_pred, verbose=True)

## Prophet

[source](https://facebook.github.io/prophet/docs/quick_start.html)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# NOTE(review): newer Prophet releases are published as `prophet` rather than
# `fbprophet` -- confirm which one the environment provides.
from fbprophet import Prophet

# Reload the data so this section does not depend on earlier cells' state.
df_gold = pd.read_csv('../data/d_kilo_gold_price_in_currency.csv')
df_gold['date'] = pd.to_datetime(df_gold['date'], infer_datetime_format=True)

# Prophet works on real timestamps, so x is the 'date' column here
# (the earlier sections used the row index instead).
data_x = np.array(df_gold['date'])
data_y = np.array(df_gold['China'])

# Unshuffled 50/50 split. This overwrites the x_train/x_test/y_train/y_test
# created for the earlier sections.
x_train, x_test, y_train, y_test = train_test_split(data_x, 
                                                    data_y, 
                                                    shuffle=False,
                                                    test_size=0.50)
print("Train length:", len(x_train))
print("Test  length:", len(x_test))

# Prophet expects a frame with a 'ds' (timestamp) and a 'y' (value) column.
df = pd.DataFrame({'ds': x_train, 'y': y_train})
m = Prophet()
m.fit(df)

# Extend the training dates by 365 periods and predict over the whole range.
# NOTE(review): the forecast is not compared against x_test/y_test here.
future = m.make_future_dataframe(periods=365)
# A bare `.tail()` in the middle of a cell is evaluated and discarded; print
# explicitly so the previews are actually shown.
print(future.tail())
forecast = m.predict(future)
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())
fig1 = m.plot(forecast)

In [None]:
import pystan

# Minimal Stan program: one real parameter y with a standard-normal model.
model_code = 'parameters {real y;} model {y ~ normal(0,1);}'

# Building the StanModel will take a minute.
model = pystan.StanModel(model_code=model_code)

# Sample with a single job and keep only the draws for y.
y = (
    model
    .sampling(n_jobs=1)
    .extract()['y']
)

# The sample mean should be close to 0.
y.mean()