# Forecasting future corn and soybean prices: an analysis of the use of textual information to enrich time series.

### Authors:

- Ivan José dos Reis Filho
- Guilherme Bittencourt
- Guilherme Mendonça Freire
- Solange Oliveira Rezende

### Materials and Methods:

- Time Series of Soybean and Corn (2014 to 2020);
- Forecasting using attributes extracted from Kaggle (USDA);
- Target assignment for classification - Decision Tree (DT);
- Sliding window: a fixed number of consecutive months for training, followed by the next month for testing;
- Support Vector Regression (SVR).

Link: [www.kaggle.com/ainslie/usda-wasde-monthly-corn-soybean-projections](https://www.kaggle.com/datasets/ivanfilhoreis/corn-and-soybean-news-and-trend)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot
import pandas as pd
import numpy as np
import re, string
import seaborn as sns
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from math import sqrt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the time-series data.
# `squeeze=True` was dropped: the argument was removed from read_csv in
# pandas 2.0, and it never had an effect here because these files are read as
# multi-column DataFrames (columns such as 'Date' and 'Average' are used below).
corn_st = pd.read_csv('USDA_corn.csv', header=0, parse_dates=True)
soybean_st = pd.read_csv('USDA_soybean.csv', header=0, parse_dates=True)

# Load the headline texts.
corn_texts = pd.read_csv('Headlines_Corn.csv', header=0, parse_dates=True)
soybean_texts = pd.read_csv('Headlines_Soybean.csv', header=0, parse_dates=True)

# **Data Pre-processing**

This section presents the data pre-processing for corn and soybean.

- Data Visualization - corn and soybean futures prices;
- Target assignment for classification - Decision Tree (DT);
- Data Correlation - Time Series.

In [None]:
# Data visualization: monthly average futures price, one figure per commodity.
# Only the 'Average' column is drawn in each figure.

corn_average_trace = go.Scatter(
    x=corn_st['Date'],
    y=corn_st['Average'],
    mode='lines',
    name='Monthly average',
)
fig_corn = go.Figure()
fig_corn.add_trace(corn_average_trace)
fig_corn.update_layout(
    title="Corn monthly values",
    xaxis_title="Date",
    yaxis_title="dollar (US$)",
)
fig_corn.show()

soybean_average_trace = go.Scatter(
    x=soybean_st['Date'],
    y=soybean_st['Average'],
    mode='lines',
    name='Monthly average',
)
fig_soy = go.Figure()
fig_soy.add_trace(soybean_average_trace)
fig_soy.update_layout(
    title="Soybean monthly values",
    xaxis_title="Date",
    yaxis_title="dollar (US$)",
)
fig_soy.show()

In [None]:
# Target assignment for classification - Decision Tree (DT).

# Multipliers applied to the previous row's average: a row is labelled 1
# ("rises") when its average is strictly greater than previous * multiplier,
# otherwise 0 ("flat / falls").
# NOTE(review): the first multiplier is 1.005 (+0.5%) although its label is
# "Por1" and the original comment spoke of 1%, 4% and 7% - confirm which one
# is intended before changing either.
por = [1.005, 1.04, 1.07]
rot = ["Por1", "Por4", "Por7"]


def add_rise_targets(frame, multipliers, labels, value_col='Average'):
  """Return a copy of `frame` with one binary "price rose" column per multiplier.

  Parameters
  ----------
  frame : pandas.DataFrame
      Must contain `value_col`; rows are compared with the row just before.
  multipliers : sequence of float
      Factor applied to the previous value to obtain the threshold.
  labels : sequence of str
      Name of the column created for each multiplier (same order).
  value_col : str
      Column holding the value being compared.

  Returns
  -------
  pandas.DataFrame
      `frame` plus the new 0/1 columns appended on the right, in `labels`
      order. The first row has no predecessor and is always labelled 0.
  """
  current = frame[value_col].astype(float)
  previous = current.shift(1)  # NaN for the first row, so the comparison is False
  targets = {}
  for multiplier, label in zip(multipliers, labels):
    targets[label] = (current > previous * multiplier).astype('int64')
  return frame.assign(**targets)


# Target for corn: 0 (flat / falls) - 1 (rises)
corn = add_rise_targets(corn_st, por, rot)
print(corn)
print("Dimensão da base original: ", corn_st.shape)

# Target for soybean: 0 (flat / falls) - 1 (rises)
soybean = add_rise_targets(soybean_st, por, rot)
print(soybean)
print("Dimensão da base original: ", soybean_st.shape)


          Date  Area Planted  Area Harvested  Yield  ...  Median  Por1  Por4  Por7
0   2014-01-10          97.2            87.4  123.4  ...    4.26     0     0     0
1   2014-02-10          97.2            87.4  123.4  ...    4.31     1     0     0
2   2014-03-10          97.2            87.4  123.4  ...    4.53     1     1     0
3   2014-04-09          97.2            87.4  123.4  ...    4.89     1     1     1
4   2014-05-09          95.4            87.7  158.8  ...    5.03     1     0     0
..         ...           ...             ...    ...  ...     ...   ...   ...   ...
68  2019-10-10          89.9            81.8  168.4  ...    3.74     1     1     0
69  2019-11-08          89.9            81.8  167.0  ...    3.87     1     0     0
70  2019-12-10          89.9            81.8  167.0  ...    3.74     0     0     0
71  2020-01-10          89.7            81.5  168.0  ...    3.87     1     0     0
72  2020-02-11          89.7            81.5  168.0  ...    3.83     0     0     0

[73

In [None]:
#Classification - Decision Tree (DT);

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split

# "col" is the positional index of the regression target column (Average);
# "indices" receives the positional indices (within "serie_temp") of the
# columns the decision trees actually used.
bases = {"Corn" : {"serie_temp": corn, "texts": corn_texts, "col": 114, "indices": [], "Model": [],  "ts": [], "tsDT": [], "ts_tx": [], "ts_txDT": []},
         "Soybean" : {"serie_temp": soybean, "texts": soybean_texts, "col": 71, "indices": [], "Model": [],  "ts": [], "tsDT": [], "ts_tx": [], "ts_txDT": []}}

for key, item in bases.items():

  base = item['serie_temp']
  col = item['col']

  # Features are the columns 1 .. col-1 (column 0 is skipped); the label is the
  # column at col+2, which the original note identifies as "Por1".
  # NOTE(review): rows 0:72 include months that are later used as forecast
  # test points, so this selection sees future data - confirm this is acceptable.
  first_feature = 1
  X = base.iloc[0:72, first_feature:col].values
  y = base.iloc[0:72, col+2].values.ravel()

  # The split is seeded, so it is identical on every repetition; compute it once.
  X_train, _, y_train, _ = train_test_split(X, y, test_size=0.01, random_state=42)

  # Fit the tree 10 times and keep the union of the features used by any fit.
  for _ in range(0, 10):
    clf = DecisionTreeClassifier(criterion='entropy')
    clf.fit(X_train, y_train)

    for feature_pos, importance in enumerate(clf.feature_importances_):
      # Importances are indexed relative to X, which starts at column
      # `first_feature` of `base`. Store the position within `base`, because
      # the forecasting cells select columns with base.iloc[:, indices].
      # (Previously the relative index was stored, shifting every selected
      # feature one column to the left.)
      column_pos = feature_pos + first_feature
      if importance != 0 and column_pos not in item['indices']:
        item['indices'].append(column_pos)

  print(key, "Quantidade: ", len(item['indices']))
  print("Indices usados: ", item['indices'])


Corn Quantidade:  49
Indices usados:  [13, 14, 17, 19, 23, 36, 40, 41, 51, 63, 70, 76, 80, 97, 100, 111, 112, 20, 22, 73, 79, 81, 83, 90, 101, 18, 82, 102, 103, 105, 16, 25, 99, 29, 69, 24, 34, 5, 39, 72, 85, 65, 93, 108, 109, 15, 37, 45, 71]
Soybean Quantidade:  23
Indices usados:  [13, 15, 16, 19, 25, 33, 47, 65, 69, 11, 17, 28, 66, 40, 14, 39, 34, 62, 18, 26, 32, 68, 38]


# Forecasting corn and soybean prices using time series

- All attributes of the time series (TS)
- Attributes of the time series selected by the decision tree (TS-DT)



In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
  """Return the mean absolute percentage error (in percent) of `y_pred` against `y_true`.

  Both arguments are converted to NumPy arrays; each absolute error is divided
  by the corresponding true value before averaging.
  """
  actual = np.array(y_true)
  forecast = np.array(y_pred)
  relative_error = np.abs((actual - forecast) / actual)
  return relative_error.mean() * 100

class forecast_ts():
  """Sliding-window, one-step-ahead forecaster that uses time-series columns only.

  For every window start the model is fitted on `win_len` consecutive rows and
  asked to predict the target of the single row that follows the window.
  """

  def __init__(self):
    # Column order of the per-forecast frame returned by `prediction`.
    self.hdr = ['Treino', 'Teste', 'Date', 'Average', 'Forecast']

  def prediction(self, base, model, win_len, col, ind):
    """Evaluate `model` with a sliding window and return (metrics, forecasts).

    Parameters
    ----------
    base : pandas.DataFrame
        Source table; positional column 0 is stored as the forecast's Date.
    model : object
        Estimator exposing fit(X, y) and predict(X); it is refitted per window.
    win_len : int
        Number of rows in each training window.
    col : int
        Positional index of the target column.
    ind : list of int
        Positional indices of the feature columns.

    Returns
    -------
    res : dict
        'Treino' (window length), 'Teste' (number of test points) and the
        aggregated 'MAE', 'MSE', 'RMSE', 'MAPE', each rounded to 2 decimals.
        The four metrics are NaN when `base` has no row after the first window.
    df_ts : pandas.DataFrame
        One row per test point with the columns listed in `self.hdr`. Each
        'Date' cell keeps the (1, 1) array sliced from `base`.
    """
    abs_sum = 0.0
    sq_sum = 0.0
    ape_sum = 0.0
    quant = 0
    rows = []  # collected and turned into a frame once (DataFrame.append is gone in pandas 2)

    for win_ini in range(0, len(base) - win_len):
      win_end = win_ini + win_len
      quant += 1

      X_train = base.iloc[win_ini : win_end, ind].values
      y_train = base.iloc[win_ini : win_end, col].values.ravel()
      X_test = base.iloc[win_end : win_end + 1, ind].values
      y_test = np.asarray(base.iloc[win_end : win_end + 1, col].values.ravel(), dtype=float)

      regression = model.fit(X_train, y_train)
      y_hat = np.asarray(regression.predict(X_test), dtype=float).ravel()

      # Each fold holds exactly one test row, so these are that row's errors.
      error = y_test - y_hat
      abs_sum += float(np.mean(np.abs(error)))
      sq_sum += float(np.mean(error ** 2))
      ape_sum += float(np.mean(np.abs(error / y_test)) * 100)

      rows.append({'Date': base.iloc[win_end: win_end + 1, 0:1].values,
                   'Treino': win_len,
                   'Teste': quant,
                   # Index explicitly: float() on a 1-element array is deprecated in NumPy.
                   'Average': round(float(y_test[0]), 2),
                   'Forecast': round(float(y_hat[0]), 2)})

    df_ts = pd.DataFrame(rows, columns=self.hdr)

    if quant == 0:
      # No row follows the first window: nothing was evaluated.
      mae = mse = rmse = mape = float('nan')
    else:
      mae = round(abs_sum / quant, 2)
      mse = round(sq_sum / quant, 2)
      # RMSE is the square root of the mean squared error over all test points.
      # Averaging the per-point square roots (as done before) just reproduces MAE.
      rmse = round(sqrt(sq_sum / quant), 2)
      mape = round(ape_sum / quant, 2)

    res = {'Treino': win_len,
            'Teste': quant,
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            'MAPE': mape}

    return res, df_ts

In [None]:
hdr = ['Treino', 'Teste', 'MAE', 'MSE', 'RMSE', 'MAPE']
hdr_price = ['Date', 'Treino', 'Teste', 'Average', 'Forecast']

Models = {"RBF (Default)": SVR(kernel='rbf'),
          "RBF (Gama: Auto)": SVR(kernel='rbf', gamma='auto'),
          "Sigmoid Fuction (Default)": SVR(kernel='sigmoid'),
          "Sigmoid Fuction (Gama: Auto)": SVR(kernel='sigmoid', gamma='auto'),
          "Polynomial Degree 3": SVR(kernel='poly', degree = 3),
          "Polynomial Degree 5": SVR(kernel='poly', degree = 5),
          "Polynomial Degree 7": SVR(kernel='poly', degree = 7)}

# Training-window lengths (in rows) evaluated for every model.
window = [2, 3, 6, 12, 24, 36, 48]

pred = forecast_ts()


def _sweep_ts_models(forecaster, base, col, ind, models, windows, metric_cols, price_cols):
  """Evaluate every model on every window length, printing one metrics table per model.

  Returns a pair (names, forecasts): the model names in evaluation order and,
  for each of them, a frame with the forecasts of all windows stacked, with
  its columns ordered as `price_cols`.
  """
  names = []
  forecasts = []
  for clsf, model in models.items():
    print("Modelo: ", clsf)
    metric_rows = []
    forecast_frames = []
    for win in windows:
      res, ts = forecaster.prediction(base, model, win, col, ind)
      metric_rows.append(res)
      forecast_frames.append(ts)

    # Build each table once; DataFrame.append was removed in pandas 2.0.
    df = pd.DataFrame(metric_rows, columns=metric_cols)
    df_p = pd.concat(forecast_frames, ignore_index=True)[list(price_cols)]
    print(df)

    names.append(clsf)
    forecasts.append(df_p)
  return names, forecasts


# All attributes of the time series (TS)
for key, item in bases.items():
  base = item['serie_temp']
  col = item['col']
  ind = list(range(1, col))

  print("\t\n", key, "PRICE FORECAST - TIMES SERIES (TS) ALL ATTRIBUTES\n")
  # 'Model' is replaced rather than extended, so re-running the cell does not
  # accumulate duplicate model names.
  item['Model'], item['ts'] = _sweep_ts_models(pred, base, col, ind, Models, window, hdr, hdr_price)

# Attributes of the time series selected by the decision tree (TS-DT)
for key, item in bases.items():
  base = item['serie_temp']
  col = item['col']
  ind = item['indices']

  print("\t\n", key, "PRICE FORECAST - TIMES SERIES DECISION TREE ATTRIBUTES (TS/DT)\n")
  _, item['tsDT'] = _sweep_ts_models(pred, base, col, ind, Models, window, hdr, hdr_price)


	
 Corn PRICE FORECAST - TIMES SERIES (TS) ALL ATTRIBUTES

Modelo:  RBF (Default)
   Treino  Teste   MAE   MSE  RMSE  MAPE
0     2.0   71.0  0.18  0.06  0.18  4.62
1     3.0   70.0  0.20  0.08  0.20  5.20
2     6.0   67.0  0.21  0.09  0.21  5.69
3    12.0   61.0  0.15  0.04  0.15  3.93
4    24.0   49.0  0.16  0.05  0.16  4.32
5    36.0   37.0  0.15  0.04  0.15  3.94
6    48.0   25.0  0.16  0.05  0.16  4.02
Modelo:  RBF (Gama: Auto)
   Treino  Teste   MAE   MSE  RMSE  MAPE
0     2.0   71.0  0.17  0.06  0.17  4.58
1     3.0   70.0  0.19  0.07  0.19  5.08
2     6.0   67.0  0.19  0.08  0.19  5.22
3    12.0   61.0  0.15  0.04  0.15  3.90
4    24.0   49.0  0.14  0.04  0.14  3.81
5    36.0   37.0  0.13  0.04  0.13  3.49
6    48.0   25.0  0.14  0.04  0.14  3.49
Modelo:  Sigmoid Fuction (Default)
   Treino  Teste   MAE   MSE  RMSE  MAPE
0     2.0   71.0  0.18  0.06  0.18  4.62
1     3.0   70.0  0.20  0.08  0.20  5.18
2     6.0   67.0  0.21  0.10  0.21  5.70
3    12.0   61.0  0.15  0.04  0.15  3

# Combining Text (BoW) and Time Series

- Time-series combined with text (TS/Texts);
- TS-DT combined with texts (TS-DT/Texts).

In [None]:
# Preview of the headline tables: rows 1-9 of columns 4-9, then each table's shape.
for texts_frame, shape_label in ((corn_texts, "Dataset Dimension - Corn: "),
                                 (soybean_texts, "Dataset Dimenssion - Soybean: ")):
  print(texts_frame.iloc[1:10, 4:10])
  print(shape_label, texts_frame.shape)

                                               text1  ...                                              text6
1  Conab is still carrying forward all the safrin...  ...  The full-season corn crops in Sao Paulo and no...
2  The full-season corn yields are also expected ...  ...  Heavy rains have been pounding the state for s...
3  First they limited the amount of corn that cou...  ...  The corn harvest is a little more advanced at ...
4  At this point, I would say that trend line cor...  ...  For every five years a farmer in Brazil works ...
5  For Argentina, I think a 26 million ton estima...  ...  The peak storage demand for soybeans in Brazil...
6  Considering all grains, soybeans, corn, and so...  ...  In 2013, the federal government purchased mill...
7  Domestic prices for corn in Mato Grosso is bel...  ...  As a way to increase the use of refuse areas, ...
8  The total 2013/14 Brazilian corn crop is now e...  ...  Farmers in the state can choose between winter...
9  Even so, the Mat

In [None]:
class PreProcessing():
  """Turn a table of headline texts into a TF-IDF bag-of-n-grams matrix."""

  def __init__(self, min_df, max_df, ngram):
    """Store the vectorizer settings.

    min_df and max_df are forwarded to TfidfVectorizer; ngram is used as both
    ends of its ngram_range.
    """
    self.min_df = min_df
    self.max_def = max_df  # attribute name kept as-is; BoW reads it under this name
    self.ngram = ngram

  def CleanText(self, data):
    """Concatenate the cells of each row of `data` into one string per row.

    Every cell is converted with str(), passed through the digit regex and
    prefixed with a single space. Returns a list with one string per row.
    """
    # NOTE(review): this pattern removes a digit together with the following
    # character unless that character is the letter 'w' (so "5th" becomes
    # "h"). It looks like `\w`/`\W` or `[0-9]+` was intended - confirm
    # before changing, since it alters the extracted vocabulary.
    digit_pair = re.compile('[0-9][^w]')
    dataheadlines = []

    for row in range(0, len(data)):
      # NOTE(review): missing cells are stringified too (str(nan) == 'nan').
      cells = (digit_pair.sub('', str(data.iloc[row, col])) for col in range(0, len(data.columns)))
      dataheadlines.append(''.join(' ' + cell for cell in cells))

    return dataheadlines

  def remove_stopwords(self, data, domain_stopwords=None):
    """Lower-case, strip punctuation, tokenize and drop stop words and digit-only tokens.

    `domain_stopwords` is an optional collection of extra tokens to drop.
    Returns one space-joined string per input text.
    """
    extra_stopwords = () if domain_stopwords is None else domain_stopwords
    # A set gives constant-time membership tests for the NLTK list.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # The translation table does not depend on the text, so build it once.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    dataheadlines = []

    for txt in data:
      lowered = str(txt).lower().translate(strip_punctuation)
      tokens = nltk.word_tokenize(lowered)
      kept = [tok for tok in tokens
              if tok not in stop_words and tok not in extra_stopwords and not tok.isdigit()]
      dataheadlines.append(" ".join(kept).strip())

    return dataheadlines

  def BoW(self, data):
    """Return the TF-IDF matrix of `data` as a DataFrame with one column per n-gram."""
    text = self.CleanText(data)
    headlines = self.remove_stopwords(text)

    TfIdfvectorizer = TfidfVectorizer(min_df= self.min_df, max_df= self.max_def, ngram_range = (self.ngram, self.ngram))
    TrainingTFidf = TfIdfvectorizer.fit_transform(headlines)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on versions that predate it.
    if hasattr(TfIdfvectorizer, 'get_feature_names_out'):
      terms = TfIdfvectorizer.get_feature_names_out()
    else:
      terms = TfIdfvectorizer.get_feature_names()

    matrix_TFidf = pd.DataFrame(TrainingTFidf.toarray(), columns=terms)

    return matrix_TFidf

In [None]:
class forecast_ts_texts():
  """Sliding-window, one-step-ahead forecaster that joins time-series columns with text features.

  For every window the texts of the training rows plus the test row are
  vectorized together, and the resulting columns are appended to the
  time-series features before fitting.
  """

  def __init__(self):
    # Column order of the per-forecast frame returned by `prediction`.
    self.hdr = ['Date', 'Treino', 'Teste', 'Average', 'Forecast']

  def prediction(self, base_st, base_tx, model, win_len, col, ind, min, max, gram):
    """Evaluate `model` with a sliding window and return (metrics, forecasts).

    Parameters
    ----------
    base_st : pandas.DataFrame
        Time-series table; positional column 0 is stored as the forecast's Date.
    base_tx : pandas.DataFrame
        Text table; columns 4 up to (not including) the last one are vectorized.
        NOTE(review): rows are matched to `base_st` purely by position -
        confirm both tables cover the same periods in the same order.
    model : object
        Estimator exposing fit(X, y) and predict(X); it is refitted per window.
    win_len : int
        Number of rows in each training window.
    col : int
        Positional index of the target column in `base_st`.
    ind : list of int
        Positional indices of the time-series feature columns in `base_st`.
    min, max, gram
        Passed to PreProcessing as min_df, max_df and n-gram size.

    Returns
    -------
    res : dict
        'Treino' (window length), 'Teste' (number of test points) and the
        aggregated 'MAE', 'MSE', 'RMSE', 'MAPE', each rounded to 2 decimals.
        The four metrics are NaN when no test point is available.
    df_ts_tx : pandas.DataFrame
        One row per test point with the columns listed in `self.hdr`. Each
        'Date' cell keeps the (1, 1) array sliced from `base_st`.
    """
    abs_sum = 0.0
    sq_sum = 0.0
    ape_sum = 0.0
    quant = 0
    rows = []  # collected and turned into a frame once (DataFrame.append is gone in pandas 2)

    # The settings are the same for every window, so build the helper once.
    PreProc = PreProcessing(min, max, gram)

    for win_ini in range(0, len(base_tx) - win_len):
      win_end = win_ini + win_len
      quant += 1

      # Texts of the training rows and of the test row are vectorized together
      # so that both share one vocabulary.
      text_wndw = base_tx.iloc[win_ini : win_end + 1, 4:-1]
      BoW_wndw = PreProc.BoW(text_wndw)

      # Data extracted from the time series
      X_train_atb = base_st.iloc[win_ini : win_end, ind].values
      X_test_atb = base_st.iloc[win_end: win_end + 1, ind].values

      # Attributes extracted from texts: first win_len rows train, next row tests
      X_train_txt = BoW_wndw.iloc[0:win_len, :].values
      X_test_txt = BoW_wndw.iloc[win_len:win_len+1, :].values

      # Time series data and texts concatenated
      X_train = np.concatenate((X_train_atb, X_train_txt), axis=1)
      X_test = np.concatenate((X_test_atb, X_test_txt), axis=1)

      y_train = base_st.iloc[win_ini : win_end, col].values.ravel()
      y_test = np.asarray(base_st.iloc[win_end : win_end + 1, col].values.ravel(), dtype=float)

      regression = model.fit(X_train, y_train)
      y_hat = np.asarray(regression.predict(X_test), dtype=float).ravel()

      # Each fold holds exactly one test row, so these are that row's errors.
      error = y_test - y_hat
      abs_sum += float(np.mean(np.abs(error)))
      sq_sum += float(np.mean(error ** 2))
      ape_sum += float(np.mean(np.abs(error / y_test)) * 100)

      # The date comes from base_st. Reading it from the notebook-level
      # variable `base` (as before) reported the dates of whichever series
      # happened to be processed last.
      rows.append({'Date': base_st.iloc[win_end: win_end + 1, 0:1].values,
                   'Treino': win_len,
                   'Teste': quant,
                   # Index explicitly: float() on a 1-element array is deprecated in NumPy.
                   'Average': round(float(y_test[0]), 2),
                   'Forecast': round(float(y_hat[0]), 2)})

    df_ts_tx = pd.DataFrame(rows, columns=self.hdr)

    if quant == 0:
      # No row follows the first window: nothing was evaluated.
      mae = mse = rmse = mape = float('nan')
    else:
      mae = round(abs_sum / quant, 2)
      mse = round(sq_sum / quant, 2)
      # RMSE is the square root of the mean squared error over all test points.
      # Averaging the per-point square roots (as done before) just reproduces MAE.
      rmse = round(sqrt(sq_sum / quant), 2)
      mape = round(ape_sum / quant, 2)

    res = {'Treino': win_len,
            'Teste': quant,
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            'MAPE': mape}

    return res, df_ts_tx


In [None]:
hdr = ['Treino', 'Teste', 'MAE', 'MSE', 'RMSE', 'MAPE']
hdr_price = ['Date', 'Treino', 'Teste', 'Average', 'Forecast']

Models = {"RBF (Default)": SVR(kernel='rbf'),
          "RBF (Gama: Auto)": SVR(kernel='rbf', gamma='auto'),
          "Sigmoid Fuction (Default)": SVR(kernel='sigmoid'),
          "Sigmoid Fuction (Gama: Auto)": SVR(kernel='sigmoid', gamma='auto'),
          "Polynomial Degree 3": SVR(kernel='poly', degree = 3),
          "Polynomial Degree 5": SVR(kernel='poly', degree = 5),
          "Polynomial Degree 7": SVR(kernel='poly', degree = 7),
          }

# Training-window lengths (in rows). Defined here under the name the loops
# actually iterate over, so this cell no longer depends on an earlier cell
# for `window`.
window = [2, 3, 6, 12, 24, 36, 48]

# Bag-of-words settings handed to the forecaster: min_df, max_df, n-gram size.
BOW_MIN_DF = 0.05
BOW_MAX_DF = 0.95
BOW_NGRAM = 2

pred = forecast_ts_texts()


def _sweep_text_models(forecaster, serie, texts, col, ind, models, windows, metric_cols, price_cols):
  """Evaluate every model on every window length, printing one metrics table per model.

  Returns a list with, for each model in evaluation order, a frame holding
  the forecasts of all windows stacked, with columns ordered as `price_cols`.
  """
  forecasts = []
  for clsf, model in models.items():
    print("Modelo: ", clsf)
    metric_rows = []
    forecast_frames = []
    for win in windows:
      res, ts_tx = forecaster.prediction(serie, texts, model, win, col, ind,
                                         BOW_MIN_DF, BOW_MAX_DF, BOW_NGRAM)
      metric_rows.append(res)
      forecast_frames.append(ts_tx)

    # Build each table once; DataFrame.append was removed in pandas 2.0.
    df = pd.DataFrame(metric_rows, columns=metric_cols)
    df_p = pd.concat(forecast_frames, ignore_index=True)[list(price_cols)]
    print(df)

    forecasts.append(df_p)
  return forecasts


# Time series combined with text (TS/Texts)
for key, item in bases.items():
  serie = item['serie_temp']
  texts = item['texts']
  col = item['col']
  ind = list(range(1, col))

  print("\n", key, "PRICE FORECAST - TIMES SERIES ALL ATTRIBUTES AND TEXTS (TS/TEXTS) \n")
  item['ts_tx'] = _sweep_text_models(pred, serie, texts, col, ind, Models, window, hdr, hdr_price)

# TS-DT combined with texts (TS-DT/Texts)
for key, item in bases.items():
  serie = item['serie_temp']
  texts = item['texts']
  col = item['col']
  ind = item['indices']

  # The banner now names the decision-tree variant; it used to repeat the
  # "ALL ATTRIBUTES" title of the loop above.
  print("\t\n", key, "PRICE FORECAST - TIMES SERIES DECISION TREE ATTRIBUTES AND TEXTS (TS-DT/TEXTS) \n")
  item['ts_txDT'] = _sweep_text_models(pred, serie, texts, col, ind, Models, window, hdr, hdr_price)



 Corn PRICE FORECAST - TIMES SERIES ALL ATTRIBUTES AND TEXTS (TS/TEXTS) 

Modelo:  RBF (Default)
   Treino  Teste   MAE   MSE  RMSE  MAPE
0     2.0   71.0  0.18  0.06  0.18  4.62
1     3.0   70.0  0.20  0.08  0.20  5.20
2     6.0   67.0  0.21  0.09  0.21  5.69
3    12.0   61.0  0.15  0.04  0.15  3.93
4    24.0   49.0  0.16  0.05  0.16  4.32
5    36.0   37.0  0.15  0.04  0.15  3.95
6    48.0   25.0  0.16  0.05  0.16  4.03
Modelo:  RBF (Gama: Auto)
   Treino  Teste   MAE   MSE  RMSE  MAPE
0     2.0   71.0  0.17  0.06  0.17  4.49
1     3.0   70.0  0.19  0.07  0.19  4.92
2     6.0   67.0  0.18  0.07  0.18  4.92
3    12.0   61.0  0.14  0.04  0.14  3.71
4    24.0   49.0  0.13  0.03  0.13  3.62
5    36.0   37.0  0.12  0.03  0.12  3.25
6    48.0   25.0  0.13  0.04  0.13  3.24
Modelo:  Sigmoid Fuction (Default)
   Treino  Teste   MAE   MSE  RMSE  MAPE
0     2.0   71.0  0.18  0.06  0.18  4.62
1     3.0   70.0  0.20  0.08  0.20  5.18
2     6.0   67.0  0.21  0.10  0.21  5.70
3    12.0   61.0  0.1

## Results visualization

Results graph with the lowest MAPE of the TS, TS / Texts, TS-DT and TS-DT / Texts models.

In [None]:
# dct[commodity][model name][variant] -> forecast frame with scalar dates,
# where variant is one of 'ts', 'ts_tx', 'tsDT', 'ts_txDT'.
dct = {'Corn': {},
       'Soybean': {}}


def _unwrap_dates(frame):
  """Return a copy of `frame` whose Date cells are scalars instead of nested arrays."""
  # ravel()[0] handles the (1, 1) arrays stored by the forecasters and also
  # leaves an already-scalar value unchanged.
  dates = [np.asarray(cell).ravel()[0] for cell in frame.Date]
  return frame.assign(Date=dates)


def _window_trace(frame, win_len, column, name):
  """Build a line trace of `column` over Date for the rows trained with `win_len`."""
  rows = frame.Treino == win_len
  return go.Scatter(x=frame.loc[rows, 'Date'], y=frame.loc[rows, column], mode='lines', name=name)


for key, item in bases.items():
  for variant in ('ts', 'ts_tx', 'tsDT', 'ts_txDT'):
    # The result lists were filled in the same order as item['Model'].
    for model, frame in zip(item['Model'], item[variant]):
      dct[key].setdefault(model, {})[variant] = _unwrap_dates(frame)

df_ts = dct['Corn']['RBF (Gama: Auto)']['ts']
df_ts_tx = dct['Corn']['RBF (Gama: Auto)']['ts_tx']
df_tsDT = dct['Corn']['RBF (Gama: Auto)']['tsDT']
df_ts_txDT = dct['Corn']['RBF (Gama: Auto)']['ts_txDT']

fig_corn = go.Figure()
fig_corn.add_trace(_window_trace(df_ts, 48, 'Average', 'Average'))
fig_corn.add_trace(_window_trace(df_ts, 48, 'Forecast', 'TS'))
fig_corn.add_trace(_window_trace(df_ts_tx, 48, 'Forecast', 'TS/TEXT'))
fig_corn.add_trace(_window_trace(df_tsDT, 48, 'Forecast', 'TS/DT'))
fig_corn.add_trace(_window_trace(df_ts_txDT, 48, 'Forecast', 'TS/TEXT_DT'))
fig_corn.update_layout( title="Corn - Window Size 48: RBF (Gama: Auto)", xaxis_title="Date", yaxis_title="dollar (US$)")
fig_corn.show()

# The soybean figure must read the soybean results; it previously took every
# series from dct['Corn'] and therefore re-plotted corn under a soybean title.
soy_ts = dct['Soybean']['Polynomial Degree 7']['ts']
soy_ts_tx = dct['Soybean']['RBF (Gama: Auto)']['ts_tx']
soy_tsDT = dct['Soybean']['Polynomial Degree 7']['tsDT']
soy_ts_txDT = dct['Soybean']['RBF (Gama: Auto)']['ts_txDT']

fig_soy = go.Figure()
# The observed series is drawn from the window-24 rows (the longer of the two
# test spans); the forecasts use window 48 (polynomial) and 24 (RBF).
fig_soy.add_trace(_window_trace(soy_ts, 24, 'Average', 'Average'))
fig_soy.add_trace(_window_trace(soy_ts, 48, 'Forecast', 'TS Poly'))
fig_soy.add_trace(_window_trace(soy_ts_tx, 24, 'Forecast', 'TS/TEXT RBF'))
fig_soy.add_trace(_window_trace(soy_tsDT, 48, 'Forecast', 'TS/DT Poly'))
fig_soy.add_trace(_window_trace(soy_ts_txDT, 24, 'Forecast', 'TS/TEXT_DT RBF'))
fig_soy.update_layout( title="Soybean - Window Size 48 and 24: Polynomial and RBF", xaxis_title="Date", yaxis_title="dollar (US$)")
fig_soy.show()

