In [87]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import BayesianRidge

In [33]:
# globals

input_file_path = "../data/weather/prediction_targets_daily/sqyartViwauAaeAG.csv"
input_file_name = "sqyartViwauAaeAG"
head_size = 3
corn_types = ['CORN, GRAIN', 'CORN', 'CORN, SILAGE']
columns = ["date", "tavg", "tmin", "tmax", "prcp"]

In [70]:
# read weather data

df_train_whole = pd.read_csv(filepath_or_buffer="../data/intermedier/weather_averages_per_county.csv")
df_train_whole['Month'] = pd.to_datetime(df_train_whole['Month'], format='%B').dt.month
df_train_whole['Date'] = pd.to_datetime(df_train_whole['Year'].astype(str) + '-' + df_train_whole['Month'].astype(str), format='%Y-%m')
df_train_whole = df_train_whole.groupby('County')
df_train_whole = df_train_whole.apply(lambda x: x.sort_values('Date', ascending=True))
df_train_whole = df_train_whole.reset_index(drop=True)
df_train_whole["Diff"] = df_train_whole["Tmax"] - df_train_whole["Tmin"]  # new feature
df_train_whole.head(head_size)

Unnamed: 0,Year,Month,County,Tmin,Tmax,Tavg,Pcp,Date,Diff
0,1895,1,Aitkin County,-23.944444,-12.222222,-18.111111,14.224,1895-01-01,11.722222
1,1895,2,Aitkin County,-22.444444,-7.333333,-14.888889,10.668,1895-02-01,15.111111
2,1895,3,Aitkin County,-13.611111,1.611111,-6.0,4.826,1895-03-01,15.222222


In [35]:
# create normalizer for the input data

input_scaler = MinMaxScaler().fit(df_train_whole[["Tmin", "Tmax", "Tavg", "Pcp", "Diff"]].values)

In [36]:
# load target data

df_train_target = pd.read_csv(filepath_or_buffer="../data/agri/minnesota_county_yearly_agricultural_production.csv")
df_train_target = df_train_target[df_train_target['Crop'].isin(['CORN, GRAIN', 'CORN, CORN', 'SILAGE'])]
df_train_target = df_train_target.groupby(['County', 'Year'])
df_train_target = df_train_target.agg({'YIELD, MEASURED IN BU / ACRE': 'sum'}).reset_index()
df_train_target = df_train_target.groupby('County')
df_train_target = df_train_target.apply(lambda x: x.sort_values('Year', ascending=True))
df_train_target = df_train_target.reset_index(drop=True)
df_train_target["County"] = df_train_target["County"].str.capitalize()
df_train_target.head(head_size)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_train_target = df_train_target.apply(lambda x: x.sort_values('Year', ascending=True))


Unnamed: 0,County,Year,"YIELD, MEASURED IN BU / ACRE"
0,Aitkin,1950,15.0
1,Aitkin,1951,32.0
2,Aitkin,1952,32.0


In [37]:
# load technological trend modifier

with open("../data/intermedier/trend.pkl", 'rb') as f:

    df_tech_mod = pickle.load(f)

pd.Series(df_tech_mod, name="tech modifier")

Year
1866     1.749363
1867     1.749363
1868     1.749363
1869     1.749363
1870     1.749363
          ...    
2018    11.073510
2019    11.202443
2020    11.331375
2021    11.460308
2022    11.589240
Name: tech modifier, Length: 157, dtype: float64

In [38]:
# prediction target ⇒ station ⇒ county

df_mapper_p_t_stat = pd.read_csv(filepath_or_buffer="../data/intermedier/prediction_target_station_matching.csv")
county = df_mapper_p_t_stat[df_mapper_p_t_stat["Prediction Target"] == input_file_name]["County"].values[0].split(" ")
print(county)

['Chisago', 'County']


In [66]:
# assemble training data

X = df_train_whole[df_train_whole["County"] == " ".join(county)].copy()
X.drop(["Month", "Date", "County"], axis=1, inplace=True)
X.set_index("Year", inplace=True)

X.head(head_size)

Unnamed: 0_level_0,Tmin,Tmax,Tavg,Pcp,Diff
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1895,-21.166667,-10.388889,-15.777778,33.02,10.777778
1895,-19.166667,-6.444444,-12.777778,13.462,12.722222
1895,-9.666667,3.333333,-3.166667,17.272,13.0


In [65]:
# assemble training target

Y = df_train_target[df_train_target["County"] == county[0]].copy()
Y.drop("County", axis=1, inplace=True)
Y.set_index("Year", inplace=True)
Y.rename(columns={"YIELD, MEASURED IN BU / ACRE": "Yield"}, inplace=True)

for i in Y.index:
    trend_modifier = df_tech_mod[df_tech_mod.index == i].values[0]
    Y.loc[i].Yield = Y.loc[i].Yield / trend_modifier

Y.head(head_size)

Unnamed: 0_level_0,Yield
Year,Unnamed: 1_level_1
1950,9.48909
1951,13.344028
1952,18.108938


In [64]:
# find required time interval

Y_start = Y.index[0]
Y_end = Y.index[-1]

X_start = X.index[0]
X_end = X.index[-1]

print(X_start, X_end)
print(Y_start, Y_end)

X_train = X.loc[Y_start: Y_end]

1895 2023
1950 2021


In [84]:
# prepare final form of training data

X_train_windowed = []
for k in np.unique(X_train.index.values):
    tmp = input_scaler.transform(X_train.loc[k].values)
    X_train_windowed.append(tmp.T) # 5 x 12 -> featurs are in the rows, time grows with column index

# years * 5 feature * 12 months
X_train_windowed

[array([[0.19259259, 0.28465608, 0.4031746 , 0.56084656, 0.73015873,
         0.84550265, 0.88042328, 0.84126984, 0.79470899, 0.7026455 ,
         0.46455026, 0.27513228],
        [0.2       , 0.29714286, 0.35619048, 0.47333333, 0.67238095,
         0.8152381 , 0.83809524, 0.8047619 , 0.74857143, 0.65714286,
         0.38571429, 0.23428571],
        [0.1863354 , 0.28467909, 0.373706  , 0.51552795, 0.70600414,
         0.84057971, 0.86956522, 0.83229814, 0.77846791, 0.68426501,
         0.42028986, 0.24534161],
        [0.11839593, 0.05092298, 0.19987269, 0.25716104, 0.25779758,
         0.10311903, 0.15786123, 0.08402292, 0.16740929, 0.13049013,
         0.10630172, 0.1094844 ],
        [0.57352941, 0.62867647, 0.44485294, 0.34926471, 0.52941176,
         0.68014706, 0.64705882, 0.65441176, 0.59926471, 0.56617647,
         0.34558824, 0.41911765]]),
 array([[0.22962963, 0.33544974, 0.39153439, 0.62222222, 0.78624339,
         0.82539683, 0.9005291 , 0.86878307, 0.75555556, 0.67619048,


In [85]:
# prepare final form of training target

Y_train = Y.values
Y_train

array([[ 9.48909046],
       [13.34402846],
       [18.10893801],
       [16.99536766],
       [15.93056752],
       [16.26698436],
       [19.48245157],
       [18.38792692],
       [17.37796694],
       [15.86618362],
       [13.35030082],
       [15.84166025],
       [17.12822197],
       [15.56919677],
       [11.9187927 ],
       [13.9148083 ],
       [17.62409095],
       [15.11799087],
       [15.34509821],
       [13.24693471],
       [16.58222114],
       [15.75688215],
       [16.72301045],
       [16.12429823],
       [ 7.85115432],
       [11.24893513],
       [ 8.07656686],
       [16.46715633],
       [13.08270033],
       [13.56461034],
       [14.09118003],
       [13.00966277],
       [13.05982361],
       [12.9555942 ],
       [13.45330718],
       [13.93219809],
       [14.82511424],
       [15.26156109],
       [ 6.38398127],
       [12.95254234],
       [13.66669356],
       [12.24920016],
       [11.26758379],
       [ 6.87880784],
       [13.28465636],
       [12

In [89]:
# Ide jöhet bármilyen modell

ValueError: Found array with dim 3. BayesianRidge expected <= 2.