In [1]:
import math
import numpy as np
import pandas as pd
from itertools import permutations

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_parquet("data/raw/france.parquet")
data.dropna(axis=0, how='any', inplace=True)

position = pd.read_csv("data/raw/postesSynop.csv", sep=";")

Id = position["ID"].astype(str)
for i in range(len(Id)):
    if len(Id[i]) < 5:
        Id[i] = '0' + Id[i]

production = pd.read_parquet("data/raw/franceagrimer-rdts-surfs-multicrops.parquet")
production = production.drop(production[production["n_dep"] == "2A"].index)
production = production.drop(production[production["n_dep"] == "2B"].index)
production = production.drop(production[production["n_dep"].astype(int) > 95].index)

provinces = {7005: 80, 7015: 59, 7020: 50, 7027: 14, 7037: 76, 
             7072: 51, 7110: 29, 7117: 22, 7130: 35, 7139: 61, 
             7149: 91, 7168: 10, 7181: 54, 7190: 67, 7207: 56, 
             7222: 44, 7240: 37, 7255: 18, 7280: 21, 7299: 68, 
             7314: 17, 7335: 86, 7434: 87, 7460: 63, 7471: 43, 
             7481: 69, 7510: 33, 7535: 46, 7558: 12, 7577: 26, 
             7591: 5,  7607: 40, 7621: 65, 7627: 9,  7630: 31, 
             7643: 34, 7650: 13, 7661: 83, 7690: 6,  7747: 66, 67005: 10}

crops = production["crop"].unique()
stations = data["id_sta"].unique()

In [3]:
lr = LinearRegression()

# Double Elements Linear Regression All Try

In [4]:
def init_x(data_list, consider_name):
    for i in stations:
        if i in provinces:
            data_station = data[data["id_sta"] == i]
            year_position = 0

            for j in range(2017, 2023):
                one_data = [0 for i in range(len(data_list))]
                for k in range(year_position, len(data_station)):
                    if data_station.index[k].year == j:
                        for d in range(len(one_data)):
                            one_data[d] += data_station[consider_name[d]][k]
                    else:
                        year_position = k
                        break
                for d in range(len(one_data)):
                    data_list[d][str(provinces[i]) + "_" + str(j)] = one_data[d]


def init_y(data_list):
    for i in data_list[0]:
        n_dep0, year = i.split('_')
        r_year = production['n_dep'].map(lambda x: x == n_dep0)
        r_crop = production['crop'].map(lambda x: x == crop)
        rdt = production[r_year & r_crop]["rdt_" + year].values
        if rdt.size > 0:
            if rdt[0]:
                total_rdt[i] = rdt[0]


def init_list(data_list):
    temp_data_list = []
    temp_rdt_list = []

    for i in total_rdt:
        if i in data_list[0]:
            temp_data_list.append(np.array([data_list[j][i] for j in range(len(data_list))]))
            temp_rdt_list.append(total_rdt[i])

    temp_rdt_list = np.array(temp_rdt_list)

    return temp_data_list, temp_rdt_list


def add_degreed_data(data_list, degrees):
    temp_data_list = data_list.copy()
    
    for i in range(len(data_list)):
        temp_list = []
        for j in range(len(data_list[i])):
            for d in range(1, degrees[j] + 1):
                temp_list.append(data_list[i][j] ** d)
        temp_data_list[i] = temp_list

    return np.array(temp_data_list)


def init(data_list):
    init_y(data_list)
    return init_list(data_list)

In [5]:
def predict_n(times, data_number):
    sum_RMSE = 0
    coef = np.array([0.0 for i in range(data_number)])

    for i in range(times):
        X_train, X_test, y_train, y_test = train_test_split(data_array, rdt_array, test_size=0.2)
        lr.fit(X_train, y_train)
        coef += lr.coef_
        y_predict = lr.predict(X_test)

        RMSE = math.sqrt(((y_predict - y_test) ** 2).sum() / len(y_test))
        rRMSE = RMSE / y_test.mean()

        sum_RMSE += rRMSE

    return sum_RMSE / times, coef / times

##  rr24 + DJ_0

In [12]:
start = 1
end = 12
times = 1000
total_rain = {}
total_rad_0 = {}
consider_list = [total_rain, total_rad_0]

rRMSE_degree = {}
coeffs = {}
permutation_list = list(permutations([i for i in range(start, end)], 2))
permutation_list.insert(0, (start, start))
permutation_list.append((end - 1, end - 1))

init_x(consider_list, ["rr24", "DJ_0"])

for crop in crops:
    total_rdt = {}
    temp_data_array, rdt_array = init(consider_list)

    for permut_param in permutation_list:
        data_array = add_degreed_data(temp_data_array, permut_param)
        str_permut_param = "_" + str(permut_param)[1: -1].replace(", ", "_")
        rRMSE_degree[crop + str_permut_param], coeffs[crop + str_permut_param] = predict_n(times, sum(permut_param))

In [13]:
best_weighted_degrees = {}
temp_best_weighted_rRMSE = {}

for r in rRMSE_degree:
    crop = r.split("_")[0]
    rRMSE = r.split("_")[1:]
    if crop not in temp_best_weighted_rRMSE or temp_best_weighted_rRMSE[crop] > rRMSE_degree[r]:
        temp_best_weighted_rRMSE[crop] = rRMSE_degree[r]
        best_weighted_degrees[crop] = [int(i) for i in rRMSE]

best_weighted_rRMSE = {}
for crop in temp_best_weighted_rRMSE:
    best_weighted_rRMSE[crop + "_" + str(best_weighted_degrees[crop])[1: -1].replace(", ", "_")] = temp_best_weighted_rRMSE[crop]

In [14]:
best_weighted_rRMSE

{'OP_4_3': 0.2254742055459391,
 'CZH_4_2': 0.17578495614477627,
 'BTH_4_3': 0.18763185026265206,
 'TS_3_2': 0.19390791755679662,
 'BTP_4_3': 0.2557370932506683,
 'BDP_3_4': 0.21575690698785,
 'BDH_2_8': 0.215987861164541,
 'OH_5_3': 0.16641979431051712,
 'MA_3_1': 0.18638299163474906}

## rr24 + DJ_6

In [15]:
start = 1
end = 12
times = 1000
total_rain = {}
total_rad_6 = {}
consider_list = [total_rain, total_rad_6]

rRMSE_degree = {}
coeffs = {}
permutation_list = list(permutations([i for i in range(start, end)], 2))
permutation_list.insert(0, (start, start))
permutation_list.append((end - 1, end - 1))

init_x(consider_list, ["rr24", "DJ_6"])

for crop in crops:
    total_rdt = {}
    temp_data_array, rdt_array = init(consider_list)

    for permut_param in permutation_list:
        data_array = add_degreed_data(temp_data_array, permut_param)
        str_permut_param = "_" + str(permut_param)[1: -1].replace(", ", "_")
        rRMSE_degree[crop + str_permut_param], coeffs[crop + str_permut_param] = predict_n(times, sum(permut_param))

In [16]:
best_weighted_degrees = {}
temp_best_weighted_rRMSE = {}

for r in rRMSE_degree:
    crop = r.split("_")[0]
    rRMSE = r.split("_")[1:]
    if crop not in temp_best_weighted_rRMSE or temp_best_weighted_rRMSE[crop] > rRMSE_degree[r]:
        temp_best_weighted_rRMSE[crop] = rRMSE_degree[r]
        best_weighted_degrees[crop] = [int(i) for i in rRMSE]

best_weighted_rRMSE = {}
for crop in temp_best_weighted_rRMSE:
    best_weighted_rRMSE[crop + "_" + str(best_weighted_degrees[crop])[1: -1].replace(", ", "_")] = temp_best_weighted_rRMSE[crop]

In [17]:
best_weighted_rRMSE

{'OP_3_4': 0.2231814761784737,
 'CZH_4_1': 0.17454503413035688,
 'BTH_4_3': 0.18379178287037617,
 'TS_3_1': 0.192653433263517,
 'BTP_3_4': 0.25209740399482516,
 'BDP_2_7': 0.21519061768822242,
 'BDH_2_7': 0.213916990329565,
 'OH_3_4': 0.16484429092579345,
 'MA_3_1': 0.18746966576718754}