# In [None]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import palettable
import pandas as pd
import seaborn as sns


# Load the raw observations and discard every row containing a missing value.
data = pd.read_parquet("data/raw/france.parquet")
data = data.dropna(axis=0, how="any")

# Park the original index in a temporary "time" column, then re-index the
# rows with a plain 0..n-1 "id".
# NOTE(review): the index values must expose .year / .month (presumably
# timestamps) -- confirm against the parquet schema.
data["time"] = data.index
data["id"] = list(range(len(data)))
data = data.set_index("id")

# Split each index value into separate year and month columns.
all_years = [stamp.year for stamp in data["time"]]
all_month = [stamp.month for stamp in data["time"]]
data["year"] = all_years
data["month"] = all_month
data = data.drop(columns=["time"])

position = pd.read_csv("data/raw/postesSynop.csv", sep=";")

# Station identifiers as text. Any identifier shorter than five characters
# receives exactly one leading "0"; longer ones are left untouched.
Id = position["ID"].astype(str)
Id = Id.where(Id.str.len() >= 5, "0" + Id)

production = pd.read_parquet("data/raw/franceagrimer-rdts-surfs-multicrops.parquet")

# Remove the rows whose department code is "2A" or "2B" first: those codes
# cannot be converted to int for the numeric comparison that follows.
non_numeric_dep = production["n_dep"].isin(["2A", "2B"])
production = production.drop(production.index[non_numeric_dep])

# Then remove every department whose numeric code is above 95.
above_95 = production["n_dep"].astype(int) > 95
production = production.drop(production.index[above_95])

# Maps a station id (values of the "id_sta" column) to the number stored in
# the "province" column below.
# NOTE(review): the values 91 (stations 7149, 7761) and 10 (stations 7168,
# 67005) each appear twice -- confirm that this is intentional.
provinces = {
    7005: 80, 7015: 59, 7020: 50, 7027: 14, 7037: 76,
    7072: 51, 7110: 29, 7117: 22, 7130: 35, 7139: 61,
    7149: 91, 7168: 10, 7181: 54, 7190: 67, 7207: 56,
    7222: 44, 7240: 37, 7255: 18, 7280: 21, 7299: 68,
    7314: 17, 7335: 86, 7434: 87, 7460: 63, 7471: 43,
    7481: 69, 7510: 33, 7535: 46, 7558: 12, 7577: 26,
    7591: 5, 7607: 40, 7621: 65, 7627: 9, 7630: 31,
    7643: 34, 7650: 13, 7661: 83, 7690: 6, 7747: 66,
    7761: 91, 67005: 10,
}

# Discard the observations of every station that has no entry in the mapping.
stations = data["id_sta"].unique()
unwanted_stations = [station for station in stations if station not in provinces]
from_unwanted_station = data["id_sta"].isin(unwanted_stations)
data = data.drop(index=data.index[from_unwanted_station])

# Replace the station id column by the mapped province number.
temp_province = [provinces[station] for station in data["id_sta"]]
data["province"] = temp_province
data = data.drop(columns=["id_sta"])

# Distinct values the rest of the script iterates over.
# Note: from here on ``provinces`` is the array of province numbers present
# in ``data``; it no longer refers to the station -> province dict.
years = data["year"].unique()
provinces = data["province"].unique()
crops = production["crop"].unique()
n_deps = production["n_dep"].unique()

# Month numbers associated with each crop code. Sequences that start at 9
# wrap around the end of the calendar year (9..12 followed by 1..7).
working_month = {
    "OP": list(range(3, 9)),
    "CZH": [*range(9, 13), *range(1, 8)],
    "BTH": [*range(9, 13), *range(1, 8)],
    "TS": list(range(3, 12)),
    "BTP": list(range(2, 9)),
    "BDP": list(range(2, 9)),
    "BDH": [*range(9, 13), *range(1, 8)],
    "OH": [*range(9, 13), *range(1, 8)],
    "MA": list(range(4, 12)),
}

# Default figure size for every seaborn plot produced below.
sns.set(rc={"figure.figsize": (11, 9)})


def read_in_Y(crop, consider_part):
    """Collect the per-department, per-year values of one crop.

    For every department in the module-level ``n_deps`` and every year in
    the module-level ``years``, column ``"<consider_part>_<year>"`` is read
    from the first row of the module-level ``production`` table that matches
    both ``crop`` and the department. Usable values are written into the
    module-level dict ``crop_Y_year`` under the key
    ``"<crop>_<int(n_dep)>_<year>"``.

    Missing values (NaN / None) and zeros are skipped. NaN is truthy in
    Python, so a plain truthiness test would let missing values through.

    Args:
        crop: Value compared against the ``crop`` column of ``production``.
        consider_part: Column-name prefix, e.g. ``"rdt"``.

    Returns:
        None. Results are stored in ``crop_Y_year``.

    Raises:
        KeyError: If ``production`` has no ``"<consider_part>_<year>"``
            column for one of the requested years.
    """
    crop_value = production[production["crop"] == crop]

    for n in n_deps:
        crop_n_value = crop_value[crop_value["n_dep"] == n]

        for y in years:
            values = crop_n_value[consider_part + "_" + str(y)].values
            if not len(values):
                # No row for this crop in this department.
                continue

            rdt_value = values[0]
            if pd.isna(rdt_value) or not rdt_value:
                # Missing or zero: nothing usable for this year.
                continue

            # int(n) strips the leading zero of codes such as "01" so that
            # the key carries the department as a plain integer.
            crop_Y_year[crop + "_" + str(int(n)) + "_" + str(y)] = rdt_value


def X_devide_region(consider_X):
    """Group the observation columns by province, year and month.

    For every province in the module-level ``provinces``, every year in the
    module-level ``years`` and every month 1..12, the matching rows of the
    module-level ``data`` table are selected. When that selection contains
    at least one row for the first column of ``consider_X``, the values of
    every column in ``consider_X`` are appended, in order and one list per
    column, to ``X_region_year_month["<province>_<year>_<month>"]``.

    Args:
        consider_X: Sequence of column names of ``data`` to collect.

    Returns:
        None. Results are stored in the module-level ``X_region_year_month``.
    """
    for p in provinces:
        province_rows = data[data["province"] == p]
        for y in years:
            year_rows = province_rows[province_rows["year"] == y]
            for m in range(1, 13):
                month_rows = year_rows[year_rows["month"] == m]

                lead_column = consider_X[0]
                if lead_column not in month_rows or len(month_rows[lead_column]) == 0:
                    # Nothing recorded for this province / year / month.
                    continue

                name = "_".join((str(p), str(y), str(m)))
                collected = X_region_year_month.setdefault(name, [])
                for x in consider_X:
                    collected.append(month_rows[x].tolist())


def normalize_X():
    """Min-max scale every series of ``X_region_year_month`` into [0, 1].

    Each entry of the module-level ``X_region_year_month`` is a list of
    numeric series. Every series is rescaled independently with
    ``(x - min) / (max - min)`` and the result is stored, as plain lists,
    under the same key in the module-level ``X_region_year_month_normalized``.

    A series whose range is (numerically) zero carries no variation, so it
    is mapped to all zeros. This keeps the output inside [0, 1] instead of
    producing values that depend on the series length.

    Returns:
        None. Results are stored in ``X_region_year_month_normalized``.
    """
    for name, series_list in X_region_year_month.items():
        normalized = []
        for series in series_list:
            values = np.asarray(series, dtype=float)
            if values.size == 0:
                # An empty series has no min/max; keep it empty.
                normalized.append([])
                continue

            low = values.min()
            span = values.max() - low
            if span > 1.0e-15:
                normalized.append(((values - low) / span).tolist())
            else:
                # Constant series: avoid dividing by a zero range.
                normalized.append([0.0] * len(series))
        X_region_year_month_normalized[name] = normalized


def init_list(crop, consider_part, province, month):
    """Pair one feature with the crop values of one province and month.

    Walks the module-level ``X_region_year_normalized_average`` dict, whose
    keys have the form ``"<province>_<year>_<month>"``. For every key that
    matches ``province`` and ``month`` and for which the module-level
    ``crop_Y_year`` holds ``"<crop>_<province>_<year>"``, the feature at
    position ``consider_part`` and that crop value are collected.

    Args:
        crop: Crop code used to build the ``crop_Y_year`` lookup key.
        consider_part: Position of the feature inside each dict entry.
        province: Province number to keep.
        month: Month number to keep.

    Returns:
        Tuple ``(X, Y)`` of equally long numpy arrays, in dict order.
    """
    features = []
    targets = []

    for key, averages in X_region_year_normalized_average.items():
        feature = averages[consider_part]
        p, y, m = key.split("_")
        if int(p) != province or int(m) != month:
            continue

        target_key = crop + "_" + p + "_" + y
        if target_key not in crop_Y_year:
            continue

        features.append(feature)
        targets.append(crop_Y_year[target_key])

    return np.array(features), np.array(targets)


def correlation(X, Y):
    """Return the Pearson correlation coefficient of two sequences.

    Args:
        X: Sequence of numbers.
        Y: Sequence of numbers, indexed in parallel with ``X``.

    Returns:
        The correlation coefficient, or ``0.0`` when either sequence has
        fewer than three elements or when the product of the two deviation
        norms does not exceed 1e-15.
    """
    if len(X) < 3 or len(Y) < 3:
        return 0.0

    mean_x = np.average(X)
    mean_y = np.average(Y)

    # Accumulate the cross-product and both sums of squared deviations in a
    # single pass.
    cross = 0
    sq_x = 0
    sq_y = 0
    for idx, x_value in enumerate(X):
        dev_x = x_value - mean_x
        dev_y = Y[idx] - mean_y
        cross += dev_x * dev_y
        sq_x += dev_x ** 2
        sq_y += dev_y ** 2

    denominator = math.sqrt(sq_x) * math.sqrt(sq_y)
    if denominator > 1.0e-15:
        return cross / denominator
    return 0.0


# Columns of ``data`` that are correlated against the crop values.
consider_parts = ["rr24", "t_avg"]

# Module-level containers that the helper functions read and fill.
X_region_year_month = {}
X_region_year_month_normalized = {}
X_region_year_normalized_average = {}
crops_Y_year = {}

# Group the observations, scale them, then reduce every scaled series to
# its mean value.
X_devide_region(consider_parts)
normalize_X()
for key, scaled_series in X_region_year_month_normalized.items():
    X_region_year_normalized_average[key] = [
        np.average(series) for series in scaled_series
    ]

Y_province = {}
for crop in crops:
    if crop in crops_Y_year:
        continue
    # read_in_Y writes into the global ``crop_Y_year``, so a fresh dict is
    # bound to that name before each call.
    crop_Y_year = {}
    read_in_Y(crop, "rdt")
    crops_Y_year[crop] = crop_Y_year

# Create the output directory up front, so that a missing folder does not
# abort the run at the first savefig call.
savepath = "img/"
os.makedirs(savepath, exist_ok=True)

# One heatmap per province: rows are "<column>_<month>", columns are crops.
for p in provinces:
    consider_parts_list = []
    correlations = {}
    correlations_list = []

    for cp, part_name in enumerate(consider_parts):
        for m in range(1, 13):
            row_label = part_name + "_" + str(m)
            row = []
            for crop in crops:
                # init_list reads the global ``crop_Y_year``; point it at
                # the values of the crop being processed.
                crop_Y_year = crops_Y_year[crop]
                X, Y = init_list(crop, cp, p, m)
                r = correlation(X, Y)
                correlations[crop + "_" + row_label] = r
                row.append(r)
            consider_parts_list.append(row_label)
            correlations_list.append(row)

    c_plot = pd.DataFrame(data=correlations_list, columns=crops)
    c_plot["type"] = consider_parts_list
    c_plot = c_plot.set_index("type")

    # NOTE(review): no vmin/vmax is passed, so the colour scale is inferred
    # from each province's own values -- confirm whether a fixed [-1, 1]
    # scale is wanted for comparing provinces.
    img = sns.heatmap(c_plot, fmt=".2f", annot=True, cmap=palettable.cmocean.diverging.Curl_10.mpl_colors)
    plt.savefig(savepath + str(p) + ".png")
    plt.close("all")


print("end")