# Mean Init

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
import time
import math
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

import seaborn as sns

In [2]:
# Root directory of the experiment. files_dict joins it with "dataset" and the
# settings below to build both the load and the save directory.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
BASE_DIR = "/home/ykawakami/work_space/Research/Programs/dac2022/embeddings_dimensional_reduction/"

# Path components: <BASE_DIR>/dataset/<DATASET_LABEL>/<MISSING_PATTERN>/<TEST_NUMBER>.
# The spelling of DATASET_LABEL is part of the directory name; keep it as is.
DATASET_LABEL = "qsar_biogradation"
# presumably "missing completely at random" -- TODO confirm
MISSING_PATTERN = "MCAR"
# Interpolated into every input CSV name and into the result file name.
DESIRED_MISSING_RATIO = 0.03

# Labels that only appear in the result file name: mae_<INIT>_<METHOD>_<ratio>.csv
INIT = "mean_init_base"
METHOD = "knn_opt"

# Converted with str() and used as the innermost directory name.
TEST_NUMBER = 0

In [3]:
# Where the input CSVs are read from ("load") and where the result CSV is
# meant to go ("save"). Both sections point at the same directory.
# The "desied" spelling is part of the file names and is kept on purpose.
files_dict = dict(
    load=dict(
        dirs_path=os.path.join(
            BASE_DIR, "dataset", DATASET_LABEL, MISSING_PATTERN, str(TEST_NUMBER)
        ),
        defected_dataset_name=f'defected_dataset_desied_missing_ratio_{DESIRED_MISSING_RATIO}.csv',
        missing_index_name=f'missing_index_desied_missing_ratio_{DESIRED_MISSING_RATIO}.csv',
        missing_values_name=f'missing_values_desied_missing_ratio_{DESIRED_MISSING_RATIO}.csv',
    ),
    save=dict(
        dirs_path=os.path.join(
            BASE_DIR, "dataset", DATASET_LABEL, MISSING_PATTERN, str(TEST_NUMBER)
        ),
        file_name=f"mae_{INIT}_{METHOD}_{DESIRED_MISSING_RATIO}.csv",
    ),
)

In [4]:
# Pull the directory and the three input file names out of the "load" section.
load_paths = files_dict["load"]
(
    dirs_path,
    defected_dataset_name,
    missing_index_name,
    missing_values_name,
) = (
    load_paths["dirs_path"],
    load_paths["defected_dataset_name"],
    load_paths["missing_index_name"],
    load_paths["missing_values_name"],
)

# Read the dataset that contains the missing entries; the CSV is read without
# a header row, so columns get integer labels.
df_data = pd.read_csv(os.path.join(dirs_path, defected_dataset_name), header=None)

df_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,3.919,2.6909,0.0,0.0,0.0,0.0,0.0,31.4,2.0,0.0,...,0.0,0.0,0.0,0.0,2.949,1.591,0.0,7.253,0.0,0.0
1,4.170,2.1144,0.0,0.0,0.0,0.0,0.0,30.8,1.0,1.0,...,0.0,0.0,0.0,0.0,3.315,1.967,0.0,7.257,0.0,0.0
2,3.932,3.2512,0.0,0.0,0.0,0.0,0.0,26.7,2.0,4.0,...,0.0,0.0,0.0,1.0,3.076,2.417,0.0,7.601,0.0,0.0
3,3.000,2.7098,0.0,0.0,0.0,0.0,0.0,20.0,0.0,2.0,...,0.0,0.0,0.0,1.0,3.046,5.000,0.0,6.690,0.0,0.0
4,0.000,3.3944,0.0,0.0,0.0,0.0,0.0,29.4,2.0,4.0,...,0.0,0.0,0.0,0.0,3.351,2.405,0.0,8.003,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,5.431,2.8955,0.0,0.0,0.0,2.0,0.0,32.1,4.0,1.0,...,2.0,0.0,6.0,1.0,3.573,2.242,1.0,8.088,0.0,0.0
1051,5.287,3.3732,0.0,0.0,9.0,0.0,0.0,35.3,0.0,9.0,...,0.0,0.0,3.0,0.0,3.787,3.083,3.0,9.278,0.0,0.0
1052,4.869,1.7670,0.0,1.0,9.0,0.0,5.0,44.4,0.0,4.0,...,0.0,4.0,13.0,0.0,3.848,2.576,5.0,9.537,1.0,0.0
1053,5.158,1.6914,2.0,0.0,36.0,0.0,9.0,56.1,0.0,0.0,...,0.0,1.0,16.0,0.0,5.808,2.055,8.0,11.055,0.0,1.0


In [26]:
# Load the labels: the missing-entry indicator matrix and the matching values.
# Both CSVs live in the same directory as the dataset and have no header row.
missing_index, missing_values = (
    pd.read_csv(os.path.join(dirs_path, label_file_name), header=None)
    for label_file_name in (missing_index_name, missing_values_name)
)

In [34]:
# Per column, count the cells whose indicator equals 0 (the sum runs down the rows).
row_count_of_missings = missing_index.eq(0).astype(int).sum(axis=0).to_numpy()

In [48]:
# Mean initialisation: cells whose indicator is not 1 are blanked to NaN, then
# every NaN is replaced by the mean of the remaining cells in its column.
df_data = df_data.where(missing_index == 1).pipe(
    lambda observed: observed.fillna(observed.mean())
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,0.129971,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,4.786635,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,,,,,0.970443,,,,,,...,,,,,,,,,,
1051,,,,,,,,,,,...,,0.890078,,,,,,,,
1052,,,,,,,,,,,...,,,,,,,,,,
1053,,,,,,,,,,,...,,,,,,,,,,


In [53]:
from sklearn.metrics import mean_absolute_error

def mae_wrapper(result_data, missing_index, missing_values) -> float:
    """Return the mean absolute error over the entries flagged as missing.

    Only the cells where ``missing_index == 0`` are compared (the same
    convention the notebook uses when it counts missing cells). The earlier
    version built the mask from ``missing_values == 0`` instead, i.e. from
    ground-truth values that happen to be zero, and ignored ``missing_index``.

    Args:
        result_data: Imputed data (array-like or DataFrame).
        missing_index: Indicator with the same shape; 0 marks a missing cell.
        missing_values: Ground-truth values with the same shape.

    Returns:
        Mean of ``|missing_values - result_data|`` over the flagged cells.

    Raises:
        ValueError: If the three inputs differ in shape, or if no cell is
            flagged as missing (the error would be undefined).
    """
    # Work on plain arrays so DataFrame and ndarray inputs behave the same and
    # boolean masking selects cells instead of inserting NaN.
    mask = np.asarray(missing_index) == 0
    truth = np.asarray(missing_values, dtype=float)
    predicted = np.asarray(result_data, dtype=float)

    if not (mask.shape == truth.shape == predicted.shape):
        raise ValueError(
            "result_data, missing_index and missing_values must share one shape, "
            f"got {predicted.shape}, {mask.shape} and {truth.shape}"
        )
    if not mask.any():
        raise ValueError("missing_index flags no entry as missing (no 0 found)")

    # For 1-D selections this equals sklearn.metrics.mean_absolute_error.
    return float(np.mean(np.abs(truth[mask] - predicted[mask])))


In [54]:
# Function that finds the nearest neighbours (by distance) of every row and column
from sklearn.neighbors import NearestNeighbors

def neighberhood_calculation(train_data, user_k, item_k):
    """Return the nearest-neighbour index matrices for rows and for columns.

    Rows of ``train_data`` are treated as "users" and columns as "items"
    (the item search runs on ``train_data.T``).

    The earlier version asked for ``k + 1`` neighbours and dropped the first
    column, assuming it is always the sample itself. With duplicated rows or
    columns that assumption can fail and a sample may be listed as its own
    neighbour. ``kneighbors()`` called without a query matrix excludes the
    query sample itself, so exactly ``k`` other samples are returned.

    Args:
        train_data: 2-D data (DataFrame or ndarray) without NaN.
        user_k: Number of neighbours per row.
        item_k: Number of neighbours per column.

    Returns:
        ``(user_neighbor_matrix, item_neighbor_matrix)`` with shapes
        ``(n_rows, user_k)`` and ``(n_columns, item_k)``; entries are
        positional indices.
    """
    def _nearest_neighbors(samples, k):
        # k == 0 used to yield an (n, 0) matrix; keep that instead of letting
        # scikit-learn reject n_neighbors=0.
        if k == 0:
            return np.empty((len(samples), 0), dtype=np.intp)
        model = NearestNeighbors(n_neighbors=k, n_jobs=-1).fit(samples)
        # No query matrix: neighbours of the fitted samples, self excluded.
        return model.kneighbors(return_distance=False)

    user_neighbor_matrix = _nearest_neighbors(train_data, user_k)
    item_neighbor_matrix = _nearest_neighbors(train_data.T, item_k)

    return user_neighbor_matrix, item_neighbor_matrix


In [55]:
# Used for the "reverse neighbour" sums: the `defected_column` values of the
# users j (their number is not known in advance) that
# - have user i (who has missing values) among their neighbours, and
# - have missing values themselves.
#
# For row i, store the rows that have row i as a neighbour and that also have
# missing values.

def inverse_neighborhood_calculation(
    user_neighbor_matrix,
    item_neighbor_matrix,
    defected_data_row_index_set,
    defected_data_column_location_set
):
    """Build the reverse-neighbour lists restricted to defected rows/columns.

    Entry ``t`` of a returned list holds every defected index ``s`` whose
    neighbour list contains ``t``, where ``t`` is itself defected. Indices
    that are not in the corresponding set neither contribute nor receive
    entries.

    Returns:
        ``(inverse_user_neighbor_list, inverse_item_neighbor_list)``, one
        list per row of the corresponding neighbour matrix.
    """
    def _invert(neighbor_matrix, defected_members):
        inverse = [[] for _ in range(neighbor_matrix.shape[0])]
        for source, neighbors in enumerate(neighbor_matrix):
            if source not in defected_members:
                continue
            for target in neighbors:
                if target in defected_members:
                    inverse[target].append(source)
        return inverse

    return (
        _invert(user_neighbor_matrix, defected_data_row_index_set),
        _invert(item_neighbor_matrix, defected_data_column_location_set),
    )

In [None]:
# Step 2b
# Imputation step
def neighbor_base_inputation(
    df_data,
    test_label_gender,
    test_label_age,
    defected_columns,
    user_neighbor_matrix,
    inverse_user_neighbor_list,
    item_neighbor_matrix,
    inverse_item_neighbor_list,
    user_weight,
    item_weight,
    user_k,
    item_k
):
    """Re-estimate the missing "gender"/"age" cells from user and item neighbours.

    For each column in ``defected_columns`` every row flagged as missing gets
    ``molecule / denominator``, where the numerator adds up

    * term 3: the column values of the row's nearest rows (users),
    * term 4: the column values of the rows in
      ``inverse_user_neighbor_list[row]``,
    * term 5: the row's own values in the column's nearest columns (items),
    * term 6: the row's own values in the columns in
      ``inverse_item_neighbor_list[column]``,

    with user terms scaled by ``user_weight`` and item terms by
    ``item_weight``. The denominator is the matching weighted count.

    Fixes compared with the earlier version:

    * With ``user_k == 1`` a row with exactly one reverse neighbour lost its
      term 4, because an ndarray was compared with ``[]``.
    * With ``item_k == 1`` term 6 paired row and column indices element-wise
      instead of selecting a block, so it failed or mis-added.
    * The result is written with a scalar column label, so the assignment no
      longer depends on how the frame stores its columns.

    Both ``k`` cases now share one code path.

    Args:
        df_data: DataFrame with at least "user_id", "gender" and "age"
            columns. It is modified in place. Index labels are used as
            positions into NumPy arrays, so they must equal row positions
            (e.g. a default RangeIndex).
        test_label_gender: Object with a "user_id" column; rows of
            ``df_data`` with those ids are treated as missing "gender".
        test_label_age: Same for "age".
        defected_columns: Columns to impute, drawn from "gender" and "age".
        user_neighbor_matrix: Neighbour row positions per row.
        inverse_user_neighbor_list: Reverse-neighbour row positions per row.
        item_neighbor_matrix: Neighbour column positions per column.
        inverse_item_neighbor_list: Reverse-neighbour column positions per
            column.
        user_weight: Weight of the user terms.
        item_weight: Weight of the item terms.
        user_k: Number of user neighbours; below 1 skips terms 3 and 4.
        item_k: Number of item neighbours; below 1 skips terms 5 and 6.

    Returns:
        ``df_data`` itself, with the imputed values written in.
    """
    # Item terms read from this snapshot, so values imputed for one column do
    # not leak into the computation of the next one.
    df_train_data = df_data.copy()
    train_values = df_train_data.values

    defected_data_index_dict = {
        "gender": df_data[
            df_data["user_id"].isin(test_label_gender["user_id"])
        ].index.values,
        "age": df_data[
            df_data["user_id"].isin(test_label_age["user_id"])
        ].index.values
    }  # index of the rows that have a missing value
    defected_data_column_location_dict = {
        "gender": df_train_data.columns.get_loc("gender"),
        "age": df_train_data.columns.get_loc("age"),
    }
    defected_column_value_dict = {
        "gender": df_data["gender"].values,
        "age": df_data["age"].values,
    }

    for defected_column in defected_columns:
        row_index = defected_data_index_dict[defected_column]
        if len(row_index) == 0:
            # Nothing to impute for this column.
            continue
        column_location = defected_data_column_location_dict[defected_column]
        column_values = defected_column_value_dict[defected_column]
        inverse_item_neighbors = inverse_item_neighbor_list[column_location]

        # Numerator
        molecule = np.zeros(len(row_index))

        if user_k >= 1:
            # Term 3: sum of the column over all k neighbour users j of each
            # user i that has a missing value.
            neighbor_values = column_values[user_neighbor_matrix[row_index]]
            if neighbor_values.ndim > 1:
                neighbor_values = neighbor_values.sum(axis=1)
            molecule += neighbor_values * user_weight

            # Term 4: sum of the column over the users j (count not known in
            # advance) that have user i as a neighbour and have missing
            # values themselves.
            for molecule_index, defected_data_index in enumerate(row_index):
                inverse_user_neighbors = inverse_user_neighbor_list[defected_data_index]
                if len(inverse_user_neighbors) > 0:
                    molecule[molecule_index] += (
                        column_values[inverse_user_neighbors].sum() * user_weight
                    )

        if item_k >= 1:
            # Term 5: for the defected item (column), add the rows' values in
            # each of its k neighbour items.
            for item_neighbor in np.atleast_1d(item_neighbor_matrix[column_location]):
                molecule += train_values[row_index, item_neighbor] * item_weight

            # Term 6: the same for the items that have this item as a
            # neighbour and have missing values themselves.
            for inverse_item_neighbor in inverse_item_neighbors:
                molecule += train_values[row_index, inverse_item_neighbor] * item_weight

        # Denominator
        denominator = np.zeros(len(row_index))

        # Term 3: number of user neighbours.
        denominator += user_k * user_weight

        # Term 4: number of reverse user neighbours of each row.
        for denominator_index, defected_data_index in enumerate(row_index):
            denominator[denominator_index] += (
                len(inverse_user_neighbor_list[defected_data_index]) * user_weight
            )

        # Term 5: number of item neighbours.
        denominator += item_k * item_weight

        # Term 6: number of reverse item neighbours of the column.
        denominator += len(inverse_item_neighbors) * item_weight

        W = molecule / denominator

        df_data.loc[row_index, defected_column] = W

    return df_data


# Interactive Init