In [None]:
# Rank handed to make_recom_svd for the low-rank reconstruction.
NRANK = 10
# Threshold applied to (reconstruction - original) when flagging candidates;
# passed to plot_2df and print_existence.
THRESHOLD = 0.35

In [None]:
# Settings shared by the plotting helpers: "outputdir", "prefix" and
# "dataname" are used to build the path of each saved figure; "nrank" and
# "threshold" record the run parameters.
METADATA = {
    "outputdir": "image_executed",
    "prefix": "recommenderSystem",
    "dataname": "group131415_div1",
    "nrank": NRANK,
    "threshold": THRESHOLD,
}

In [None]:
import pandas as pd

# Cap the number of rows / columns pandas renders when a DataFrame is shown.
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 60)


def get_data(root="..", dataname="group131415_div1"):
    """Load the data table from ``{root}/data/{dataname}.csv``.

    The first CSV column is used as the row index. The shape of the loaded
    table is printed as a quick sanity check.

    Args:
        root (str, optional): directory that contains the ``data`` folder.
            Defaults to "..".
        dataname (str, optional): CSV file name without the ``.csv``
            extension. Defaults to "group131415_div1".

    Returns:
        pd.DataFrame: the loaded table.
    """
    df = pd.read_csv(f"{root}/data/{dataname}.csv", index_col=[0])
    print(df.shape)
    return df


# Load the dataset once; the cells below read it through this global.
g_df = get_data()

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
%matplotlib inline


def plot_df_heatmap(df, metadata=METADATA):
    """Plot a heatmap of ``df`` and save it as a PNG file.

    The figure size scales with the table (0.1 inch per cell). The image is
    written to ``<outputdir>/<prefix>_<dataname>_original_heatmap.png`` and
    the file name is printed. The output directory is created if it does not
    exist yet.

    Args:
        df (pd.DataFrame): data to draw.
        metadata (dict): display/output settings; the keys "prefix",
            "dataname" and "outputdir" are read. Defaults to METADATA.
    """
    # df.shape is (rows, cols); figsize wants (width, height).
    size = np.array(df.shape)[::-1] * 0.1
    # Apply the seaborn settings before the figure exists so the axes are
    # created with them on the first run as well as on re-runs.
    sns.set(font_scale=0.6)
    fig, ax = plt.subplots(figsize=size, dpi=600)
    sns.heatmap(df, lw=0.1, ax=ax)
    fig.tight_layout()
    filename = "_".join([metadata["prefix"], metadata["dataname"], "original_heatmap"]) + ".png"
    print(filename)
    # savefig does not create missing directories.
    os.makedirs(metadata["outputdir"], exist_ok=True)
    fig.savefig(os.path.join(metadata["outputdir"], filename))

# Draw and save the heatmap of the loaded data.
plot_df_heatmap(g_df)

In [None]:
def plot_2df(df, df_transform, nrank, thredhold, title_fontsize=30, metadata=METADATA):
    """Plot the reconstructed data next to a map of the cells above threshold.

    Left panel: heatmap of ``df_transform``. Right panel: boolean map that is
    True where ``df_transform - df`` is greater than ``thredhold``. The figure
    is saved as ``<outputdir>/<prefix>_<dataname>_lowrank_diff.png`` (the
    output directory is created if needed) and the file name is printed.

    Args:
        df (pd.DataFrame): original data.
        df_transform (pd.DataFrame): reconstructed data; its values are
            subtracted element-wise from those of ``df``, so it is expected
            to have the same shape.
        nrank (int): rank of the reconstruction; only shown in the title.
        thredhold (float): threshold on the difference of recommendation
            values. The misspelled parameter name is kept so that existing
            keyword callers keep working.
        title_fontsize (int, optional): title font size. Defaults to 30.
        metadata (dict, optional): output settings; the keys "prefix",
            "dataname" and "outputdir" are read. Defaults to METADATA.
    """
    # df.shape is (rows, cols); figsize wants (width, height).
    figsize = np.array(df.shape).astype(float)[::-1] * 0.2
    # Widen the figure because two panels are drawn side by side.
    figsize[0] = figsize[0] * 1.8
    fig, axes = plt.subplots(1, 2, figsize=figsize)

    ax = axes[0]
    ax.set_title("nrank={}".format(nrank), fontsize=title_fontsize)
    sns.heatmap(df_transform.values, lw=0.1, ax=ax)

    ax = axes[1]
    ax.set_title("diff", fontsize=title_fontsize)
    sns.heatmap(df_transform.values - df.values > thredhold, lw=0.1, ax=ax)

    fig.tight_layout()
    filename = "_".join([metadata["prefix"], metadata["dataname"], "lowrank_diff"]) + ".png"
    print(filename)
    # savefig does not create missing directories.
    os.makedirs(metadata["outputdir"], exist_ok=True)
    fig.savefig(os.path.join(metadata["outputdir"], filename))

    # plt.show() instead of fig.show(): the latter only warns on non-GUI
    # backends such as the notebook inline backend.
    plt.show()


In [None]:
def plot_svd_sdiag(X):
    """Plot the contribution rate of the singular values of ``X``.

    Each singular value is divided by the sum of all singular values
    ("contribution"); the running sum of these ratios is plotted as the
    cumulative contribution. The function also prints whether the SVD
    factors multiply back to ``X``.

    Args:
        X (np.array): 2-D descriptor matrix.
    """
    u, sdiag, vt = np.linalg.svd(X, full_matrices=False)

    # Sanity check: U @ diag(s) @ V^T should reproduce the input.
    usv = (u * sdiag) @ vt
    print("check usv = original matrix? ", np.allclose(usv, X))

    # Normalise by the total so that the cumulative curve ends at 1.
    sdiagsum = np.cumsum(sdiag)
    total = sdiagsum[-1]
    sdiag = sdiag / total
    sdiagsum = sdiagsum / total

    # Show the contribution rates.
    fig, ax = plt.subplots()
    ax.plot(sdiag, ".-", label="contribution")
    ax.plot(sdiagsum, ".-", label="comulative contribution")
    ax.set_ylabel("rate")
    ax.set_xlabel("index")
    ax.legend()
    fig.tight_layout()
    # plt.show() instead of fig.show(): the latter only warns on non-GUI
    # backends such as the notebook inline backend.
    plt.show()


In [None]:
# Inspect how quickly the singular values of the data decay.
plot_svd_sdiag(g_df.values)

In [None]:
from sklearn.decomposition import NMF

def make_recom_svd(df, nrank):
    """Line up candidates by a truncated SVD.

    ``df`` is decomposed by SVD and rebuilt from its ``nrank`` largest
    singular values only.

    Args:
        df (pd.DataFrame): data
        nrank (int): the maximum rank used to reconstruct the data. Values
            larger than the number of singular values are clipped, i.e. the
            data is rebuilt from all of them.

    Returns:
        pd.DataFrame: reconstructed data with the index and columns of ``df``.

    Raises:
        ValueError: if ``nrank`` is negative.
    """
    if nrank < 0:
        raise ValueError("nrank must be non-negative, got {}".format(nrank))
    X = df.values
    # Reduced SVD: the extra columns of the full U / V are never used.
    u, sdiag, vt = np.linalg.svd(X, full_matrices=False)
    # Singular values come sorted in descending order, so keeping the first
    # k of them keeps the k largest.
    k = min(nrank, sdiag.shape[0])
    recom_svd = (u[:, :k] * sdiag[:k]) @ vt[:k, :]
    return pd.DataFrame(recom_svd, index=df.index, columns=df.columns)


def make_recom_nmf(df, nrank):
    """Line up candidates by NMF.

    The values of ``df`` are factorised with sklearn's ``NMF`` into
    ``W`` (``fit_transform`` output) and ``H`` (``components_``) using
    ``nrank`` components, and the product ``W @ H`` is returned. The model is
    built with a fixed ``random_state``.

    Args:
        df (pd.DataFrame): data
        nrank (int): the maximum rank to reconstruct data

    Returns:
        pd.DataFrame: reconstructed data with the index and columns of ``df``.
    """
    X = df.values
    model = NMF(n_components=nrank, init='random',
                shuffle=True, random_state=3)
    W = model.fit_transform(X)
    H = model.components_
    recom_nmf = W @ H
    return pd.DataFrame(recom_nmf, index=df.index, columns=df.columns)


def make_recom_correlation(df, nrank=None):
    """Line up candidates by correlation.

    With X[material, structuretype], the product
    (X.T X)[structuretype, structuretype] is taken as a measure of the
    correlation between structure types. Multiplying X by it gives a
    [material, structuretype] matrix again:

        recom = X[material, structuretype] (X.T X)[structuretype, structuretype]

    The result is of order X^3, so it is min-max rescaled to [0, 1].

    Args:
        df (pd.DataFrame): data
        nrank (int): unused dummy; not read by this function.

    Returns:
        pd.DataFrame: rescaled scores with the index and columns of ``df``.
            When every score is identical (e.g. an all-zero input) the
            rescaling is undefined and an all-zero table is returned.
    """
    X = np.asarray(df.values, dtype=float)
    # Matrix product is associative: (X X^T) X equals X (X^T X).
    recom = X @ X.T @ X
    vmax = recom.max()
    vmin = recom.min()
    span = vmax - vmin
    if span == 0:
        # Constant matrix: avoid 0/0, which would fill the result with NaN.
        recom = np.zeros_like(recom)
    else:
        recom = (recom - vmin) / span

    return pd.DataFrame(recom, index=df.index, columns=df.columns)


In [None]:
# Rebuild the data from its NRANK leading singular components and plot the
# reconstruction together with the cells exceeding THRESHOLD.
print("nrank=", NRANK)
g_df_recom = make_recom_svd(g_df, NRANK)
plot_2df(g_df, g_df_recom, NRANK, THRESHOLD)

In [None]:
import plotly.express as px
# Plotly image of (reconstruction - original data).
g_fig = px.imshow(g_df_recom-g_df)
g_fig.show()

In [None]:
def print_existence(df, df_ref, threshold=0.35):
    """List the cells whose value reaches ``threshold`` but are below 1 in ``df_ref``.

    A cell (name1, name2) is reported when ``df.loc[name1, name2] >=
    threshold`` and ``df_ref.loc[name1, name2] < 1``. Despite its name the
    function prints nothing; the result is returned.

    Args:
        df (pd.DataFrame): data
        df_ref (pd.DataFrame): reference data; looked up by the row and
            column labels of ``df``.
        threshold (float, optional): the threshold value. Defaults to 0.35.

    Returns:
        pd.DataFrame: one row per reported cell with the columns "name1"
            (row label), "name2" (column label), "recom-ref" (the value
            from ``df``) and "not_exist_ref" (always True for reported
            rows), sorted by "recom-ref" in descending order.

    Raises:
        KeyError: if a label of ``df`` is missing from ``df_ref``.
    """
    # Align the reference on the labels of df so both arrays are compared
    # cell by cell.
    ref = df_ref.loc[df.index, df.columns]
    values = df.to_numpy()
    mask = (values >= threshold) & (ref.to_numpy() < 1)
    # nonzero() walks the mask in row-major order, i.e. the same order as a
    # nested loop over index then columns.
    rows, cols = mask.nonzero()

    dfresult = pd.DataFrame(
        {
            "name1": df.index.to_numpy()[rows],
            "name2": df.columns.to_numpy()[cols],
            "recom-ref": values[rows, cols],
            "not_exist_ref": [True] * len(rows),
        },
        columns=["name1", "name2", "recom-ref", "not_exist_ref"],
    )
    return dfresult.sort_values(by="recom-ref", ascending=False)

# Candidates: cells where the reconstruction exceeds the original value by at
# least THRESHOLD and the original value is below 1.
print_existence(g_df_recom-g_df, g_df, threshold=THRESHOLD)