In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Selects which low-rank reconstruction the rank-sweep loop at the end of this
# notebook runs: "svd" -> make_recom_svd, "nmf" -> make_recom_nmf.
# The value is also embedded in the saved figure file names via METADATA.
LOWRANK_APPROX = "nmf" # svd or nmf

In [None]:
# Settings read by plot_2df() when it saves a figure:
#   "outputdir"     - directory the PNG is written to
#   "prefix"        - first component of the file name
#   "dataname"      - second component of the file name
#   "lowrankapprox" - third component of the file name (the chosen method)
METADATA = {"outputdir": "image_executed", "prefix": "recommenderSystem", 
              "dataname":"example", "lowrankapprox": LOWRANK_APPROX}

In [None]:
# Fix the random seed because this notebook is teaching material
# (make_data() draws from NumPy's global random state).
np.random.seed(7)


def make_data(n1=10, n2=20):
    """Build a small 0/1 toy matrix and return it as a DataFrame.

    The matrix starts as all zeros, receives 15 randomly placed ones drawn
    from NumPy's global random state (the same cell may be hit more than
    once), and then has two hand-crafted rows written on top:

    * row 1 is all ones except column 4,
    * row 2 is all ones except columns 4..14.

    Args:
        n1 (int, optional): number of rows. Defaults to 10.
        n2 (int, optional): number of columns. Defaults to 20.

    Returns:
        pd.DataFrame: the generated matrix with default integer labels.
    """
    n_random_ones = 15
    matrix = np.zeros((n1, n2))

    # Draw the row index first and the column index second, one pair per
    # iteration, so the sequence of random numbers consumed is fixed.
    for _ in range(n_random_ones):
        row = np.random.randint(n1)
        col = np.random.randint(n2)
        matrix[row, col] = 1

    # Row 1: filled with ones, with a single hole at column 4.
    matrix[1, :] = 1
    matrix[1, 4] = 0

    # Row 2: filled with ones, with a gap over columns 4..14.
    matrix[2, :] = 1
    matrix[2, 4:15] = 0

    return pd.DataFrame(matrix)

# Obtain the example matrix `g_df` used by the rest of the notebook.
# The `if False:` branch is disabled; it generates a matrix with make_data()
# and writes it to CSV. The active branch reads an existing CSV instead.
# NOTE(review): the disabled branch writes "data/example.csv" with index=False,
# whereas the active branch reads "../data/recommend/example.csv" with
# index_col=[0]. Path and index handling differ between the two branches --
# confirm against the file on disk before re-enabling regeneration.
if False:
    g_df = make_data()
    g_df.to_csv("data/example.csv", index=False)
else:
    ROOT = ".."
    # The first CSV column is used as the row index.
    g_df = pd.read_csv(f"{ROOT}/data/recommend/example.csv", index_col=[0])
# Show the loaded matrix as a heatmap.
sns.heatmap(g_df.values)

In [None]:
def plot_svd_sdiag(X):
    """Plot the contribution ratio of each singular value of ``X``.

    The singular values are divided by their sum, and both the individual
    ratios and their running (cumulative) total are plotted against the
    singular-value index. As a sanity check the function also prints whether
    the SVD factors multiply back to ``X``.

    Args:
        X (np.array): descriptor matrix (2-D).
    """
    # The reduced SVD is enough here: the extra rows/columns of the full
    # factors are only ever multiplied by zeros when rebuilding X.
    u, sdiag, vt = np.linalg.svd(X, full_matrices=False)

    # Check that U * diag(s) * V^T is almost the same as X with np.allclose().
    usv = (u * sdiag) @ vt
    print("check usv = original matrix? ", np.allclose(usv, X))

    # Normalise by the total so that the cumulative curve ends at 1.
    cumulative = np.cumsum(sdiag)
    total = cumulative[-1]
    contribution = sdiag / total
    cumulative = cumulative / total

    # Show the contribution ratios.
    fig, ax = plt.subplots()
    ax.plot(contribution, ".-", label="contribution")
    ax.plot(cumulative, ".-", label="cumulative contribution")
    ax.set_ylabel("rate")
    ax.set_xlabel("index")
    ax.legend()
    # Figure.show() emits a "non-GUI backend" warning under the inline
    # notebook backend; pyplot.show() renders the figure without it.
    plt.show()

# Plot the singular-value contribution ratios of the example matrix.
plot_svd_sdiag(g_df.values)

In [None]:
from sklearn.decomposition import NMF

def make_recom_svd(df, nrank):
    """Line up candidates with a truncated SVD reconstruction.

    The data matrix is factorised as ``U * diag(s) * V^T`` and rebuilt from
    only the ``nrank`` largest singular values.

    Args:
        df (pd.DataFrame): data
        nrank (int): the maximum rank to reconstruct data. Values larger than
            the number of singular values are clamped, which yields the full
            reconstruction; 0 yields an all-zero matrix.

    Returns:
        pd.DataFrame: reconstructed data with the index and columns of ``df``.

    Raises:
        ValueError: if ``nrank`` is negative.
    """
    if nrank < 0:
        raise ValueError(f"nrank must be non-negative, got {nrank}")

    X = df.values
    # Reduced SVD: u is (n, r), vt is (r, m) with r = min(n, m), so the
    # leading components can be selected by plain slicing.
    u, sdiag, vt = np.linalg.svd(X, full_matrices=False)
    k = min(nrank, sdiag.shape[0])

    # Scaling the columns of u by the singular values is the same as
    # multiplying by diag(s), without building the padded diagonal matrix.
    recom_svd = (u[:, :k] * sdiag[:k]) @ vt[:k, :]
    return pd.DataFrame(recom_svd, index=df.index, columns=df.columns)


def make_recom_nmf(df, nrank, random_state=1):
    """Line up candidates with a non-negative matrix factorisation (NMF).

    The data matrix ``X`` is factorised as ``W * H`` with ``nrank``
    components using scikit-learn's ``NMF`` (random initialisation), and the
    product ``W * H`` is returned as the reconstruction.

    Args:
        df (pd.DataFrame): data
        nrank (int): the maximum rank to reconstruct data (number of NMF
            components).
        random_state (int, optional): seed handed to ``NMF`` so that the
            random initialisation is reproducible. Defaults to 1.

    Returns:
        pd.DataFrame: reconstructed data with the index and columns of ``df``.
    """
    X = df.values
    model = NMF(n_components=nrank, init='random', random_state=random_state)
    W = model.fit_transform(X)  # (n_rows, nrank)
    H = model.components_       # (nrank, n_columns)

    # Plain ndarray product; np.matrix is no longer recommended by NumPy.
    recom_nmf = W @ H
    return pd.DataFrame(recom_nmf, index=df.index, columns=df.columns)


def make_recom_correlation(df, nrank=None):
    """Line up candidates by correlation.

    With ``X[material, structuretype]``, the product
    ``(X.T * X)[structuretype, structuretype]`` is expected to give the
    correlation between structure types. Multiplying by ``X`` once more gives
    a ``[material, structuretype]`` matrix again:

        recom = X[material, structuretype]
                * (X.T * X)[structuretype, structuretype]

    which, by associativity, equals the ``X * X.T * X`` evaluated below. The
    result is rescaled linearly to the range [0, 1].

    Args:
        df (pd.DataFrame): data
        nrank (int): unused dummy, accepted only so that the signature matches
            the other ``make_recom_*`` functions.

    Returns:
        pd.DataFrame: rescaled scores with the index and columns of ``df``.
        If every score is identical (e.g. an all-zero input) the result is
        all zeros rather than NaN.
    """
    # nrank is a dummy argument.
    X = np.asarray(df.values, dtype=float)

    recom = X @ X.T @ X
    # The values are of order X^3, so normalise them to [0, 1].
    vmax = recom.max()
    vmin = recom.min()
    span = vmax - vmin
    if span == 0:
        # A constant matrix cannot be min-max scaled (0/0); report no signal.
        recom = np.zeros_like(recom)
    else:
        recom = (recom - vmin) / span

    return pd.DataFrame(recom, index=df.index, columns=df.columns)

In [None]:
import os

def plot_2df(df_orig, df_recom, nrank, threshold=0.2, metadata=METADATA):
    """Plot the original data, the reconstructed data and their difference.

    Three heatmaps are drawn side by side: the original matrix, the low-rank
    reconstruction, and a boolean map of the cells where the reconstruction
    exceeds the original by more than ``threshold``. The figure is saved as a
    PNG whose name is built from ``metadata`` and ``nrank``; the file name is
    also printed.

    Args:
        df_orig (pd.DataFrame): data
        df_recom (pd.DataFrame): reconstructed data
        nrank (int): rank, used only in the output file name.
        threshold (float): threshold applied to ``df_recom - df_orig``.
            Defaults to 0.2.
        metadata (dict): settings for the output file; the keys "outputdir",
            "prefix", "dataname" and "lowrankapprox" are read.
            Defaults to METADATA.
    """
    # Only positive differences above the threshold are marked.
    diff = df_recom.values - df_orig.values > threshold
    panels = [
        ("original", df_orig.values),
        ("low rank approx.", df_recom.values),
        ("difference", diff),
    ]

    fig, axes = plt.subplots(1, len(panels), figsize=(9, 3))
    for ax, (title, values) in zip(axes, panels):
        ax.yaxis.set_visible(False)
        ax.xaxis.set_visible(False)
        sns.heatmap(values, ax=ax)
        ax.set_title(title)
    fig.tight_layout()

    filename = "_".join([metadata["prefix"], metadata["dataname"],
                         metadata["lowrankapprox"],
                         "nrank"+str(nrank), "plot_2d"])+".png"
    print(filename)

    # savefig() does not create missing directories, so make sure the output
    # directory exists first (an empty string means the current directory).
    outputdir = metadata["outputdir"]
    if outputdir:
        os.makedirs(outputdir, exist_ok=True)
    fig.savefig(os.path.join(outputdir, filename))


In [None]:
# Reconstruct the example matrix at several ranks with the method selected by
# LOWRANK_APPROX and plot/save each result.
for _nrank in [1, 3, 5, 7]:
    if LOWRANK_APPROX == "svd":
        g_df_tranform = make_recom_svd(g_df, _nrank)
    elif LOWRANK_APPROX == "nmf":
        g_df_tranform = make_recom_nmf(g_df, _nrank)
    else:
        # Without this branch an unknown value would either raise a NameError
        # or silently re-plot a stale g_df_tranform from an earlier run.
        raise ValueError(
            f"LOWRANK_APPROX must be 'svd' or 'nmf', got {LOWRANK_APPROX!r}")
    plot_2df(g_df, g_df_tranform, _nrank)