In [None]:
import numpy as np
import pandas as pd
#import matplotlib as plt
import networkx as nx
import itertools
from collections import Counter
#import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


def convert_categorical(df_X, _X):
    """Replace column ``_X`` of ``df_X`` with one indicator column per category.

    Categories are ordered by ``np.unique`` (sorted). The indicator for the
    j-th category (0-based) is named ``f"{_X}{j + 1}"`` and inserted at column
    position ``j + 1``, i.e. directly after the first column of the frame.

    The encoding is done with numpy only, so it does not depend on the
    scikit-learn ``OneHotEncoder`` keyword that was renamed from ``sparse`` to
    ``sparse_output``.

    Args:
        df_X: Input DataFrame; it is not modified.
        _X: Name of the categorical column to encode.

    Returns:
        A new DataFrame without ``_X`` and with the float indicator columns
        added. An empty input column simply yields the frame without ``_X``.
    """
    values = np.asarray(df_X[_X])
    # `codes[i]` is the index of row i's category within the sorted categories.
    categories, codes = np.unique(values, return_inverse=True)
    # Row-selecting from an identity matrix yields the one-hot rows (float64).
    onehot_encoded = np.eye(len(categories))[codes.ravel()]

    df_X = df_X.drop(columns=_X)
    for j in range(len(categories)):
        df_X.insert(loc=j + 1,
                    column=str(_X) + str(j + 1),
                    value=onehot_encoded[:, j])
    return df_X


def load_data(dataPath):
    """Read the rating and user files found under ``dataPath``.

    Args:
        dataPath: Path prefix (including trailing separator) of the directory
            that holds ``u.data`` and ``u.user``.

    Returns:
        A tuple ``(df, df_user, df_test)``:
          * ``df``      - long-format ratings (``MID``, ``UID``, ``rate``) of
                          users 1..754 with missing ratings dropped;
          * ``df_user`` - user features with ``job``, ``gender`` and the
                          binned ``age`` one-hot encoded and ``zip`` removed;
          * ``df_test`` - user x movie rating matrix of users 755..943.
    """
    ratings = pd.read_csv(dataPath + 'u.data',
                          sep='\\t',
                          engine='python',
                          names=['UID', 'MID', 'rate', 'time'])

    # User x movie matrix; rows are split by user id into train and test.
    total = ratings.pivot(index='UID', columns='MID', values='rate')
    df_test = total.loc[755:943]
    df = (total.loc[1:754]
          .unstack()
          .reset_index(name='rate')
          .dropna())

    df_user = pd.read_csv(dataPath + 'u.user',
                          sep='\\|',
                          engine='python',
                          names=['UID', 'age', 'gender', 'job', 'zip'])

    for column in ('job', 'gender'):
        df_user = convert_categorical(df_user, column)

    # Bucket the age into six labelled bins before one-hot encoding it.
    df_user['age'] = pd.cut(df_user['age'], [0, 10, 20, 30, 40, 50, 100],
                            labels=['1', '2', '3', '4', '5', '6'])
    df_user = convert_categorical(df_user, 'age')
    df_user = df_user.drop(columns='zip')

    return df, df_user, df_test


def train_model(df, df_user):
    """Build a user co-rating graph and pickle graph-augmented user features.

    Two users are linked when they gave the same rating to the same movie at
    least ``alpha_coef * 1682`` times. Six per-node graph measures are then
    computed, each divided by its maximum, stored on ``df_user`` and the
    resulting feature table (all columns except the first) is pickled.

    Args:
        df: Long-format ratings with columns ``UID``, ``MID`` and ``rate``.
        df_user: User feature frame whose first column is ``UID``. It is
            modified in place: columns ``PR``, ``CD``, ``CC``, ``CB``, ``LC``
            and ``AND`` are added or overwritten.

    Returns:
        None. Writes ``data100k/x_train_alpha(<alpha_coef>).pkl`` for every
        coefficient in ``alpha_coefs``.
    """
    import os

    alpha_coefs = [0.01]
    out_dir = "data100k"

    # (output column, function computing {node: score}) in evaluation order.
    measures = (
        ('PR', lambda g: nx.pagerank(g.to_directed())),
        ('CD', nx.degree_centrality),
        ('CC', nx.closeness_centrality),
        ('CB', nx.betweenness_centrality),
        ('LC', nx.load_centrality),
        ('AND', lambda g: nx.average_neighbor_degree(g, weight='weight')),
    )

    # Count, for every user pair, how many (movie, rating) groups they share.
    # The counts do not depend on alpha, so they are computed once.
    counter = Counter()
    for _, group in df.groupby(['MID', 'rate']):
        counter.update(itertools.combinations(group['UID'], 2))

    os.makedirs(out_dir, exist_ok=True)

    for alpha_coef in alpha_coefs:
        alpha = alpha_coef * 1682  # param*i_no

        # Keep only pairs that co-rated at least `alpha` times; iterating
        # items() visits each distinct pair once, in first-seen order.
        G = nx.Graph()
        G.add_edges_from(
            (pair for pair, shared in counter.items() if shared >= alpha),
            weight=1)

        for column, measure in measures:
            scores = df_user['UID'].map(measure(G))
            # Users absent from the graph map to NaN and are zero-filled below.
            df_user[column] = scores / float(scores.max())

        X_train = df_user.loc[:, df_user.columns[1:]].fillna(0)
        X_train.to_pickle(out_dir + "/x_train_alpha(" + str(alpha_coef) +
                          ").pkl")


# Directory prefix of the ratings/user files passed to load_data.
dataPath = 'datasets/ml-100k/'

# df_split: long-format training ratings; df_user: encoded user features;
# df_test: held-out user x movie rating matrix (see load_data).
df_split, df_user,df_test = load_data(dataPath)

# Adds graph features to df_user in place and pickles the feature table.
train_model(df_split, df_user)



In [None]:
import collections
import itertools
import math
import scipy
from scipy.spatial.distance import cdist
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential
from keras.layers import Input, Dense
from keras.models import Model
import tensorflow as tf
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.cluster import KMeans
from torch.utils.data import Dataset, DataLoader
# NOTE: rebinds the global `dataPath` (it pointed at 'datasets/ml-100k/' in the
# first cell and is set back to that value in a later cell).
dataPath = 'data100k/'
# Feature table pickled by train_model for alpha_coef = 0.01, as a float array.
X_train = pd.read_pickle(dataPath +
                         'x_train_alpha(0.01).pkl').values.astype(float)

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a 16-unit hidden layer on each side.

    The encoder maps ``input_dim -> 16 -> encoded_dim`` and the decoder maps
    ``encoded_dim -> 16 -> input_dim``; the decoder ends with a ReLU, so its
    reconstructions are non-negative.
    """

    def __init__(self, input_dim, encoded_dim):
        super().__init__()
        # Layers are instantiated encoder-first so parameter initialisation
        # consumes the random generator in the same order as before.
        encoder_layers = [
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, encoded_dim),
        ]
        decoder_layers = [
            nn.Linear(encoded_dim, 16),
            nn.ReLU(),
            nn.Linear(16, input_dim),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(code, reconstruction)`` for the batch ``x``."""
        code = self.encoder(x)
        return code, self.decoder(code)

    def compute_l1_loss(self, w):
        """Sum of absolute values of ``w`` (L1 penalty)."""
        return w.abs().sum()

    def compute_l2_loss(self, w):
        """Sum of squared values of ``w`` (squared L2 penalty)."""
        return w.pow(2).sum()

In [None]:
# Train a denoising autoencoder on the user feature matrix X_train.
torch.manual_seed(0)  # fixes weight initialisation and the injected noise

input_dim = X_train.shape[1]
encoded_dim = 4
autoencoder = Autoencoder(input_dim, encoded_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.01)
num_epochs = 100
noise_factor = 0.5
num_workers = 0
l1_weight = 0.001
l2_weight = 0.001
traindata = torch.FloatTensor(X_train)

# The data already lives in memory, so the declared `num_workers` (0, i.e.
# loading in the main process) is used instead of a separate literal.
train_loader = torch.utils.data.DataLoader(traindata, batch_size=10,
                                           num_workers=num_workers)

epoch_mse = []  # mean reconstruction MSE of every epoch (no penalties)
for epoch in range(num_epochs):
    loss_sum = 0.0
    mse_sum = 0.0
    for inputs in train_loader:
        # Corrupt the inputs with Gaussian noise and clamp back to [0, 1].
        noisy_inputs = inputs + noise_factor * torch.randn_like(inputs)
        noisy_inputs = torch.clamp(noisy_inputs, 0., 1.)

        encoded, decoded = autoencoder(noisy_inputs)
        mse = criterion(decoded, inputs)

        # Elastic-net style penalty over all parameters (weights and biases).
        flat_params = torch.cat(
            [parameter.view(-1) for parameter in autoencoder.parameters()])
        loss = (mse
                + l1_weight * autoencoder.compute_l1_loss(flat_params)
                + l2_weight * autoencoder.compute_l2_loss(flat_params))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch averages are per-sample exact.
        loss_sum += loss.item() * inputs.size(0)
        mse_sum += mse.item() * inputs.size(0)

    epoch_mse.append(mse_sum / len(traindata))

    # Report once per epoch (not once per batch), every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs,
                                                  loss_sum / len(traindata)))

# Root of the final epoch's reconstruction MSE; penalties are excluded so the
# printed value really is an RMSE.
print("RMSE : ", np.sqrt(epoch_mse[-1]))



Epoch [10/100], Loss: 0.0828
Epoch [10/100], Loss: 0.0889
Epoch [10/100], Loss: 0.0780
Epoch [10/100], Loss: 0.0705
Epoch [10/100], Loss: 0.0773
Epoch [10/100], Loss: 0.0810
Epoch [10/100], Loss: 0.0794
Epoch [10/100], Loss: 0.0739
Epoch [10/100], Loss: 0.0845
Epoch [10/100], Loss: 0.0873
Epoch [10/100], Loss: 0.0687
Epoch [10/100], Loss: 0.0723
Epoch [10/100], Loss: 0.0861
Epoch [10/100], Loss: 0.0699
Epoch [10/100], Loss: 0.0808
Epoch [10/100], Loss: 0.0783
Epoch [10/100], Loss: 0.0758
Epoch [10/100], Loss: 0.0816
Epoch [10/100], Loss: 0.0772
Epoch [10/100], Loss: 0.0746
Epoch [10/100], Loss: 0.0818
Epoch [10/100], Loss: 0.0742
Epoch [10/100], Loss: 0.0820
Epoch [10/100], Loss: 0.0801
Epoch [10/100], Loss: 0.0668
Epoch [10/100], Loss: 0.0771
Epoch [10/100], Loss: 0.0841
Epoch [10/100], Loss: 0.0965
Epoch [10/100], Loss: 0.0688
Epoch [10/100], Loss: 0.0873
Epoch [10/100], Loss: 0.0794
Epoch [10/100], Loss: 0.0805
Epoch [10/100], Loss: 0.0778
Epoch [10/100], Loss: 0.0766
Epoch [10/100]

In [None]:
# Persist the trained encoder, embed every user and standardise the embedding.
import os

PATH_BEST = 'models/autoencoder_best.pt'
ENCODED_PATH = 'data/encoded_features/encoded_features.pkl'
for target in (PATH_BEST, ENCODED_PATH):
    # A fresh checkout may not have the output directories yet.
    os.makedirs(os.path.dirname(target), exist_ok=True)

torch.save(autoencoder.encoder, PATH_BEST)

# The file written above is this very module. Using it directly avoids
# reloading a pickled nn.Module through torch.load, which is rejected when
# torch.load runs with weights_only=True (the default in recent PyTorch).
encoder = autoencoder.encoder
with torch.no_grad():
    encoded_features = encoder(torch.from_numpy(X_train).float())

# Save the raw (unscaled) encoded features to file.
encoded_features_df = pd.DataFrame(encoded_features.numpy())
encoded_features_df.to_pickle(ENCODED_PATH)

# From here on `encoded_features_df` is the standardised numpy array that the
# clustering cells consume; `encoded_features` stays the raw torch tensor.
encoded_features_df = StandardScaler().fit_transform(encoded_features_df)

In [None]:
# Show the standardised encoded features (result of StandardScaler.fit_transform).
encoded_features_df

array([[-0.4810396 ,  0.9807042 , -1.9925573 ,  1.3439918 ],
       [ 0.9015237 , -1.527919  ,  1.8225121 , -2.3130918 ],
       [-0.6383988 ,  1.2295055 , -0.9652076 ,  0.39640757],
       ...,
       [ 0.40760502, -0.8510854 ,  0.14607401,  0.17292108],
       [ 0.23353305, -1.1310993 ,  1.1201208 , -0.883825  ],
       [ 0.8983033 ,  0.184703  , -0.4813377 ,  0.7825378 ]],
      dtype=float32)

In [None]:
# Cluster users using the standardised encoded features.
# `kmeans` is the fitted estimator here; a later cell rebinds the same name to
# an array of labels.
kmeans = KMeans(n_clusters=7, n_init='auto', init='random', random_state=0).fit(encoded_features_df)
# Cluster index assigned to each row of encoded_features_df.
cluster_labels = kmeans.labels_
# Displayed as the cell result: the fitted model's inertia_ attribute.
kmeans.inertia_

1040.811279296875

In [None]:
# Report how many users ended up in each of the 7 clusters.
for cluster_id in range(7):
    is_member = cluster_labels == cluster_id
    print(f"Cluster {cluster_id}: {is_member.sum()} users")

Cluster 0: 114 users
Cluster 1: 160 users
Cluster 2: 64 users
Cluster 3: 170 users
Cluster 4: 126 users
Cluster 5: 159 users
Cluster 6: 150 users


In [None]:
# Evaluate the clustering performance using a clustering metric
from sklearn.metrics import silhouette_score

# `cluster_labels` were fitted on the standardised `encoded_features_df`, so
# the silhouette must be measured in that same feature space (the raw
# `encoded_features` tensor has different per-dimension scales).
silhouette = silhouette_score(encoded_features_df, cluster_labels, metric='euclidean')
print("Silhouette Score: {:.2f}".format(silhouette))

Silhouette Score: 0.27


In [None]:
# Reload the raw encoded features and standardise them for the elbow search.
# (Previously the already-standardised `encoded_features_df` was scaled a
# second time while `X_encoded` itself was left unscaled.)
X_encoded = pd.read_pickle("data/encoded_features/encoded_features.pkl")
X_encoded = StandardScaler().fit_transform(X_encoded)
# Alternative scaler kept for experimentation:
# X_encoded = RobustScaler().fit_transform(X_encoded)

In [None]:
# Elbow search: fit KMeans for k = 1..K_MAX and pick the k whose inertia lies
# farthest from the straight line joining the first and last (k, inertia) points.
K_MAX = 50
K = range(1, K_MAX + 1)

wcss = []
for k in K:
    # Fixed seed so the chosen k is reproducible between runs.
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X_encoded)
    wcss.append(km.inertia_)

# Chord end points (x1, y1) = (1, wcss[0]) and (x2, y2) = (K_MAX, wcss[-1]).
x1, y1 = 1, wcss[0]
x2, y2 = K_MAX, wcss[-1]

# Point-to-line distance |(y2-y1)*x - (x2-x1)*y + x2*y1 - x1*y2| / chord length
# for k = 1..K_MAX-1 (the last point lies on the chord, so it is skipped).
candidate_k = np.arange(1, K_MAX)
inertia = np.asarray(wcss[:-1], dtype=float)
distances = (np.abs((y2 - y1) * candidate_k - (x2 - x1) * inertia
                    + (x2 * y1 - x1 * y2))
             / math.hypot(x2 - x1, y2 - y1))

n_clusters_ = np.argmax(distances) + 1
n_clusters_

8

In [None]:
# Final clustering used for the rating predictions below.
n_clusters = 7
# `kmeans` is rebound here to the array of cluster labels returned by
# fit_predict (earlier it named the fitted estimator).
# NOTE(review): this fits on `encoded_features` (the raw encoder output),
# whereas the earlier clustering cell used the standardised
# `encoded_features_df` - confirm which feature space is intended.
kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit_predict(encoded_features)
kmeans

array([3, 2, 3, 4, 0, 3, 0, 6, 4, 0, 0, 5, 0, 1, 0, 1, 5, 0, 6, 0, 3, 3,
       3, 3, 4, 5, 2, 4, 0, 6, 3, 5, 4, 6, 0, 6, 4, 5, 1, 6, 4, 3, 1, 3,
       5, 5, 2, 5, 3, 3, 1, 5, 5, 3, 4, 1, 2, 3, 0, 5, 4, 3, 6, 6, 2, 4,
       6, 6, 3, 3, 6, 3, 4, 6, 1, 6, 4, 1, 6, 6, 4, 5, 0, 4, 2, 1, 5, 2,
       5, 2, 2, 4, 5, 4, 4, 3, 5, 5, 6, 4, 6, 5, 4, 4, 3, 2, 6, 0, 5, 6,
       0, 6, 5, 5, 4, 4, 6, 3, 5, 0, 2, 5, 5, 4, 1, 1, 6, 3, 6, 2, 2, 0,
       0, 4, 4, 2, 0, 0, 6, 5, 0, 2, 0, 0, 4, 5, 2, 4, 6, 0, 5, 2, 4, 4,
       2, 1, 2, 0, 3, 5, 6, 3, 0, 5, 2, 0, 2, 0, 2, 2, 0, 2, 2, 3, 1, 1,
       0, 5, 1, 3, 3, 4, 6, 6, 2, 4, 1, 6, 4, 3, 6, 0, 4, 4, 1, 5, 2, 3,
       3, 5, 3, 0, 5, 2, 6, 6, 4, 5, 6, 4, 2, 0, 4, 1, 5, 3, 0, 6, 4, 1,
       6, 3, 5, 2, 2, 4, 5, 5, 1, 3, 2, 1, 4, 0, 6, 3, 5, 5, 4, 5, 5, 6,
       6, 3, 4, 6, 3, 4, 4, 3, 4, 5, 1, 0, 1, 6, 6, 6, 4, 0, 1, 5, 5, 5,
       3, 2, 3, 3, 2, 5, 0, 6, 0, 5, 4, 3, 6, 2, 5, 1, 6, 1, 5, 4, 5, 4,
       5, 6, 2, 4, 6, 5, 3, 4, 6, 3, 3, 3, 4, 5, 4,

In [None]:
# Zero-initialised user x cluster membership matrix. Its shape and column
# names are derived from the label array so they follow any change of the
# number of clusters or users.
n_found = len(np.unique(kmeans))
print(n_found)
Cluster = ['cluster{}'.format(c) for c in range(n_found)]
df1 = pd.DataFrame(np.zeros((len(kmeans), n_found)),
                   index=pd.RangeIndex(1, len(kmeans) + 1),  # user ids start at 1
                   columns=Cluster)
df1

7


Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4,cluster5,cluster6
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Mark each user's cluster in the membership matrix. Row r of df1 belongs to
# the r-th label in `kmeans`, and column c is cluster c.
# `.iat` writes straight into df1; the former chained `df1.loc[UID][i] = 1`
# assigned into an intermediate Series, which only reaches df1 when that
# Series happens to be a view and relies on positional integer fallback.
for row, label in enumerate(kmeans):
    df1.iat[row, int(label)] = 1

In [None]:
# Show the membership matrix after the cluster flags have been set.
df1

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4,cluster5,cluster6
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,1.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,1.0
941,0.0,0.0,0.0,0.0,0.0,0.0,1.0
942,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# User x movie matrix of the training ratings (df_split comes from load_data
# in the first cell).
df_read = df_split.pivot(index = 'UID', columns = 'MID', values = 'rate')
# Zero frame spanning user ids 1..943 and movie ids 1..1682; it provides the
# full set of row/column labels for the combine below.
df = pd.DataFrame(0, columns=list(range(1,1683)), index=list(range(1,944)))
# Column-wise np.maximum of the zero frame and the ratings. In the displayed
# result, entries without a training rating show as NaN rather than 0.
df = df.combine(df_read, np.maximum)
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Empty cluster x movie table (one row per name in `Cluster`, one column per
# movie column of `df`); it is filled in by a later cell.
df2 = pd.DataFrame(index = Cluster, columns = df.columns)
df2

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
cluster0,,,,,,,,,,,...,,,,,,,,,,
cluster1,,,,,,,,,,,...,,,,,,,,,,
cluster2,,,,,,,,,,,...,,,,,,,,,,
cluster3,,,,,,,,,,,...,,,,,,,,,,
cluster4,,,,,,,,,,,...,,,,,,,,,,
cluster5,,,,,,,,,,,...,,,,,,,,,,
cluster6,,,,,,,,,,,...,,,,,,,,,,


In [None]:
def find_similar_movie(cluster_rate, mid, df_item):
    """List the movies whose genre flags are identical to those of ``mid``.

    Args:
        cluster_rate: Series (or similar) whose index holds the candidate
            movie ids.
        mid: Id of the reference movie.
        df_item: Item table; every column from position 5 onwards is treated
            as a genre flag, and movie ``m`` is looked up at row ``m - 1``.

    Returns:
        Candidate ids, in index order, with exactly the same genre row as
        ``mid``; ``mid`` itself is excluded.
    """
    genre_matrix = df_item.iloc[:, 5:].to_numpy()
    target = genre_matrix[mid - 1]
    return [
        candidate
        for candidate in cluster_rate.index
        if candidate != mid
        and np.array_equal(target, genre_matrix[candidate - 1])
    ]
def check_user_exist(Movies, users, df):
    """Tell whether any of ``users`` has a rating for any of ``Movies``.

    Args:
        Movies: Column labels (movie ids) of ``df`` to inspect.
        users: Row labels (user ids) of ``df`` to inspect.
        df: User x movie rating matrix with NaN for missing ratings.

    Returns:
        True if at least one non-null value exists in the selected block,
        False otherwise (including when ``Movies`` is empty).
    """
    # Select with the `Movies` argument; the previous body read the global
    # `S_M` instead and ignored the parameter.
    return bool(df.loc[users, Movies].notna().any().any())

In [None]:
# Fill the cluster x movie table `df2` with each cluster's mean rating.
dataPath = 'datasets/ml-100k/'
df_item = pd.read_csv(dataPath + 'u.item', sep='\\|', engine='python',
                      names=['MID', 'title', 'rdate', 'vdate', 'URL', 'unknown', 'Action', 'Adventure', 'Animation',
                              'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
                              'Western'], encoding='latin-1')

# Each (cluster, movie) cell is independent, so the cluster loop is outermost
# and the membership lookup and cluster-wide mean are computed once per cluster.
for cluster in Cluster:
    user = df1.index[df1[cluster] == 1]
    cluster_ratings = df.loc[user]
    # Fallback value: mean of every rating given by the cluster's users.
    cluster_mean = np.nanmean(cluster_ratings.to_numpy(dtype=float))

    for Movie in df2.columns:
        movie_ratings = cluster_ratings[Movie]
        if movie_ratings.notna().any():
            # 1) Somebody in the cluster rated this movie: use their mean.
            df2.loc[cluster, Movie] = np.nanmean(movie_ratings)
            continue

        # 2) Otherwise average the cluster's ratings of movies with the same
        #    genre flags. `S_M` stays a module-level name on purpose.
        S_M = find_similar_movie(df2.loc[cluster], Movie, df_item)
        if check_user_exist(S_M, user, df):
            df2.loc[cluster, Movie] = np.nanmean(
                cluster_ratings[S_M].to_numpy(dtype=float))
        else:
            # 3) No usable ratings at all: fall back to the cluster mean.
            df2.loc[cluster, Movie] = cluster_mean

# df2 was created without data (object dtype); make it numeric for the
# matrix product and error metrics that follow.
df2 = df2.astype(float)

In [None]:
# Matrix product of the user x cluster matrix (df1) and the cluster x movie
# table (df2), giving one predicted value per user and movie.
prediction = df1.dot(df2)
# Displayed as the cell result.
prediction

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,4.026316,2.925926,3.428571,3.638889,3.176471,2.666667,3.943662,3.916667,3.916667,3.619048,...,3.357945,3.476718,3.476718,3.476718,3.476718,3.476718,3.333333,3.582908,3.079374,3.476718
2,3.88,3.5,3.5,3.0,3.5,4.25,3.470588,3.9375,4.321429,4.083333,...,3.495098,3.731511,3.731511,3.731511,3.731511,3.731511,3.785714,3.748111,3.480645,3.731511
3,4.026316,2.925926,3.428571,3.638889,3.176471,2.666667,3.943662,3.916667,3.916667,3.619048,...,3.357945,3.476718,3.476718,3.476718,3.476718,3.476718,3.333333,3.582908,3.079374,3.476718
4,4.0,2.954545,2.454545,3.62069,3.416667,5.0,3.606557,4.034483,3.90625,3.909091,...,3.367619,3.729849,3.729849,3.729849,3.729849,3.729849,3.692308,3.622061,3.273204,3.729849
5,3.54902,3.0,2.714286,3.625,3.0,5.0,3.5,4.166667,3.794118,3.769231,...,3.489305,3.80012,3.80012,3.80012,3.80012,3.80012,3.928571,3.701068,3.280039,3.80012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,3.96875,3.55,3.0,3.46875,3.230769,3.333333,3.8125,3.885714,3.672727,4.142857,...,3.398077,3.656818,3.656818,3.656818,3.656818,3.656818,3.636364,3.598734,3.319563,3.656818
940,3.637931,3.444444,3.352941,3.53125,3.307692,3.666667,3.818182,3.923077,3.935484,4.0,...,3.480851,3.754307,3.754307,3.754307,3.754307,3.754307,3.75,3.683301,3.292576,3.754307
941,3.637931,3.444444,3.352941,3.53125,3.307692,3.666667,3.818182,3.923077,3.935484,4.0,...,3.480851,3.754307,3.754307,3.754307,3.754307,3.754307,3.75,3.683301,3.292576,3.754307
942,3.88,3.5,3.5,3.0,3.5,4.25,3.470588,3.9375,4.321429,4.083333,...,3.495098,3.731511,3.731511,3.731511,3.731511,3.731511,3.785714,3.748111,3.480645,3.731511


In [None]:
# Root of the NaN-ignoring mean of squared differences between the held-out
# ratings (df_test, returned by load_data) and the predictions.
RMSE = np.sqrt(np.nanmean(np.square(df_test - prediction)))
RMSE

1.0587807320006368

In [None]:
# NaN-ignoring mean of absolute differences between the held-out ratings
# (df_test) and the predictions.
MAE = (np.nanmean(abs(df_test - prediction)))
MAE

0.8451033321730327