# CUHK STAT3009 Recommender Systems Project 2

## Import datasets and loss metric

In [None]:
import numpy as np
import pandas as pd

# Load the three CSV inputs, keeping only the listed columns.
# The test file is read without a 'rating' column; the network file is an
# edge list with one ('from', 'to') pair per row.
dtrain = pd.read_csv('train.csv', usecols=['userID', 'itemID', 'genre', 'rating', 'date'])
dtest = pd.read_csv('test.csv', usecols=['userID', 'itemID', 'genre', 'date'])
dnetwork = pd.read_csv('user_social_net.csv', usecols=['from', 'to'])

def rmse(true, pred):
    """Return the root-mean-squared error between `true` and `pred`.

    Both arguments may be any array-like (list, tuple, ndarray, ...). They are
    converted to float arrays first, so plain Python sequences work as well as
    NumPy arrays; the comparison is element-wise by position.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return np.sqrt(np.mean((pred - true) ** 2))

### Overview of the dataset

In [None]:
# Peek at five random rows of the training data.
dtrain.sample(5)

In [None]:
# Summary of the training frame (columns, dtypes, non-null counts).
dtrain.info()

In [None]:
# Peek at five random rows of the test data.
dtest.sample(5)

In [None]:
# Summary of the test frame (columns, dtypes, non-null counts).
dtest.info()

In [None]:
# Peek at five random edges of the social network.
dnetwork.sample(5)

In [None]:
# Summary of the network frame (columns, dtypes, non-null counts).
dnetwork.info()

### Network dataset

In [None]:
# Users that appear in exactly one of {social network, train set}.
network_users = set(dnetwork["from"]) | set(dnetwork["to"])
train_users = set(dtrain["userID"])
print("No. of unique users only exist in social network or in train dataset:",
      len(network_users ^ train_users))

# Self-loops: edges whose source and target are the same user.
self_follow = dnetwork['from'] == dnetwork['to']
print("No. of users follow themselves: ", len(dnetwork[self_follow]))

In [None]:
# How many distinct train users appear as an edge source / edge target.
train_ids = dtrain['userID']
n_followers = len(pd.unique(train_ids[train_ids.isin(dnetwork["from"])]))
n_followees = len(pd.unique(train_ids[train_ids.isin(dnetwork["to"])]))
print('No. of unique users in train dataset following others: ', n_followers)
print('No. of unique users in train dataset followed by others: ', n_followees)

### Genre

In [None]:
# Number of distinct genres in train, and whether test uses exactly the same set.
n_train_genres = len(dtrain["genre"].unique())
print("Total No. of unique genre in train dataset: ", n_train_genres)

genres_match = set(dtrain["genre"]) == set(dtest["genre"].unique())
print("Same unique genre in both datasets: ", genres_match)

In [None]:
# List the distinct genre labels of the training data in sorted order.
print("The genre are ", np.sort(dtrain["genre"].unique()))

In [None]:
# Per-genre row counts in train and test (a row is one rating / one test query).
for genre_name in np.sort(dtrain["genre"].unique()):
    n_train_rows = len(dtrain.loc[dtrain['genre'] == genre_name, "itemID"])
    n_test_rows = len(dtest.loc[dtest['genre'] == genre_name, "itemID"])
    print(f"For {genre_name}, there are {n_train_rows} items in train dataset")
    print(f"For {genre_name}, there are {n_test_rows} items in test dataset\n")

### Date

In [None]:
# 'date' is still the raw value read from the CSV here (it is parsed to
# datetime in a later cell); min/max give the earliest and latest value.
# The second line reports the TEST set (the original label wrongly said "train").
print("Range of dates in train data: " + min(dtrain['date']) + " - " + max(dtrain["date"]))
print("Range of dates in test data: " + min(dtest['date']) + " - " + max(dtest["date"]))

In [None]:
# Smallest and largest rating present in the training data.
print("Range of rating: %f - %f" %(min(dtrain['rating']), max(dtrain['rating'])))

## Data Cleansing

### Convert 'date' from object to datetime object

In [None]:
# Parse the 'YYYY-MM' strings into pandas datetimes (in place, same column).
dtrain['date'] = pd.to_datetime(dtrain['date'], format = '%Y-%m')
dtest['date'] = pd.to_datetime(dtest['date'], format = '%Y-%m')

In [None]:
# Month-of-year (1-12) extracted from the parsed date, stored as a categorical column.
dtrain["month"] = dtrain['date'].dt.month.astype('category')
dtest["month"] = dtest['date'].dt.month.astype('category')

### Encoding for all features

#### Label Encoding

Label (ordinal) encoding is the simplest way to encode categorical data. It ensures a one-to-one mapping, but it makes the encoded values linearly dependent on each other. Every `userID` in the train, test and network datasets is converted into ordinal data. `itemID`, `genre` and `month` in both the train and test datasets are converted as well.

In [None]:
cat_cols = ['userID', 'itemID', 'genre', "month"]
from sklearn.preprocessing import LabelEncoder


# One LabelEncoder per categorical column, fitted on the union of every table
# that contains the column so a raw value maps to the same integer everywhere.
for col in cat_cols:
    is_user = col == "userID"
    fit_sources = [dtrain[col], dtest[col]]
    if is_user:
        # User ids also occur on both ends of the social-network edges.
        fit_sources += [dnetwork["from"], dnetwork["to"]]

    encoder = LabelEncoder()
    encoder.fit(np.concatenate(fit_sources))

    if is_user:
        dnetwork["from_ordinal"] = encoder.transform(dnetwork["from"])
        dnetwork["to_ordinal"] = encoder.transform(dnetwork["to"])
    ordinal_name = f"{col}_ordinal"
    dtrain[ordinal_name] = encoder.transform(dtrain[col])
    dtest[ordinal_name] = encoder.transform(dtest[col])

In [None]:
# Check the newly added 'from_ordinal' / 'to_ordinal' columns on a few edges.
dnetwork.sample(5)

## Feature Engineering

### Frequency encoding

Frequency encoding labels each value by how often it occurs (the code below counts occurrences over the train and test datasets together). The frequency of each feature is obtained with this method, which reduces the chance of linear dependency compared with label encoding. All data in both the train and test datasets are converted, but the network dataset is not, because the encoding is not injective.

In [None]:
from category_encoders import CountEncoder

cat_cols = ['userID', 'itemID', 'genre', 'month']
for col in cat_cols:
    ordinal = f"{col}_ordinal"
    # Occurrences of every encoded value over train and test together.
    counts = pd.concat([dtrain[ordinal], dtest[ordinal]]).value_counts()
    # Whole-column assignment replaces the previous chained assignment
    # (`df[c][mask] = v`), which writes to a temporary under pandas
    # copy-on-write. It also gives values that occur only in the test set
    # their real count instead of leaving them at 0.
    dtrain[f"{col}_freq"] = dtrain[ordinal].map(counts).astype(int)
    dtest[f"{col}_freq"] = dtest[ordinal].map(counts).astype(int)

In [None]:
# Inspect a few training rows with the new '*_freq' columns.
dtrain.sample(5)

### Mean Encoding on all features

This is also known as target encoding, which labels the data by their mean rating in the train dataset. The mean of each feature is obtained with this method, which reduces the chance of linear dependency compared with label encoding and frequency encoding.

In [None]:
from category_encoders import TargetEncoder

cat_cols = ['userID', 'itemID', 'genre', 'month']
for col in cat_cols:
    # A fresh encoder per column, fitted on the training ratings only.
    encoder = TargetEncoder()
    dtrain[f'{col}_mean'] = encoder.fit_transform(dtrain[f"{col}"], dtrain['rating'])
    # The test set has no ratings, so it is only transformed with the fitted encoder.
    dtest[f'{col}_mean'] = encoder.transform(dtest[f"{col}"])

### James-Stein Encoding

This method labels the data by their James-Stein estimator in train dataset.

In [None]:
from category_encoders import JamesSteinEncoder

cat_cols = ['userID', 'itemID', 'genre', 'month']
for col in cat_cols:
    # A fresh James-Stein encoder per column, fitted on the training ratings only.
    encoder = JamesSteinEncoder()
    dtrain[f'{col}_JS'] = encoder.fit_transform(dtrain[f"{col}"], dtrain['rating'])
    # The test set has no ratings, so it is only transformed with the fitted encoder.
    dtest[f'{col}_JS'] = encoder.transform(dtest[f"{col}"])

In [None]:
# Five random training rows, transposed so the many feature columns read as rows.
dtrain.sample(5).T

In [None]:
# Five random test rows, transposed so the many feature columns read as rows.
dtest.sample(5).T

### Store the number of user, item, genre and month 

In [None]:
# Cardinalities used to size lookup tables: largest ordinal seen in train or
# test, plus one (ordinals start at 0).
n_user = int(max(dtrain["userID_ordinal"].max(), dtest["userID_ordinal"].max())) + 1
n_item = int(max(dtrain["itemID_ordinal"].max(), dtest["itemID_ordinal"].max())) + 1
n_genre = int(max(dtrain["genre_ordinal"].max(), dtest["genre_ordinal"].max())) + 1
# Months are fixed at twelve regardless of which months occur in the data.
n_month = 12

### Create mappings for user A following user B / user A followed by user B

In [None]:
# following[u]  : ordinals of the users that user u follows (edge u -> v)
# followedby[u] : ordinals of the users that follow user u (edge v -> u)
# Both lists are indexed by user ordinal and hold one NumPy array per user.
edge_src = dnetwork["from_ordinal"]
edge_dst = dnetwork["to_ordinal"]
following = [np.array(edge_dst[edge_src == u]) for u in range(0, n_user)]
followedby = [np.array(edge_src[edge_dst == u]) for u in range(0, n_user)]

### Following/Follower numbers of each user

In [None]:
# Number of users each row's user follows / is followed by.
# Built with a whole-column `map` instead of chained assignment
# (`df[c][mask] = v`), which writes to a temporary under pandas copy-on-write
# and needed one full-frame scan per user.
for frame in (dtrain, dtest):
    user_ids = frame["userID_ordinal"]
    frame["following_no"] = user_ids.map(lambda u: len(following[u]))
    frame["followedby_no"] = user_ids.map(lambda u: len(followedby[u]))

In [None]:
# Sample training rows whose user follows at least one other user.
dtrain[dtrain["following_no"] != 0].sample(5).T

In [None]:
# Sample test rows whose user has at least one follower.
dtest[dtest["followedby_no"] != 0].sample(5).T

### Following Mean and Follower Mean

In [None]:
# Mean training rating given by the users each user follows ("following_mean")
# and by the users who follow them ("follower_mean").
#   * users without such neighbours get 0
#   * users whose neighbours have no training ratings get NaN (mean of an
#     empty selection)
# Each column is assigned in one step; the previous chained assignment
# (`df[c][mask] = v`) writes to a temporary under pandas copy-on-write and
# pushed floats into an int-initialised column.
train_user_ids = dtrain["userID_ordinal"]
train_ratings = dtrain["rating"]

for feature, neighbours in (("following_mean", following),
                            ("follower_mean", followedby)):
    neighbour_mean = {
        u: train_ratings[train_user_ids.isin(neighbours[u])].mean()
        for u in train_user_ids.unique()
        if len(neighbours[u]) > 0
    }
    dtrain[feature] = train_user_ids.map(
        lambda u, means=neighbour_mean: means.get(u, 0.0)
    )



In [None]:
# Re-check dtypes and non-null counts after adding the neighbour-mean columns.
dtrain.info()

In [None]:
# Sample training rows with a non-zero following_mean (NaN rows also pass `!= 0`).
dtrain[dtrain["following_mean"] != 0].sample(5).T

In [None]:
# Sample training rows with a non-zero follower_mean (NaN rows also pass `!= 0`).
dtrain[dtrain["follower_mean"] != 0].sample(5).T

In [None]:
# Same neighbour-mean features for the test rows. The means are always taken
# over TRAINING ratings, because the test set has no ratings.
#   * users without such neighbours get 0
#   * users whose neighbours have no training ratings get NaN
# Each column is assigned in one step instead of via chained assignment
# (`df[c][mask] = v`), which writes to a temporary under pandas copy-on-write.
train_user_ids = dtrain["userID_ordinal"]
train_ratings = dtrain["rating"]
test_user_ids = dtest["userID_ordinal"]

for feature, neighbours in (("following_mean", following),
                            ("follower_mean", followedby)):
    neighbour_mean = {
        u: train_ratings[train_user_ids.isin(neighbours[u])].mean()
        for u in test_user_ids.unique()
        if len(neighbours[u]) > 0
    }
    dtest[feature] = test_user_ids.map(
        lambda u, means=neighbour_mean: means.get(u, 0.0)
    )



In [None]:
# Sample test rows with a non-zero following_mean (NaN rows also pass `!= 0`).
dtest[dtest["following_mean"] != 0].sample(5).T

In [None]:
# Sample test rows with a non-zero follower_mean (NaN rows also pass `!= 0`).
dtest[dtest["follower_mean"] != 0].sample(5).T

### Networkx for handling graph

It is a strong assumption that the rating to each item given by the users is highly influenced by their friends (following users/ followers). It is decided to consider the users as nodes and consider the connection to their respective following users/ followers as edges. 

In [None]:
# Keep only the ordinal edge columns; the raw 'from'/'to' ids are no longer needed.
dnetwork = dnetwork.drop(labels=["from", "to"], axis=1)
dnetwork

#### Undirected assumption (i.e. A follows B $\Leftrightarrow$ A is followed by B $\Leftrightarrow$ edge $A \leftrightarrow B$)

In [None]:
# Undirected adjacency: a user's neighbours are everyone they follow plus
# everyone who follows them. Every user ordinal gets a key, even when the
# neighbour set is empty.
G = {
    u: set(np.append(following[u], followedby[u]))
    for u in range(n_user)
}

In [None]:
import networkx as nx
# Rebind G from the adjacency dict to an undirected networkx graph built from it.
G = nx.Graph(G)

In [None]:
# Draw the undirected graph with small node markers.
nx.draw(G, node_size=10)

In [None]:
import matplotlib.pyplot as plt

def plot_degree_dist(G):
    """Show a 100-bin histogram of the degree of every node in graph `G`."""
    degree_values = []
    for node in G.nodes():
        degree_values.append(G.degree(node))
    plt.hist(degree_values, bins=100)
    plt.show()

plot_degree_dist(G)

#### Directed assumption (i.e. A follows B $\Leftrightarrow$ edge $A \rightarrow B$ $\not\Leftrightarrow$ A is followed by B )

In [None]:
# Directed adjacency: user u points at each user they follow. Every user
# ordinal gets a key, even when they follow nobody.
H = {u: set(following[u]) for u in range(n_user)}

In [None]:
# Rebind H from the adjacency dict to a directed networkx graph built from it.
H = nx.DiGraph(H)

In [None]:
# Draw the directed graph with small node markers.
nx.draw(H, node_size=10)

In [None]:
# Degree histogram of the directed graph (same helper as for the undirected one).
plot_degree_dist(H)

#### Calculation indicators from the theory of complex graphs

The centrality of proximity: This indicator makes it possible to detect the individuals who have a significant power on the transfer of information. Individuals with a large centralized proximity have the ability to contact a very large number of individuals easily

The betweeness centrality: This indicator can detect individuals who influence the transfer of information. If these individuals do not exist in the network, then the information can not flow on both sides of the network.

The eigenvector centrality: The individuals having a high spectral centralized are the individuals who have the most relation in the network, they are central and have influence in a general way on the network.

In [None]:
# Degree
# Node degree of every row's user in the undirected graph G and the directed
# graph H. Built with a whole-column `map` instead of chained assignment
# (`df[c][mask] = v`), which writes to a temporary under pandas copy-on-write
# and scanned both frames once per user.
undirected_degree = dict(G.degree())
directed_degree = dict(H.degree())

for frame in (dtrain, dtest):
    frame['undirected_degree'] = frame['userID_ordinal'].map(undirected_degree)
    frame['directed_degree'] = frame['userID_ordinal'].map(directed_degree)

In [None]:
# Betweenness Centrality
# Each score dict (node -> value) becomes a two-column frame keyed by
# 'userID_ordinal' and is joined onto the train and test rows.

# Undirected
bet_cen1 = nx.betweenness_centrality(G)
df_bet_cen1 = (
    pd.Series(bet_cen1, name='undirected_betweenness_centrality')
    .rename_axis('userID_ordinal')
    .reset_index()
)
dtrain = dtrain.merge(df_bet_cen1, on=['userID_ordinal'])
dtest = dtest.merge(df_bet_cen1, on=['userID_ordinal'])

# Directed
bet_cen2 = nx.betweenness_centrality(H)
df_bet_cen2 = (
    pd.Series(bet_cen2, name='directed_betweenness_centrality')
    .rename_axis('userID_ordinal')
    .reset_index()
)
dtrain = dtrain.merge(df_bet_cen2, on=['userID_ordinal'])
dtest = dtest.merge(df_bet_cen2, on=['userID_ordinal'])

In [None]:
# Clustering coefficient
# Each score dict (node -> value) becomes a two-column frame keyed by
# 'userID_ordinal' and is joined onto the train and test rows.

# Undirected
clust_coeff1 = nx.clustering(G)
df_clust1 = (
    pd.Series(clust_coeff1, name='undirected_clust_coefficient')
    .rename_axis('userID_ordinal')
    .reset_index()
)
dtrain = dtrain.merge(df_clust1, on=['userID_ordinal'])
dtest = dtest.merge(df_clust1, on=['userID_ordinal'])

# Directed
clust_coeff2 = nx.clustering(H)
df_clust2 = (
    pd.Series(clust_coeff2, name='directed_clust_coefficient')
    .rename_axis('userID_ordinal')
    .reset_index()
)
dtrain = dtrain.merge(df_clust2, on=['userID_ordinal'])
dtest = dtest.merge(df_clust2, on=['userID_ordinal'])

In [None]:
# Closeness centrality
# Each score dict (node -> value) becomes a two-column frame keyed by
# 'userID_ordinal' and is joined onto the train and test rows.

# Undirected
clo_cen1 = nx.closeness_centrality(G)
df_clo1 = (
    pd.Series(clo_cen1, name='undirected_closeness_centrality')
    .rename_axis('userID_ordinal')
    .reset_index()
)
dtrain = dtrain.merge(df_clo1, on=['userID_ordinal'])
dtest = dtest.merge(df_clo1, on=['userID_ordinal'])

# Directed
clo_cen2 = nx.closeness_centrality(H)
df_clo2 = (
    pd.Series(clo_cen2, name='directed_closeness_centrality')
    .rename_axis('userID_ordinal')
    .reset_index()
)
dtrain = dtrain.merge(df_clo2, on=['userID_ordinal'])
dtest = dtest.merge(df_clo2, on=['userID_ordinal'])

In [None]:
# Eigenvector centrality
# Each score dict (node -> value) becomes a two-column frame keyed by
# 'userID_ordinal' and is joined onto the train and test rows.

# Undirected
eig_cen1 = nx.eigenvector_centrality_numpy(G)
df_eig1 = (
    pd.Series(eig_cen1, name='undirected_eigenvector_centrality')
    .rename_axis('userID_ordinal')
    .reset_index()
)
dtrain = dtrain.merge(df_eig1, on=['userID_ordinal'])
dtest = dtest.merge(df_eig1, on=['userID_ordinal'])

# Directed
eig_cen2 = nx.eigenvector_centrality_numpy(H)
df_eig2 = (
    pd.Series(eig_cen2, name='directed_eigenvector_centrality')
    .rename_axis('userID_ordinal')
    .reset_index()
)
dtrain = dtrain.merge(df_eig2, on=['userID_ordinal'])
dtest = dtest.merge(df_eig2, on=['userID_ordinal'])

In [None]:
# Degree centrality
# Each score dict (node -> value) becomes a two-column frame keyed by
# 'userID_ordinal' and is joined onto the train and test rows.

# Undirected
deg_cen1 = nx.degree_centrality(G)
df_degcen1 = (
    pd.Series(deg_cen1, name='undirected_degree_centrality')
    .rename_axis('userID_ordinal')
    .reset_index()
)
dtrain = dtrain.merge(df_degcen1, on=['userID_ordinal'])
dtest = dtest.merge(df_degcen1, on=['userID_ordinal'])

# Directed
deg_cen2 = nx.degree_centrality(H)
df_degcen2 = (
    pd.Series(deg_cen2, name='directed_degree_centrality')
    .rename_axis('userID_ordinal')
    .reset_index()
)
dtrain = dtrain.merge(df_degcen2, on=['userID_ordinal'])
dtest = dtest.merge(df_degcen2, on=['userID_ordinal'])

In [None]:
# Inspect training rows with a non-zero follower_mean after the graph features were merged in.
dtrain[dtrain['follower_mean'] != 0].sample(5).T

In [None]:
# Inspect test rows with a non-zero following_mean after the graph features were merged in.
dtest[dtest['following_mean'] != 0].sample(5).T

## Update the dataset

In [None]:
# Replace the NaN neighbour means (neighbours without any training rating) by 0.
# Assign the filled column back: `df[col].fillna(..., inplace=True)` operates
# on a temporary Series under pandas copy-on-write and may leave the frame
# untouched.
for frame in (dtrain, dtest):
    for feature in ("following_mean", "follower_mean"):
        frame[feature] = frame[feature].fillna(0)

In [None]:
# Drop the raw categorical columns; their encoded versions (*_ordinal, *_freq, ...) stay.
dtrain = dtrain.drop(labels=['userID', 'itemID', 'genre', "month"],axis=1)

In [None]:
# Drop the raw categorical columns; their encoded versions (*_ordinal, *_freq, ...) stay.
dtest = dtest.drop(labels=['userID', 'itemID', 'genre', "month"], axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Graph-derived columns that are standardised.
network_cols = ['undirected_degree', 'directed_degree',
                'undirected_betweenness_centrality', 'directed_betweenness_centrality',
                'undirected_clust_coefficient', 'directed_clust_coefficient',
                'undirected_closeness_centrality', 'directed_closeness_centrality',
                'undirected_eigenvector_centrality', 'directed_eigenvector_centrality',
                'undirected_degree_centrality', 'directed_degree_centrality']

scaler = StandardScaler()
df = dtrain[network_cols]

# Fit the scaler on the training rows only ...
dtrain[network_cols] = scaler.fit_transform(df)
# ... and apply the same train statistics to the test rows. Previously the
# test columns were left unscaled, so train and test features were on
# different scales.
dtest[network_cols] = scaler.transform(dtest[network_cols])

In [None]:
# Spot-check three random training rows of the graph-derived feature columns.
dtrain[['undirected_degree', 'directed_degree', 
        'undirected_betweenness_centrality', 'directed_betweenness_centrality', 
        'undirected_clust_coefficient', 'directed_clust_coefficient', 
        'undirected_closeness_centrality', 'directed_closeness_centrality', 
        'undirected_eigenvector_centrality', 'directed_eigenvector_centrality', 
        'undirected_degree_centrality', 'directed_degree_centrality']].sample(3).T

In [None]:
# Training target as a plain NumPy array (one rating per training row).
train_rating = np.array(dtrain["rating"].values)

## EDA

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Global plot settings for the whole EDA section.
plt.rcParams['figure.figsize'] = (20, 15)
sns.set_theme(style="ticks")

# Histogram of the ratings with one bin per distinct rating value.
n_rating_levels = len(dtrain['rating'].unique())
sns.histplot(data=dtrain, x='rating', bins=n_rating_levels)
plt.title("Rating")
plt.grid()
plt.show()

In [None]:
# Number of training rows whose rating is exactly 6.
print("No. of rating equals 6: ", len(dtrain['rating'][dtrain['rating'] == 6]))

The higher ratings have higher frequencies except the fact that there are only 43 ratings equal to 6.

In [None]:
# All rows of users who gave a 6 at least once (five random ones, transposed).
dtrain[dtrain["userID_ordinal"].isin(dtrain["userID_ordinal"][dtrain['rating'] == 6])].sample(5).T

In [None]:
# Users whose every training rating equals 6.
# One grouped pass replaces a loop over every row that re-filtered the frame
# each time, added the same user repeatedly, and relied on set iteration order
# (`list(set(...))[0]`) to decide whether a user had only rated 6.
rating_span = dtrain.groupby("userID_ordinal")["rating"].agg(["min", "max"])
only_six = (rating_span["min"] == 6) & (rating_span["max"] == 6)
all6users = list(rating_span.index[only_six])
dtrain[dtrain["userID_ordinal"].isin(all6users)]

After inspecting the users who gave rating 6, 3 of them rated only once and gave rating 6.

In [None]:
# Users whose every training rating equals 1.
# The original loop appended matches to `all6users` instead of `all1users`, so
# `all1users` was always empty, and its test only checked that a user's lowest
# rating was 1 rather than that all of them were.
rating_span = dtrain.groupby("userID_ordinal")["rating"].agg(["min", "max"])
only_one = (rating_span["min"] == 1) & (rating_span["max"] == 1)
all1users = list(rating_span.index[only_one])
dtrain[dtrain["userID_ordinal"].isin(all1users)]

There is no user only gives rating 1.

### Histogram for each feature means

In [None]:
figure, axis = plt.subplots(2, 2)

# 2x2 grid: one histogram of per-category mean ratings for each feature.
cat_cols = np.array(['userID', 'itemID', 'genre', 'month']).reshape(2,2)
for (row, column), col in np.ndenumerate(cat_cols):
    panel = axis[row, column]
    category_means = dtrain.groupby(f'{col}_ordinal')['rating'].mean()
    sns.histplot(category_means, ax = panel)
    panel.set_title(f"{col} Mean")
    panel.grid()

### Lineplot for each months mean

In [None]:
# Mean rating per calendar month (one point per distinct 'date' value).
mean_by_date = dtrain.groupby('date')['rating'].mean()
sns.lineplot(x=mean_by_date.keys(), y=mean_by_date)
plt.title("Monthly Mean (2012-03 - 2013-06)")
plt.grid()
plt.show()

It is unlikely that the ratings on each months in a year follows a particular cycle. Therefore, dropping `date` from datasets is sensible.

In [None]:
# Remove the full date; the month-of-year features derived from it are kept.
dtrain = dtrain.drop("date", axis=1)
dtest = dtest.drop("date", axis=1)

### Barplot and Violinplot for monthly mean

In [None]:
figure, axis = plt.subplots(ncols = 2)
bar_panel, violin_panel = axis[0], axis[1]

# Left: rating per month ordinal as a bar plot.
sns.barplot(x=dtrain['month_ordinal'], y=dtrain['rating'], ax=bar_panel)
bar_panel.grid()

# Right: full rating distribution per month ordinal.
sns.violinplot(x=dtrain["month_ordinal"], y=dtrain['rating'], ax=violin_panel)
violin_panel.grid()

Each month's mean rating is moderate, around 4; month 9 is the highest and month 8 is the lowest. The violin plot shows that the ratings of every month in a year have similar distributions.

### Rated Frequencies for each month

In [None]:
# Number of training ratings per month ordinal.
sns.histplot(data=dtrain, x='month_ordinal')
plt.grid()
plt.show()

In [None]:
# Count the ratings of month ordinal 4. The label used to say "month 7" while
# the filter selects 4.
print("No. of rating for month 4: ", len(dtrain['month_ordinal'][dtrain['month_ordinal'] == 4]))

In [None]:
# Rating counts of month ordinals 7 and 8.
month_codes = dtrain['month_ordinal']
for month in (7, 8):
    print(f"No. of rating for month {month}: ", len(month_codes[month_codes == month]))

Month 4 has the highest rated frequency, while months 7 and 8 have the lowest rated frequencies.

### Scatter plots for each month

In [None]:
figure, axis = plt.subplots(3, 4)
mon = np.arange(0,12).reshape(3, 4)

# One user-vs-item scatter panel per month ordinal on the 3x4 grid.
for (row, column), month in np.ndenumerate(mon):
    panel = axis[row, column]
    month_rows = dtrain[dtrain["month_ordinal"] == month]
    sns.scatterplot(data=month_rows, x="userID_ordinal", y="itemID_ordinal", ax=panel)
    panel.set_title(f"Month {month}")
    panel.grid()

In [None]:
# Histogram of the per-genre mean ratings.
genre_means = dtrain.groupby('genre_ordinal')['rating'].mean()
sns.histplot(genre_means)
plt.title("Genre Mean")
plt.grid()
plt.show()

### Barplot and Violinplot for each genre means

In [None]:
figure, axis = plt.subplots(ncols = 2)
bar_panel, violin_panel = axis[0], axis[1]

# Left: rating per genre ordinal as a bar plot.
sns.barplot(x=dtrain['genre_ordinal'], y=dtrain['rating'], ax=bar_panel)
bar_panel.grid()

# Right: full rating distribution per genre ordinal.
sns.violinplot(x=dtrain["genre_ordinal"], y=dtrain['rating'], ax=violin_panel)
violin_panel.grid()

Each genre's mean rating is moderate, around 4; genre 5 is the highest and genre 16 is the lowest.
The distributions of the genres are not similar, especially for genres 5, 6 and 10.

### Rated frequency for each genre

In [None]:
# Number of training ratings per genre ordinal.
sns.histplot(data=dtrain, x='genre_ordinal')
plt.grid()
plt.show()

In [None]:
# Rating counts of genre ordinals 6, 7 and 10.
genre_codes = dtrain['genre_ordinal']
for genre in (6, 7, 10):
    print(f"No. of rating for genre {genre}: ", len(genre_codes[genre_codes == genre]))

Genre 6, 7, 10 have the fewest numbers of rating.

In [None]:
# Rating counts of genre ordinals 13 and 15.
genre_codes = dtrain['genre_ordinal']
for genre in (13, 15):
    print(f"No. of rating for genre {genre}: ", len(genre_codes[genre_codes == genre]))

Genre 13, 15 have the highest numbers of rating.

### Scatter plots for each genre

In [None]:
figure, axis = plt.subplots(4, 5)
gen = np.arange(0,20).reshape(4, 5)

# One user-vs-item scatter panel per genre ordinal (0-19) on the 4x5 grid.
for (row, column), genre in np.ndenumerate(gen):
    panel = axis[row, column]
    genre_rows = dtrain[dtrain["genre_ordinal"] == genre]
    sns.scatterplot(data=genre_rows, x="userID_ordinal", y="itemID_ordinal", ax=panel)
    panel.set_title(f"Genre {genre}")
    panel.grid()

### Barplot of monthly mean for each genre

In [None]:
# Rating per genre, split into one bar per month ordinal (hue).
sns.barplot(data=dtrain, x='genre_ordinal', y='rating', hue='month_ordinal')
plt.title("Monthly Mean of each genre")
plt.grid()
plt.show()

Genre 1, 5, 6, 7, 10 are not rated for all months.

### Scatter plots of userID/itemID against their rated frequencies

In [None]:
# user2item[u] : array of item ordinals rated by user u
# item2user[i] : array of user ordinals that rated item i
# A single grouped pass per mapping replaces a loop over every row that
# re-filtered the whole frame each time (quadratic in the number of rows).
# sort=False keeps the keys in order of first appearance, as before.
user2item = {
    u: items.to_numpy()
    for u, items in dtrain.groupby("userID_ordinal", sort=False)["itemID_ordinal"]
}

item2user = {
    i: users.to_numpy()
    for i, users in dtrain.groupby("itemID_ordinal", sort=False)["userID_ordinal"]
}

In [None]:
# Number of ratings given by each user / received by each item.
freq_user = {user: len(items) for user, items in user2item.items()}
freq_item = {item: len(users) for item, users in item2user.items()}

In [None]:
# x/y are passed by keyword and as lists: seaborn >= 0.12 rejects positional
# x/y, and dict views are not array-likes.
sns.scatterplot(x=list(freq_user.keys()), y=list(freq_user.values()))
plt.title("User ID vs rated freq.")
plt.grid()
plt.show()

In [None]:
# Users tied for the smallest number of ratings.
min_freq_user = min(freq_user.values())
lowerbound = [user for user, count in freq_user.items() if count == min_freq_user]
print(f"No. of users who least often ({min_freq_user:d} time(s)), give ratings : {len(lowerbound):d}")

In [None]:
# Users tied for the largest number of ratings; keep the first one for inspection.
max_freq_user = max(freq_user.values())
upperbound = [user for user, count in freq_user.items() if count == max_freq_user]
print(f"No. of users who most often ({max_freq_user:d} time(s)) give ratings : {len(upperbound):d}")
upperbound = upperbound[0]

In [None]:
# All training rows of the most active user selected above.
dtrain[dtrain['userID_ordinal'] == upperbound]

After inspecting the most often rated user, it is confirmed that he/she is not a rate spammer.

In [None]:
# x/y are passed by keyword and as lists: seaborn >= 0.12 rejects positional
# x/y, and dict views are not array-likes.
sns.scatterplot(x=list(freq_item.keys()), y=list(freq_item.values()))
plt.title("Item ID vs rated freq.")
plt.grid()
plt.show()

In [None]:
# Items tied for the smallest number of ratings.
# The message now talks about items; it previously said "users" although the
# counts come from freq_item.
min_freq_item = min(freq_item.values())
lowerbound = [key for key in freq_item.keys() if freq_item[key] == min_freq_item]
print("No. of items which are least often (%d time(s)) rated : %d" 
      %(min_freq_item, len(lowerbound)))

In [None]:
# Items tied for the largest number of ratings; keep the first one for inspection.
# The message now talks about items; it previously said "users" although the
# counts come from freq_item.
max_freq_item = max(freq_item.values())
upperbound = [key for key in freq_item.keys() if freq_item[key] == max_freq_item]
print("No. of items which are most often (%d time(s)) rated : %d" 
      %(max_freq_item, len(upperbound)))
upperbound = upperbound[0]

In [None]:
# All training rows of the most rated item selected above.
dtrain[dtrain['itemID_ordinal'] == upperbound]

After inspecting the most often rated item, it is confirmed that it is not spammed with high or low ratings.

### Scatter plots of userID in social network against their no. of following/being followed

In [None]:
# Number of users each user follows / is followed by, keyed by user ordinal.
freq_following = {user: len(following[user]) for user in range(0, n_user)}
freq_followed = {user: len(followedby[user]) for user in range(0, n_user)}

In [None]:
# x/y are passed by keyword and as lists: seaborn >= 0.12 rejects positional
# x/y, and dict views are not array-likes.
sns.scatterplot(x=list(freq_following.keys()), y=list(freq_following.values()))
plt.title("userID vs. No. of following")
plt.grid()
plt.show()

In [None]:
# Users at the minimum following count, and users one above it.
# NOTE(review): the first message reads correctly only if that minimum is 0 — confirm.
min_following = min(freq_following.values())
lowerbound = [user for user, n in freq_following.items() if n == min_following]
following_lowerbound = [user for user, n in freq_following.items()
                        if n == min_following + 1]
print(f"No. of users who do not follow others : {len(lowerbound):d}")
print(f"No. of users who least often ({min_following + 1:d} time(s)) to follow others : "
      f"{len(following_lowerbound):d}")

In [None]:
# Users tied for following the most others; keep the first one for inspection.
max_following = max(freq_following.values())
upperbound = [user for user, n in freq_following.items() if n == max_following]
print(f"No. of users who are most often ({max_following:d} time(s)) to follow others : "
      f"{len(upperbound):d}")
upperbound = upperbound[0]

In [None]:
# Outgoing edges of the user selected above (everyone they follow).
dnetwork[dnetwork['from_ordinal'] == upperbound]

In [None]:
# x/y are passed by keyword and as lists: seaborn >= 0.12 rejects positional
# x/y, and dict views are not array-likes.
sns.scatterplot(x=list(freq_followed.keys()), y=list(freq_followed.values()))
plt.title("userID vs. No. of being followed")
plt.grid()
plt.show()

In [None]:
# Users at the minimum follower count, and users one above it.
# NOTE(review): the first message reads correctly only if that minimum is 0 — confirm.
min_followed = min(freq_followed.values())
lowerbound = [user for user, n in freq_followed.items() if n == min_followed]
followed_lowerbound = [user for user, n in freq_followed.items()
                       if n == min_followed + 1]
print(f"No. of users who are not followed by others : {len(lowerbound):d}")
print(f"No. of users who are least often ({min_followed + 1:d} time) to be followed by others : "
      f"{len(followed_lowerbound):d}")

In [None]:
# Users tied for having the most followers; keep the first one for inspection.
max_followed = max(freq_followed.values())
upperbound = [user for user, n in freq_followed.items() if n == max_followed]
print(f"No. of users who are most often ({max_followed:d} time(s)) to be followed by others : "
      f"{len(upperbound):d}")
upperbound = upperbound[0]

In [None]:
# Incoming edges of the user selected above (everyone who follows them).
dnetwork[dnetwork['to_ordinal'] == upperbound]

In [None]:
# Final look at the engineered training columns before modelling.
dtrain.sample(5).T

## Analysis using feature means

### Data pre-processing

In [None]:
# Integer (user, item, genre, month) index tuples for train and test, plus the
# training ratings, all as NumPy arrays.
index_cols = ['userID_ordinal', 'itemID_ordinal', 'genre_ordinal', 'month_ordinal']
train_tuple = dtrain[index_cols].astype(int).to_numpy()
train_rating = dtrain['rating'].to_numpy(copy=True)
test_tuple = dtest[index_cols].astype(int).to_numpy()

### Global Mean

In [None]:
# Global-mean baseline from the project-local `baseline` module, fitted and
# evaluated on the training data (so this is a training RMSE).
from baseline import glb_mean
glb_ave = glb_mean()
glb_ave.fit(train_rating)
pred_rating = glb_ave.predict(train_tuple)
print(f"RMSE for global mean: {rmse(pred_rating, train_rating)}")

### User Mean

In [None]:
# User-mean baseline, fitted and evaluated on the training data.
# NOTE(review): constructed with n_user + 1 while genre_mean/month_mean get the
# plain count — presumably a spare slot; confirm against baseline.user_mean.
from baseline import user_mean
user_ave = user_mean(n_user + 1)
user_ave.fit(train_tuple, train_rating)
pred_rating = user_ave.predict(train_tuple)
print(f"RMSE for user mean: {rmse(pred_rating, train_rating)}")

#### James-Stein Estimator for user mean

In [None]:
# Use the James-Stein encoded user value directly as the prediction.
# The label now names the estimator so it is not confused with the plain
# user-mean RMSE.
pred_rating = np.array(dtrain["userID_JS"].values)
print(f"RMSE for user mean (James-Stein): {rmse(pred_rating, train_rating)}")

### Item Mean

In [None]:
# Item-mean baseline, fitted and evaluated on the training data.
# NOTE(review): constructed with n_item + 1 while genre_mean/month_mean get the
# plain count — presumably a spare slot; confirm against baseline.item_mean.
from baseline import item_mean
item_ave = item_mean(n_item + 1)
item_ave.fit(train_tuple, train_rating)
pred_rating = item_ave.predict(train_tuple)
print(f"RMSE for item mean: {rmse(pred_rating, train_rating)}")

#### James-Stein Estimator for item mean

In [None]:
# Use the James-Stein encoded item value directly as the prediction.
# The label previously said "user mean" although the item column is evaluated.
pred_rating = np.array(dtrain["itemID_JS"].values)
print(f"RMSE for item mean (James-Stein): {rmse(pred_rating, train_rating)}")

### Genre Mean

In [None]:
# Genre-mean baseline, fitted and evaluated on the training data.
from baseline import genre_mean
genre_ave = genre_mean(n_genre)
genre_ave.fit(train_tuple, train_rating)
pred_rating = genre_ave.predict(train_tuple)
print(f"RMSE for genre mean: {rmse(pred_rating, train_rating)}")

#### James-Stein Estimator for genre mean

In [None]:
# Use the James-Stein encoded genre value directly as the prediction.
# The label previously said "user mean" although the genre column is evaluated.
pred_rating = np.array(dtrain["genre_JS"].values)
print(f"RMSE for genre mean (James-Stein): {rmse(pred_rating, train_rating)}")

### Month mean

In [None]:
# Month-mean baseline, fitted and evaluated on the training data.
from baseline import month_mean
month_ave = month_mean(n_month)
month_ave.fit(train_tuple, train_rating)
pred_rating = month_ave.predict(train_tuple)
print(f"RMSE for month mean: {rmse(pred_rating, train_rating)}")

#### James-Stein Estimator for month mean

In [None]:
# Use the James-Stein encoded month value directly as the prediction.
# The label previously said "user mean" although the month column is evaluated.
pred_rating = np.array(dtrain["month_JS"].values)
print(f"RMSE for month mean (James-Stein): {rmse(pred_rating, train_rating)}")

The user-mean and item-mean baselines give the lowest RMSE, while the month mean gives the largest. The James-Stein estimator gives a larger RMSE than the plain sample mean $\bar{x}$.

## Neural Network Model

### Latent Factor Model

In [None]:
import tensorflow as tf
from tensorflow import keras
from LFactorNet import LFactorNet

In [None]:
## compile
# Latent-factor network from the project-local LFactorNet module, sized by the
# number of users/items/genres/months, with 400-dimensional embeddings.
model = LFactorNet(num_users=n_user, num_items=n_item, 
                      num_genre=n_genre, num_month=n_month, 
                      embedding_size=400)

# The metric is named 'rmse', so Keras reports it as 'rmse' / 'val_rmse'.
metrics = [keras.metrics.RootMeanSquaredError(name='rmse')]

# Adam with learning rate 1e-3, trained on mean squared error.
model.compile(optimizer=keras.optimizers.Adam(1e-3), 
              loss=keras.losses.MeanSquaredError(),
              metrics=metrics
              )

In [None]:
# (user, item, genre, month) ordinal columns of the training rows as a NumPy array.
train_tuple = np.array(dtrain[['userID_ordinal', 'itemID_ordinal', 
                               'genre_ordinal', 'month_ordinal']].values)

In [None]:
# Stop once the validation RMSE has not improved for 3 epochs and restore the
# best weights. 'val_rmse' exists because the metric is named 'rmse' and
# validation_split is set below.
callbacks = [keras.callbacks.EarlyStopping(
    monitor='val_rmse', min_delta=0, patience=3, verbose=1,
    mode='min', baseline=None, restore_best_weights=True)]

# The callbacks list used to be built but never handed to fit(), so early
# stopping never ran and training always went through all 100 epochs.
history = model.fit(
    x=train_tuple,
    y=train_rating,
    callbacks=callbacks,
    batch_size=50,
    epochs=100,
    verbose=2,
    validation_split=.2)

In [None]:
# Same four ordinal columns for the test rows, then predict their ratings.
test_pair = np.array(dtest[['userID_ordinal', 'itemID_ordinal', 
                            'genre_ordinal', 'month_ordinal']].values)
pred_rating = model.predict(test_pair)



The score is 1.79166, which shows that using only latent factors is completely insufficient to predict ratings.

In [None]:
# Reminder of the available feature columns before building the two-tower inputs.
dtrain.sample(5).T

### TTowerRS with bias regularizer on dense layers and collective filtering layers

### TTowerRS CV

In [None]:
## user_cont_feat, item_cont_feat, user_cate_feat, item_cate_feat
# NOTE(review): the second entry (the slot labelled item_cont_feat above) holds
# the follower/following features of the USER, not item features such as
# itemID_freq/itemID_mean — confirm this is intended for TTowerRS.

train_input = [dtrain[['userID_freq', 'userID_mean', 'userID_JS', 'month_freq']].values,
               dtrain[['following_no', 'followedby_no', 'following_mean', 'follower_mean']].values,
               dtrain[['userID_ordinal', 'month_ordinal']].astype(int).values,
               dtrain[['itemID_ordinal', 'genre_ordinal']].astype(int).values
              ]

In [None]:
# Grid search over embedding and dense sizes {150, 200}, with every
# regularisation strength fixed at 1e-2, using the project-local TTowerRS_CV helper.
from TTowerRS import TTowerRS_CV, TTowerRS
tt_cv = TTowerRS_CV(n_user=n_user, n_item=n_item, n_genre=n_genre, n_month=n_month, embedding_sizes=[150, 200], dense_sizes=[150, 200], 
                 embed_regs=[1e-2], dense_regs=[1e-2], fc_regs=[1e-2])

tt_cv.grid_search(train_input, train_rating)

### Implementation with best parameter combination

In [None]:
# Final two-tower model with embedding/dense size 200 and all regularisers at 1e-2.
model = TTowerRS(n_user=n_user, n_item=n_item, n_genre=n_genre, n_month=n_month, 
                 embedding_size=200, dense_size=200, 
                 embed_reg=1e-2, dense_reg=1e-2, fc_reg=1e-2)

metrics = [
    keras.metrics.RootMeanSquaredError(name='rmse')
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-3), 
    loss=tf.keras.losses.MeanSquaredError(), 
    metrics=metrics
)

# No validation data is passed to fit() below, so early stopping watches the
# TRAINING rmse (patience 10) rather than a held-out score.
callbacks = [keras.callbacks.EarlyStopping( 
    monitor='rmse', min_delta=0, patience=10, verbose=1, 
    mode='min', baseline=None, restore_best_weights=True)]

# Train on the full training set.
history = model.fit(
    x=train_input,
    y=train_rating,
    callbacks=callbacks,
    batch_size=64,
    epochs=100,
    verbose=2
)



In [None]:
## user_cont_feat, item_cont_feat, user_cate_feat, item_cate_feat
# Same column groups, in the same order, as the TTowerRS training input.

test_input = [dtest[['userID_freq', 'userID_mean', 'userID_JS', 'month_freq']].values,
              dtest[['following_no', 'followedby_no', 'following_mean', 'follower_mean']].values,
              dtest[['userID_ordinal', 'month_ordinal']].astype(int).values, 
              dtest[['itemID_ordinal', 'genre_ordinal']].astype(int).values
             ]

In [None]:
# Predicted ratings for the test rows.
pred_rating = model.predict(test_input)

The score without network analysis indicator is 1.04297.

The score with network analysis indicator (as user continuous feature) is 1.14256.

In [None]:
# submission
sub = pd.Series(pred_rating.flatten())
sub = {"ID" : pd.Series(np.arange(0,13899)), "rating" : sub}
sub = pd.concat(sub, axis = 1)

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv("sub.csv", index = False)

### TTowerRS with modified structure 

In [None]:
## user_cont_feat, item_cont_feat, network_cont_feat, user_cate_feat, item_cate_feat
# Five input groups for the modified model: user features (including the
# follower/following statistics), item features, graph-derived network
# features, and the two integer index pairs.

train_input = [dtrain[['userID_freq', 'userID_mean', 'userID_JS', 'month_freq', 
                       'following_no', 'followedby_no', 'following_mean', 'follower_mean']].values,
               dtrain[['itemID_freq', 'itemID_mean', 'itemID_JS', 'genre_freq']].values,
               dtrain[['undirected_degree', 'directed_degree', 
                       'undirected_betweenness_centrality', 'directed_betweenness_centrality', 
                       'undirected_clust_coefficient', 'directed_clust_coefficient', 
                       'undirected_closeness_centrality', 'directed_closeness_centrality', 
                       'undirected_eigenvector_centrality', 'directed_eigenvector_centrality', 
                       'undirected_degree_centrality', 'directed_degree_centrality']].values, 
               dtrain[['userID_ordinal', 'month_ordinal']].astype(int).values,
               dtrain[['itemID_ordinal', 'genre_ordinal']].astype(int).values
               ]

### TTowerRSNew CV

In [None]:
# Grid search for the modified model over embedding and dense sizes {150, 200},
# with every regularisation strength fixed at 1e-2.
from TTowerRSNew import TTowerRSNew_CV, TTowerRSNew
tt_cv = TTowerRSNew_CV(n_user=n_user, n_item=n_item, n_genre=n_genre, n_month=n_month, embedding_sizes=[150, 200], dense_sizes=[150, 200], 
                 embed_regs=[1e-2], dense_regs=[1e-2], fc_regs=[1e-2])

tt_cv.grid_search(train_input, train_rating)

In [None]:
## user_cont_feat, item_cont_feat, network_cont_feat, user_cate_feat, item_cate_feat
# Rebuilds train_input with the same five column groups as the cell before the
# grid search, so this cell can be run on its own before the final fit.

train_input = [dtrain[['userID_freq', 'userID_mean', 'userID_JS', 'month_freq', 
                       'following_no', 'followedby_no', 'following_mean', 'follower_mean']].values,
               dtrain[['itemID_freq', 'itemID_mean', 'itemID_JS', 'genre_freq']].values,
               dtrain[['undirected_degree', 'directed_degree', 
                       'undirected_betweenness_centrality', 'directed_betweenness_centrality', 
                       'undirected_clust_coefficient', 'directed_clust_coefficient', 
                       'undirected_closeness_centrality', 'directed_closeness_centrality', 
                       'undirected_eigenvector_centrality', 'directed_eigenvector_centrality', 
                       'undirected_degree_centrality', 'directed_degree_centrality']].values, 
               dtrain[['userID_ordinal', 'month_ordinal']].astype(int).values,
               dtrain[['itemID_ordinal', 'genre_ordinal']].astype(int).values
               ]

In [None]:
# Modified two-tower model.
# NOTE(review): embedding/dense size 300 is outside the {150, 200} grid searched
# in the TTowerRSNew_CV cell, and the learning rate is 1e-2 where the TTowerRS
# fit used 1e-3 — confirm both are intentional.
model = TTowerRSNew(n_user=n_user, n_item=n_item, n_genre=n_genre, n_month=n_month, 
                 embedding_size=300, dense_size=300, 
                 embed_reg=1e-2, dense_reg=1e-2, fc_reg=1e-2)

metrics = [
    keras.metrics.RootMeanSquaredError(name='rmse')
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-2), 
    loss=tf.keras.losses.MeanSquaredError(), 
    metrics=metrics
)

# No validation data is passed to fit() below, so early stopping watches the
# TRAINING rmse (patience 10) rather than a held-out score.
callbacks = [keras.callbacks.EarlyStopping( 
    monitor='rmse', min_delta=0, patience=10, verbose=1, 
    mode='min', baseline=None, restore_best_weights=True)]

# Train on the full training set.
history = model.fit(
    x=train_input,
    y=train_rating,
    callbacks=callbacks,
    batch_size=64,
    epochs=100,
    verbose=2
)