In [None]:
from asyncore import dispatcher
from itertools import permutations
from operator import index

import pandas as pd
import networkx as nx
import graspologic as gp
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import altair as alt
from fontTools.subset import subset
#from pandas.conftest import axis
from sklearn.manifold import TSNE
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from fontTools.varLib.models import allNone
from jedi.inference.utils import to_list

# DATA LOADING
# Both CSV files are read via relative paths, i.e. from the current working directory.

# Load movie metadata.
# Expected columns: movieId, title, genres
moviesInfo = pd.read_csv('movies.csv')
# Load user ratings.
# Expected columns: userId, movieId, rating, timestamp
ratingsInfo = pd.read_csv('ratings.csv')


In [None]:
# Per-movie rating statistics: mean rating and number of ratings.
# Named aggregation yields the flat columns 'rating mean' / 'rating count'
# directly, next to the 'movieId' grouping column.
rattingsStats = (
    ratingsInfo
    .groupby('movieId', as_index=False)
    .agg(**{
        'rating mean': ('rating', 'mean'),
        'rating count': ('rating', 'count'),
    })
    # attach the human-readable title to every movieId
    .merge(moviesInfo[['movieId', 'title']], on='movieId')
)


In [None]:
# Per-user rating statistics: mean rating given and number of ratings given.
# Resulting columns: 'userId', 'rating mean', 'rating count'.
usersStats = (
    ratingsInfo
    .groupby('userId', as_index=False)
    .agg(**{
        'rating mean': ('rating', 'mean'),
        'rating count': ('rating', 'count'),
    })
)


In [None]:
# Graph construction: directed bipartite graph with edges user -> movie.

# The rating becomes the edge weight. `rename` silently ignores a missing
# 'rating' column, so re-running this cell is harmless.
ratingsInfo = ratingsInfo.rename(columns={'rating': 'weight'})

# Attach movie titles (they are used as movie node labels below).
# Guarded so that re-running the cell does not merge a second time, which
# would produce 'title_x' / 'title_y' and break the edge-list construction.
if 'title' not in ratingsInfo.columns:
    ratingsInfo = ratingsInfo.merge(moviesInfo[['movieId', 'title']], on='movieId')

# Keep only movies that received at least `trash` ratings.
# A vectorised per-row count replaces the per-group Python lambda filter;
# the selected rows and their order are the same.
trash = 10
ratingsPerMovie = ratingsInfo['movieId'].map(ratingsInfo['movieId'].value_counts())
ratingsInfo = ratingsInfo[ratingsPerMovie >= trash]

# Build the directed graph: source = userId, target = movie title,
# edge attribute 'weight' = rating.
# NOTE(review): movies are keyed by title, so distinct movieIds that share a
# title collapse into a single node - confirm this is acceptable.
edgeFeatures = {
    'source': 'userId',
    'target': 'title',
    'edge_attr': 'weight'
}
Gbp = nx.from_pandas_edgelist(ratingsInfo, **edgeFeatures, create_using=nx.DiGraph)

# Movie nodes are strings (titles); every other node is a user id.
movieUserNodes = list(Gbp.nodes())
movieNodes = [node for node in movieUserNodes if isinstance(node, str)]
userNodes = [node for node in movieUserNodes if not isinstance(node, str)]

print(f'Broj cvorova[{len(movieUserNodes)}] = broj korisnika[{len(userNodes)}] + broj flmova[{len(movieNodes)}]')


In [None]:
# Graph feature analysis: adjacency spectral embedding (ASE) of the bipartite graph.

# Adjacency matrix. The node order is pinned to `movieUserNodes` so that the
# boolean masks built from that list select the matching rows below.
A = nx.to_numpy_array(Gbp, nodelist=movieUserNodes)

# ASE of a directed graph yields two embeddings per node:
# Xout (node as edge source) and Xin (node as edge target).
ase = gp.embed.AdjacencySpectralEmbed(n_elbows=2)
(Xout, Xin) = ase.fit_transform(A)

# 1) Movie embeddings: movies are edge targets, so they live in Xin.
movieNodesMask = [isinstance(node, str) for node in movieUserNodes]
movieEmbeddings = Xin[movieNodesMask]

# column names for embedding dimensions
embddingColsMovies = [f"x{col}" for col in range(Xin.shape[1])]

df_XinMovies = pd.DataFrame(data=movieEmbeddings, index=movieNodes, columns=embddingColsMovies)
df_XinMovies = df_XinMovies.reset_index().rename(columns={'index': 'title'})

# 2) User embeddings: users are edge sources, so they live in Xout.
# (Previously this read from Xin, where user rows carry no information
# because users have no incoming edges.)
usersNodesMask = [not isMovie for isMovie in movieNodesMask]
usersEmbeddings = Xout[usersNodesMask]

# column names for embedding dimensions
embeddingColsUsers = [f"x{col}" for col in range(Xout.shape[1])]

df_XoutUsers = pd.DataFrame(data=usersEmbeddings, columns=embeddingColsUsers)

# Prepare for visualization: project both embeddings to 2-D with
# t-SNE (t-Distributed Stochastic Neighbor Embedding).
# A fixed random_state makes the layout reproducible across runs.
X_TSNE_Movies = TSNE(n_components=2, random_state=42).fit_transform(movieEmbeddings)
X_TSNE_User = TSNE(n_components=2, random_state=42).fit_transform(usersEmbeddings)

In [None]:
# Visualisation of the 2-D t-SNE projection of the movie embeddings.

# One row per movie: title, t-SNE coordinates (x0, x1), and the rating
# statistics (mean and count) joined in by title.
df_X_TSNE_Movies = (
    pd.DataFrame(X_TSNE_Movies, index=movieNodes, columns=['x0', 'x1'])
    .rename_axis('title')
    .reset_index()
    .merge(rattingsStats[['title', 'rating mean', 'rating count']], on='title')
)

# NOTE: be careful with this code, it still needs to be cleaned up.
display(df_X_TSNE_Movies)
(
    alt.Chart(df_X_TSNE_Movies)
    .mark_circle()
    .encode(x='x0', y='x1', tooltip='title', color='rating count')
    .properties(width=800, height=600)
    .interactive()
)

# The same analysis can be done for the users.

In [None]:
# NEW SECTION
# Construct the movie-movie correlation matrix.

# User x movie rating matrix: one row per userId, one column per title,
# NaN where the user did not rate the movie.
movieMatrix = pd.pivot_table(ratingsInfo, values='weight', columns='title', index='userId')

# Shuffle the users reproducibly; the first 80% become the training split.
movieRand = movieMatrix.sample(frac=1, random_state=42)
n = int(0.8 * len(movieRand))

# Autocorrelation of the (mean-centred) shuffled user-id sequence.
# Presumably a sanity check that the shuffle left no ordering structure.
x = np.array(movieRand.index)
x = x - np.mean(x)
autocor = np.correlate(x, x, 'full')
autocor /= autocor.max()       # normalization
mid = len(autocor) // 2
autocor = autocor[mid:]        # keep the non-negative lags only

lags = np.arange(len(autocor))

plt.figure(figsize=(10, 4))
plt.plot(lags, autocor)
plt.xlabel('Lag')
plt.ylabel('Autocorrelation')
plt.title('Autocorrelation Function (ACF)')
plt.show()

movieTraining = movieRand.iloc[:n]
movieTest = movieRand.iloc[n:]
display(movieMatrix.head())

# Pairwise correlation between movies (columns), computed on the training
# users only. A pair of movies needs at least `minUsers` users who rated
# both; otherwise the correlation is undefined (NaN) and is set to 0.
minUsers = 10

corrMatrix = movieTraining.corr(min_periods=minUsers).fillna(0)

# Sparsify the correlation matrix:
# 1) zero the diagonal, so a movie's self-correlation does not occupy one of
#    its neighbor slots and no self-loops end up in the graph
# 2) per row, retain only the nMax most correlated neighbors
# (normalization by the spectral radius happens once the graph is built)
corrValues = corrMatrix.to_numpy(copy=True)
np.fill_diagonal(corrValues, 0)
corrMatrix = pd.DataFrame(corrValues, index=corrMatrix.index, columns=corrMatrix.columns)

nMax = 35
# rank 1 = highest correlation in the row; ties share the smallest rank
ranked = corrMatrix.rank(axis=1, method='min', ascending=False)
mask = ranked > nMax
corrMatrix = corrMatrix.mask(mask, 0)

In [None]:
# GRAPH CONSTRUCTION

# Undirected weighted movie graph from the sparsified correlation matrix.
G = nx.from_pandas_adjacency(corrMatrix, create_using=nx.Graph)

# Because of the nMax and minUsers conditions the graph can contain isolated
# nodes. Keep only the largest connected component for further analysis.
# Components are sorted by size in decreasing order, so index 0 is the largest.
ConnectedComponents = sorted(nx.connected_components(G), key=len, reverse=True)
G = G.subgraph(ConnectedComponents[0])

# Weighted adjacency matrix with self-loops removed (zero diagonal).
W = nx.to_numpy_array(G)
np.fill_diagonal(W, 0)

# Normalize W by its spectral radius (largest absolute eigenvalue).
# W comes from an undirected graph and is therefore symmetric, so the
# symmetric solver is used: real eigenvalues, no eigenvectors computed.
eigValuesW = np.linalg.eigvalsh(W)

# sort eigenvalues by magnitude, largest first
idx = np.argsort(np.abs(eigValuesW))[::-1]
eigValuesW = eigValuesW[idx]

spectralRadius = abs(eigValuesW[0])
if spectralRadius == 0:
    # dividing would silently fill W with NaN
    raise ValueError('The largest connected component has no edges; cannot normalize W.')
W = W / spectralRadius

plt.figure(figsize=(10, 10))
sns.heatmap(W, cmap='Greys')
plt.show()



In [None]:
# function for data preparing

def prepareData(movie,userMovieScoreMatrix,moviesInGraph,W):
    """Build input/target graph signals for predicting the rating of one movie.

    Only the users (rows) that rated `movie` are used. For each of them:

    * the input signal holds the user's ratings of all graph movies, with the
      rating of `movie` itself hidden (set to 0) and unrated movies set to 0;
    * the target signal is 0 everywhere except at the position of `movie`,
      where it holds the user's actual rating.

    Parameters
    ----------
    movie : column label of the movie to predict.
    userMovieScoreMatrix : pandas.DataFrame with one row per user and one
        column per movie; NaN marks "not rated". It is not modified.
    moviesInGraph : sequence of column labels, one per graph node, in node order.
    W : 2-D array; only ``W.shape[0]`` (the number of graph nodes) is used.

    Returns
    -------
    (xData, yData) : two NumPy arrays of shape
        ``(number of users who rated movie, 1, W.shape[0])``.

    Raises
    ------
    KeyError
        If `movie` is not in `moviesInGraph`, or a requested column is
        missing from `userMovieScoreMatrix`.
    ValueError
        If ``len(moviesInGraph)`` does not match ``W.shape[0]`` (from reshape).
    """
    moviesInGraph = list(moviesInGraph)
    if movie not in moviesInGraph:
        raise KeyError(f'{movie!r} is not among the graph movies')
    moviePosition = moviesInGraph.index(movie)

    # keep only users that rated the target movie, restricted to graph movies
    ratedByUser = userMovieScoreMatrix.dropna(axis=0, subset=[movie])
    ratings = ratedByUser[moviesInGraph].fillna(0).to_numpy()

    # input: all known ratings with the target movie's rating hidden
    xSignals = ratings.copy()
    xSignals[:, moviePosition] = 0

    # target: only the target movie's rating, zeros elsewhere
    ySignals = np.zeros_like(ratings)
    ySignals[:, moviePosition] = ratings[:, moviePosition]

    signalShape = (len(ratedByUser), 1, W.shape[0])
    xData = xSignals.reshape(signalShape)
    yData = ySignals.reshape(signalShape)

    return xData,yData


In [None]:
#################################
### Fully connected network ###
#################################

# Movie whose rating is to be predicted.
movie = 'Zack and Miri Make a Porno (2008)'

# Graph nodes (movie titles) in the graph's own iteration order.
gList = list(G)

# Training signals: inputs with the target movie hidden, and the targets.
xDataTrain, yDataTrain = prepareData(
    movie=movie,
    userMovieScoreMatrix=movieTraining,
    moviesInGraph=gList,
    W=W,
)

# TODO: build the test signals the same way from movieTest
# (prepareData returns an (x, y) pair).


