In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import scipy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Movie vector space

In [2]:
# Load the movie table (movieId, title, year, genres, director, valid).
# NOTE(review): the path is absolute and machine-specific; the notebook only runs where it exists.
df_ml_movies = pd.read_csv('/home/ignacio/Datasets/Graph analysis/ml-valid-movies.csv')
df_ml_movies.head()

Unnamed: 0,movieId,title,year,genres,director,valid
0,1,Toy Story,1995,adventure|animation|children|comedy|fantasy,"John Lasseter, Lee Unkrich",True
1,7,Sabrina,1995,comedy|romance,"Billy Wilder, Sydney Pollack",True
2,12,Dracula: Dead and Loving It,1995,comedy|horror,Mel Brooks,True
3,16,Casino,1995,crime|drama,"Martin Scorsese, Martin Campbell",True
4,18,Four Rooms,1995,comedy,"Allison Anders, Alexandre Rockwell, Quentin Ta...",True


In [None]:
len(df_ml_movies)

#### define directors dict

In [None]:
# Build a lookup from a normalised director key to the director name as written in the data.
# Each movie stores its directors as a single ', '-separated string.
directors = df_ml_movies['director'].str.split(r', ')
directors_list = [name for names in directors.values for name in names]
directors = pd.DataFrame(directors_list, columns=['director']).drop_duplicates()

# Key = lower-cased, ASCII-only name with dots, hyphens, double quotes and spaces removed.
director_keys = directors['director'].str.lower()
director_keys = director_keys.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
# regex=True is spelled out: pandas 2.0 switched the default to a literal match,
# which would leave the character class unapplied.
directors['normalize'] = director_keys.str.replace(r'[\.\-\" ]', '', regex=True)

# Names that collapse to the same key keep only their first spelling.
directors = directors.drop_duplicates(subset=['normalize'])
directors = directors.set_index(['normalize'])
directors_dict = directors.to_dict()
directors.head(10)

In [None]:
# Flat {normalised key: director name} mapping. Derived from `directors` rather than
# from the previous value of `directors_dict`, so re-running this cell does not raise KeyError.
directors_dict = directors['director'].to_dict()

#### define "soap"

In [None]:
# Rewrite the director and genre columns as space-separated tokens:
# every director becomes one lower-case ASCII token, every genre one token.
# NOTE: this overwrites the columns in place. Running the cell twice strips the spaces
# that now separate directors and fuses them into one token, so run it once per load.
director_tokens = df_ml_movies['director'].str.lower()
director_tokens = director_tokens.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
# Explicit regex flags keep the behaviour identical across pandas versions
# (the default changed from regex to literal in pandas 2.0).
director_tokens = director_tokens.str.replace(r'[\.\-\" ]', '', regex=True)
df_ml_movies['director'] = director_tokens.str.replace(',', ' ', regex=False)
# '|' is a regex metacharacter, so it must be matched literally.
df_ml_movies['genres'] = df_ml_movies['genres'].str.replace('|', ' ', regex=False)
df_ml_movies.head()

In [None]:
# "soap" = genre tokens followed by director tokens, joined by a single space.
# Column-wise concatenation replaces the row-by-row apply; the result is the same for string columns.
df_ml_movies['soap'] = df_ml_movies['genres'] + ' ' + df_ml_movies['director']
df_ml_movies.head()

#### define vocabulary movie

In [None]:
# Seed the movie vocabulary with every normalised director key (the index of `directors`).
vocabulary_movie = list(directors.index)
# Disabled filter that would remove an empty-string key from the vocabulary.
#vocabulary_movie = [x for x in vocabulary_movie if x != '']

In [None]:
len(vocabulary_movie)

In [None]:
# Distinct genre tokens in the movie table.
# The genres column must already be space-separated at this point.
genre_tokens_per_movie = df_ml_movies['genres'].str.split(' ')
genres_list = [token for tokens in genre_tokens_per_movie.values for token in tokens]
genres = pd.DataFrame({'genre': genres_list}).drop_duplicates()
genres_list = genres['genre'].values.tolist()

In [None]:
len(genres)

In [None]:
# Movie vocabulary = director keys followed by genre tokens.
# It is rebuilt from its sources instead of extended in place, so re-running the cell
# does not append the genres again. dict.fromkeys drops repeated terms while keeping
# first-seen order; TfidfVectorizer rejects a vocabulary list with duplicates.
vocabulary_movie = list(dict.fromkeys(list(directors.index) + genres_list))
len(vocabulary_movie)

#### tfidf vectorizer

In [None]:
# Fit a plain bag-of-words model (English stop words removed) on the soap column.
# In the visible notebook it is only used to count the distinct tokens.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_ml_movies['soap'])

In [None]:
# Number of distinct tokens learned by the CountVectorizer.
# `vocabulary_` (term -> column index) exists in every scikit-learn release,
# whereas get_feature_names() was removed in 1.2.
len(count.vocabulary_)

In [None]:
# TF-IDF over the soap column, restricted to the fixed director + genre vocabulary.
# The soap is whitespace-separated by construction, so tokens are split on whitespace only.
# The default token pattern breaks 'sci-fi' into 'sci' and 'fi' (and drops one-letter pieces),
# which leaves the matching vocabulary columns permanently at zero.
tfidf = TfidfVectorizer(stop_words='english', vocabulary=vocabulary_movie, token_pattern=r'(?u)\S+')
tf_idf_matrix = tfidf.fit_transform(df_ml_movies['soap'])

In [None]:
directors.loc['tiborhernadi']

In [None]:
# Column labels of tf_idf_matrix as a plain list.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out().tolist()
else:
    feature_names = tfidf.get_feature_names()

In [None]:
len(feature_names)

In [None]:
# Persist the movie feature labels (one row per TF-IDF column).
# Uses whichever feature-name accessor the installed scikit-learn provides.
if hasattr(tfidf, 'get_feature_names_out'):
    movie_feature_names = tfidf.get_feature_names_out().tolist()
else:
    movie_feature_names = tfidf.get_feature_names()
df_feature_name = pd.DataFrame(movie_feature_names, columns=['feature name'])
df_feature_name.to_csv('/home/ignacio/Datasets/Graph analysis/feature_name.csv', index=False)

In [None]:
df_feature_name.head()

In [None]:
tf_idf_matrix

In [None]:
df_ml_movies[df_ml_movies['movieId']==1]

In [None]:
# Index label of the movie with movieId 1, used below as the row of tf_idf_matrix.
is_movie_one = df_ml_movies['movieId'] == 1
idx = df_ml_movies.index[is_movie_one].tolist()[0]

In [None]:
toy_story_tf_idf = tf_idf_matrix[idx:idx+1]

In [None]:
toy_story_matrix = toy_story_tf_idf.todense()

In [None]:
# One row per feature: the dense TF-IDF vector of the selected movie, written to CSV.
df_toy_story = pd.DataFrame(toy_story_matrix).T
df_toy_story.to_csv('/home/ignacio/Datasets/Graph analysis/toy_story.csv', index=False)
df_toy_story.head()

## Graph model

In [None]:
df_nodes = pd.read_csv('/home/ignacio/Datasets/Graph analysis/artists_nodes.csv')
df_edges = pd.read_csv('/home/ignacio/Datasets/Graph analysis/artists_edges.csv')

In [None]:
# Turn the node and edge tables into {column: {key: value}} dictionaries.
# Nodes are keyed by the 'Artists' column, edges by the ('Node 1', 'Node 2') pair.
# NOTE: both frames are rebound to their re-indexed versions, so this cell cannot be
# re-run without reloading them (the key columns are no longer regular columns).
df_nodes = df_nodes.set_index('Artists')
type_dict = df_nodes.to_dict()
df_edges = df_edges.set_index(['Node 1', 'Node 2'])
color_dict = df_edges.to_dict()

In [None]:
# Undirected artist graph: one node per key of the 'type' mapping,
# one edge per (node 1, node 2) key of the 'color' mapping.
g_artist = nx.Graph()
g_artist.add_nodes_from(type_dict['type'])
g_artist.add_edges_from(color_dict['color'])

In [None]:
nx.set_node_attributes(g_artist, name='type', values=type_dict['type'])
nx.set_edge_attributes(g_artist, name='color', values=color_dict['color'])

In [None]:
g_artist.has_node('Philip Saville')

In [None]:
df_nodes = df_nodes.reset_index()
df_nodes.head()

In [None]:
df_nodes[df_nodes['Artists'].str.contains('Philip')]

## Book vector space

In [None]:
# Load the book table; a missing 'common-shelves' value becomes an empty string
# so the column can be concatenated into the soap later.
# NOTE(review): the path is absolute and machine-specific.
df_bx_book = pd.read_csv('/home/ignacio/Datasets/Graph analysis/bx-valid-book.csv')
df_bx_book['common-shelves'] = df_bx_book['common-shelves'].fillna('')
df_bx_book.head()

In [None]:
len(df_bx_book)

#### define author dict

In [None]:
# Lookup table from a normalised author key to the author name as written in the book table.
authors = df_bx_book['Book-Author']
authors_list = list(authors.unique())
df_authors = pd.DataFrame(authors_list, columns=['author']).drop_duplicates()

# Key = lower-cased, ASCII-only name with dots, hyphens, double quotes and spaces removed
# (same recipe as the director keys).
author_keys = df_authors['author'].str.lower()
author_keys = author_keys.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
# regex=True is explicit because pandas 2.0 made literal matching the default.
df_authors['normalize'] = author_keys.str.replace(r'[\.\-\" ]', '', regex=True)

# Authors that collapse to the same key keep only their first spelling.
df_authors = df_authors.drop_duplicates(subset=['normalize'])
df_authors = df_authors.set_index(['normalize'])
authors_dict = df_authors.to_dict()
df_authors.head(10)

#### define "soap"

In [None]:
# Shelves are '|'-separated; turn them into space-separated tokens.
# '|' is a regex metacharacter, so it is matched literally on every pandas version.
df_bx_book['common-shelves'] = df_bx_book['common-shelves'].str.replace('|', ' ', regex=False)
df_bx_book.head()

In [None]:
# Replace each author name by its normalised key (one token per author).
# NOTE: the column is overwritten, so from here on 'Book-Author' holds keys, not display names.
book_author_keys = df_bx_book['Book-Author'].str.lower()
book_author_keys = book_author_keys.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
# regex=True is explicit because pandas 2.0 made literal matching the default.
df_bx_book['Book-Author'] = book_author_keys.str.replace(r'[\.\-\" ]', '', regex=True)
df_bx_book.head()

In [None]:
# "soap" = author key followed by the shelf tokens, joined by a single space.
# Column-wise concatenation replaces the row-by-row apply.
df_bx_book['soap'] = df_bx_book['Book-Author'] + ' ' + df_bx_book['common-shelves']
df_bx_book.head()

#### define vocabulary

In [None]:
vocabulary_book = list(df_authors.index)
#vocabulary_book = [x for x in vocabulary_book if x != '']

In [None]:
len(vocabulary_book)

In [None]:
# Flat {normalised key: author name} mapping taken from the 'author' column.
authors_dict = df_authors['author'].to_dict()

In [None]:
# Book vocabulary = author keys followed by the movie genre tokens.
# It is rebuilt from its sources instead of extended in place, so re-running the cell
# does not append the genres again. dict.fromkeys drops repeated terms while keeping
# first-seen order; TfidfVectorizer rejects a vocabulary list with duplicates.
vocabulary_book = list(dict.fromkeys(list(df_authors.index) + list(genres['genre'].values)))
len(vocabulary_book)

#### tfidf vectorizer

In [None]:
# TF-IDF over the book soap, restricted to the fixed author + genre vocabulary.
# Tokens are split on whitespace only; the default pattern breaks hyphenated terms
# such as 'sci-fi' apart, so their vocabulary columns could never be matched.
tfidf_book = TfidfVectorizer(vocabulary=vocabulary_book, token_pattern=r'(?u)\S+')
tfidf_book_matrix = tfidf_book.fit_transform(df_bx_book['soap'])
tfidf_book_matrix

In [None]:
# Column labels of tfidf_book_matrix, one row per feature.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
if hasattr(tfidf_book, 'get_feature_names_out'):
    feature_names_book = tfidf_book.get_feature_names_out().tolist()
else:
    feature_names_book = tfidf_book.get_feature_names()
df_features_name_book = pd.DataFrame(feature_names_book, columns=['feature names'])
df_features_name_book.head()

In [None]:
df_features_name_book.to_csv('/home/ignacio/Datasets/Graph analysis/feature_name_book.csv', index=False)

In [None]:
tfidf_pride = tfidf_book_matrix[4:5]
tfidf_pride

In [None]:
# Dense TF-IDF vector of the book stored at row 4, laid out as one row per feature.
tfidf_pride = tfidf_book_matrix[4:5]
df_pride_tfidf = pd.DataFrame(tfidf_pride.todense()).T
df_pride_tfidf.head()

In [None]:
# Show author features under their original spelling; any label that is not
# an author key is kept as it is.
df_pride = df_features_name_book['feature names'].map(lambda label: authors_dict.get(label, label))
df_pride.head(10)

In [None]:
# Promote the label Series to a frame (the column keeps the Series name)
# and attach the book's TF-IDF weights, aligned on the row index.
df_pride = df_pride.to_frame()
df_pride['tfidf'] = df_pride_tfidf[0]
df_pride.head()

In [None]:
df_pride.to_csv('/home/ignacio/Datasets/Graph analysis/pride_tfidf.csv', index=False)

In [None]:
# Distinct author keys present in the book table.
# Stored under its own name: rebinding `df_authors` to this Series would destroy the
# author lookup frame that later cells still read through df_authors['author'].
authors_normalized = df_bx_book['Book-Author'].drop_duplicates()
authors_normalized.head()

## User vector space

#### User space book

In [None]:
matrix_user_profiles = np.load('/home/ignacio/Datasets/Graph analysis/user_book_space.npy')
matrix_user_profiles.shape

In [None]:
matrix_user_profiles[0].reshape(1,-1)

#### User space from rating matrix

In [3]:
df_movie_ratings = pd.read_csv('/home/ignacio/Datasets/Graph analysis/ml-ratings.csv')
df_movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1339,3.5,1260759125
1,1,1343,2.0,1260759131
2,1,1953,4.0,1260759191
3,1,2455,2.5,1260759113
4,1,3671,3.0,1260759117


In [5]:
# movieIds of every rating given by user 1.
rated_by_user_one = df_movie_ratings['userId'] == 1
ids = df_movie_ratings.loc[rated_by_user_one, 'movieId'].tolist()

In [6]:
ids

[1339, 1343, 1953, 2455, 3671]

#### Define movies rated by user 1 (example)

In [7]:
df_movies_rated = df_ml_movies[df_ml_movies['movieId'].isin(ids)]
df_movies_rated

Unnamed: 0,movieId,title,year,genres,director,valid
75,1339,Dracula,1992,fantasy|horror|romance|thriller,"Philip Saville, Francis Ford Coppola, Mel Brooks",True
76,1343,Cape Fear,1991,thriller,"Martin Scorsese, J. Lee Thompson",True
109,1953,The French Connection,1971,action|crime|thriller,William Friedkin,True
144,2455,The Fly,1986,drama|horror|sci-fi|thriller,"David Cronenberg, Kurt Neumann, Chris Walas",True
214,3671,Blazing Saddles,1974,comedy|western,Mel Brooks,True


In [8]:
df_ratings_user = df_movie_ratings[df_movie_ratings['userId'] == 1]
df_ratings_user

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1339,3.5,1260759125
1,1,1343,2.0,1260759131
2,1,1953,4.0,1260759191
3,1,2455,2.5,1260759113
4,1,3671,3.0,1260759117


In [9]:
# Attach title and director to each of the user's ratings (inner join on movieId).
user_rating_columns = df_ratings_user[['userId','movieId', 'rating']]
rated_movie_columns = df_movies_rated[['movieId', 'title', 'director']]
df_ratings_user = user_rating_columns.merge(rated_movie_columns, on='movieId')
df_ratings_user

Unnamed: 0,userId,movieId,rating,title,director
0,1,1339,3.5,Dracula,"Philip Saville, Francis Ford Coppola, Mel Brooks"
1,1,1343,2.0,Cape Fear,"Martin Scorsese, J. Lee Thompson"
2,1,1953,4.0,The French Connection,William Friedkin
3,1,2455,2.5,The Fly,"David Cronenberg, Kurt Neumann, Chris Walas"
4,1,3671,3.0,Blazing Saddles,Mel Brooks


In [10]:
df_ratings_user.to_csv('/home/ignacio/Datasets/Graph analysis/Recommendation/rating_user_example.csv', index=False)

#### Define vector user space (from movies)

In [None]:
# One 1 x n_features TF-IDF row per movie rated by the user, in the order of `ids`.
# Rows of tf_idf_matrix follow the row order of df_ml_movies, so each movie is located
# by position. The index label only equals the position while the frame has its
# default RangeIndex.
movie_id_by_row = df_ml_movies['movieId'].values
item_profiles_list = []
for movie_id in ids:
    # An IndexError here means the rated movie is missing from df_ml_movies.
    row_position = np.flatnonzero(movie_id_by_row == movie_id)[0]
    item_profiles_list.append(tf_idf_matrix[row_position:row_position + 1])
item_profiles_list

In [None]:
item_profiles = scipy.sparse.vstack(item_profiles_list)
item_profiles

In [None]:
# Ratings of user 1 as an (n_rated, 1) column vector, ready to broadcast over the item profiles.
# Series.reshape no longer exists in pandas, so reshape the underlying NumPy array.
ratings = df_movie_ratings.loc[df_movie_ratings['userId'] == 1, 'rating'].values.reshape(-1, 1)
ratings

In [None]:
from sklearn.preprocessing import normalize

# User profile = rating-weighted average of the rated movies' TF-IDF rows, then L2-normalised.
weighted_item_profiles = item_profiles.multiply(ratings)
# Summing a sparse matrix yields np.matrix, which recent scikit-learn releases reject.
# Convert to a plain (1, n_features) ndarray before normalising.
weighted_sum = np.asarray(weighted_item_profiles.sum(axis=0)).reshape(1, -1)
user_item_weighted_avg = weighted_sum / np.sum(ratings)
user_profile_norm = normalize(user_item_weighted_avg)
user_profile_norm

In [None]:
user_profile_norm.shape

In [None]:
users_profiles = [user_item_weighted_avg, user_profile_norm]

In [None]:
np.stack(users_profiles)

In [None]:
user_profile_norm.shape

In [None]:
# Movie feature labels, one row per TF-IDF column.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
if hasattr(tfidf, 'get_feature_names_out'):
    user_space_feature_names = tfidf.get_feature_names_out().tolist()
else:
    user_space_feature_names = tfidf.get_feature_names()
df_feature_name = pd.DataFrame(user_space_feature_names, columns=['feature name'])
df_feature_name.head()

In [None]:
# Normalised user profile laid out as one row per feature (single column 0).
df_user_item_space = pd.DataFrame(user_profile_norm).T
df_user_item_space.head()

In [None]:
len(df_feature_name)

In [None]:
# Pair every feature label with the user's normalised weight for that feature.
# The table is built as a new frame straight from user_profile_norm, which gives two guarantees:
#  * the cell can be re-run (the old version needed the previous shape of df_user_item_space);
#  * df_feature_name is left untouched instead of being aliased and modified later.
df_user_item_space = df_feature_name.assign(tfidf=np.asarray(user_profile_norm).ravel())
df_user_item_space.head()

In [None]:
# Show director features under their original spelling; labels that are not
# director keys are kept as they are.
df_user_item_space['feature name'] = df_user_item_space['feature name'].map(
    lambda label: directors_dict.get(label, label)
)
df_user_item_space.head(10)

In [None]:
df_user_item_space.to_csv('/home/ignacio/Datasets/Graph analysis/user_item_space.csv', index=False)

## Influence path

#### Book to movie - space 

In [None]:
df_pride.head()

In [None]:
# Independent copy of the director table with a fresh 0..n-1 index
# (the normalised-key index is discarded).
directors_influence_pride = directors.reset_index(drop=True)
directors_influence_pride.head()

In [None]:
# Hop count in the artist graph between each director and the author of the book at row 4
# of df_bx_book. -1 marks "no connection": the director or the author is not a node,
# or no path joins them.
pride_author_key = df_bx_book.iloc[4]['Book-Author']
# 'Book-Author' holds normalised keys by now, while graph nodes are display names,
# so translate the key back (falling back to the key itself if it is unknown).
pride_author = authors_dict.get(pride_author_key, pride_author_key)

# The graph is undirected, so one breadth-first search from the author gives the distance
# to every reachable director. This replaces one search per director and cannot raise
# NodeNotFound / NetworkXNoPath.
if g_artist.has_node(pride_author):
    hops_from_author = dict(nx.single_source_shortest_path_length(g_artist, pride_author))
else:
    hops_from_author = {}

directors_influence_pride['short path'] = directors_influence_pride['director'].map(
    lambda name: hops_from_author.get(name, -1)
)
directors_influence_pride.head()

In [None]:
# Key the hop-count table by director name and keep its first five rows as a sample.
directors_influence_pride = directors_influence_pride.set_index('director')
directors_influence_proof = directors_influence_pride.iloc[:5]
directors_influence_proof.head()

In [None]:
directors_influence_proof

In [None]:
type(directors_influence_pride.loc['John Lasseter']['short path'])

In [None]:
# For every feature that is a director, replace the TF-IDF weight by 1 / hop count
# to the book's author; all other features keep their TF-IDF weight.
# The label column of df_pride is 'feature names' (plural) — the singular spelling raised KeyError.
hops_to_author = df_pride['feature names'].map(directors_influence_pride['short path'])
# A hop count of 0 (the director is the author) is treated as distance 1 to avoid dividing by zero.
# The -1 sentinel inverts to -1.0 and is zeroed by the following cell.
# NOTE(review): confirm that "distance 0 -> full weight" is the intended semantics.
influence = 1.0 / hops_to_author.replace(0, 1)
df_pride['tfidf'] = influence.where(hops_to_author.notna(), df_pride['tfidf'])
df_pride.head()

In [None]:
# Weights equal to -1.0 are reset to 0.0. A short path of -1 (set when a director is not
# in the graph) is inverted to -1.0 by the previous cell, so this zeroes those directors.
df_pride['tfidf'] = df_pride['tfidf'].replace(-1.0, 0.0)
df_pride.head()

In [None]:
# Inspect one director's row. df_pride's label column is 'feature names' (plural).
df_pride[df_pride['feature names'] == 'John Lasseter']

In [None]:
# Weights as a 1-D NumPy array. Series.as_matrix() was removed in pandas 1.0;
# `.values` returns the same array on every version.
tfidf_pride = df_pride['tfidf'].values

In [None]:
scipy.sparse.csr_matrix(tfidf_pride)

In [None]:
df_pride.to_csv('/home/ignacio/Datasets/Graph analysis/pride_space.csv', index=False)

#### User to book-space

In [None]:
df_user_item_space.head()

In [None]:
directors.head()

In [None]:
len(directors)

In [None]:
directors.loc['']

In [None]:
len(df_user_item_space)

In [None]:
len(df_authors)

In [None]:
# One row per distinct author (display name), indexed 0..n-1.
# Built from authors_dict (key -> display name) so it does not depend on `df_authors`,
# which an earlier cell rebinds to a Series of keys.
df_authors_shortest_path = pd.DataFrame({'author': list(authors_dict.values())})
df_authors_shortest_path.head()

In [None]:
def tfidf_author(x, df_feature):
    """Score an author by graph proximity to the features a user cares about.

    For every row of ``df_feature`` whose ``'tfidf'`` weight is non-zero and whose
    ``'feature name'`` is a node of ``g_artist``, the weight is divided by the hop
    count between the author and that feature; the score is the sum of those terms.

    Parameters
    ----------
    x : hashable
        Author node name to score.
    df_feature : pandas.DataFrame
        Table with columns ``'feature name'`` and ``'tfidf'``. It is not modified.

    Returns
    -------
    float
        The accumulated proximity-weighted score. ``0.0`` when the author is not in
        the graph or ``df_feature`` is empty.

    Notes
    -----
    Features that are absent from the graph or unreachable from the author contribute
    0.0. A hop count of 0 (the feature is the author) is treated as distance 1 so the
    division stays finite.
    """
    # Without a source node, nx.shortest_path_length would raise NodeNotFound.
    if df_feature.empty or not g_artist.has_node(x):
        return 0.0

    def _proximity_term(row):
        # Contribution of one feature row to the author's score.
        feature, weight = row['feature name'], row['tfidf']
        if not g_artist.has_node(feature) or weight == 0.0:
            return 0.0
        try:
            hops = nx.shortest_path_length(G=g_artist, source=x, target=feature)
        except nx.NetworkXNoPath:
            # Different connected components: no influence.
            return 0.0
        return weight * 1. / max(hops, 1)

    return df_feature.apply(_proximity_term, axis=1).sum()

In [None]:
# Score every author against the user's feature table: tfidf_author is called once per
# author row, with df_user_item_space passed as its df_feature argument.
df_authors_shortest_path['tfidf'] = df_authors_shortest_path['author'].apply(tfidf_author, df_feature=df_user_item_space)
df_authors_shortest_path.head()

In [None]:
len(df_authors_shortest_path)

In [None]:
df_user_item_space_without_directors = df_user_item_space[~df_user_item_space['feature name'].isin(directors['director'])]
len(df_user_item_space_without_directors)

In [None]:
# Rename 'author' to 'feature name' so the table lines up with the user feature table.
# index=str also converts every row label to its string form.
df_authors_shortest_path = df_authors_shortest_path.rename(index=str, columns={'author': 'feature name'})
df_authors_shortest_path.head()

In [None]:
df_user_item_space_book = pd.concat([df_authors_shortest_path, df_user_item_space_without_directors])
df_user_item_space_book.head()

In [None]:
len(df_user_item_space_book)

In [None]:
# User weights in the book space as a 1-D NumPy array. Series.as_matrix() was removed
# in pandas 1.0; `.values` returns the same array on every version.
tfidf_user_matrix = df_user_item_space_book['tfidf'].values
tfidf_user_matrix

In [None]:
tfidf_user_matrix.reshape((1, -1)).shape

In [None]:
tfidf_user_matrix.shape

## Content-based recommendation

In [None]:
from sklearn.metrics.pairwise import  cosine_similarity

In [None]:
# Cosine similarity between the user profile and every movie, then the row positions
# of the 100 most similar movies in ascending order of similarity.
sim = cosine_similarity(user_profile_norm, tf_idf_matrix)
sim_index = np.argsort(sim).ravel()[-100:]
sim_index

In [None]:
# (movieId, similarity) pairs for the selected movies, most similar first.
similar_items = [(df_ml_movies.iloc[pos]['movieId'], sim[0, pos]) for pos in sim_index]
similar_items.sort(key=lambda pair: pair[1], reverse=True)
df_recommendation = pd.DataFrame(similar_items, columns=['id', 'similarity'])
df_recommendation.head()

In [None]:
len(df_recommendation)

In [None]:
# Add the movie metadata to each recommendation and keep the display columns only
# (the helper 'id' column is dropped by the selection).
recommendation_with_metadata = df_recommendation.merge(
    df_ml_movies, how='left', left_on='id', right_on='movieId'
)
df_recommendation = recommendation_with_metadata[['movieId', 'title', 'year', 'genres', 'director', 'similarity']]
df_recommendation.head()

In [None]:
df_recommendation.columns

In [None]:
# Metadata of the recommended movies, in df_ml_movies order.
# After the merge cell the identifier column of df_recommendation is 'movieId' ('id' is dropped);
# before it, only 'id' exists. Accept either so the cell works in both states.
recommendation_id_column = 'movieId' if 'movieId' in df_recommendation.columns else 'id'
is_recommended = df_ml_movies['movieId'].isin(df_recommendation[recommendation_id_column])
# Single .loc selection instead of chained indexing.
df_data_recommendation = df_ml_movies.loc[is_recommended, ['movieId','title','genres','director']]
df_data_recommendation.head()

In [None]:
len(df_data_recommendation)

In [None]:
df_ml_movies.iloc[sim_index[10]]['movieId']

## Cross Content Based recommendation

In [None]:
tfid_matrix_book = scipy.sparse.load_npz('/home/ignacio/Datasets/Graph analysis/tfidf_matrix_target.npz')
tfid_matrix_book

In [None]:
# Cosine similarity between the user profile and every book row of the loaded target matrix,
# then the row positions of the 100 most similar books in ascending order of similarity.
sim_book = cosine_similarity(user_profile_norm, tfid_matrix_book)
sim_book_index = np.argsort(sim_book).ravel()[-100:]
sim_book_index

In [None]:
# (ISBN, similarity) pairs for the selected books, most similar first.
similar_book_items = [(df_bx_book.iloc[pos]['ISBN'], sim_book[0, pos]) for pos in sim_book_index]
similar_book_items.sort(key=lambda pair: pair[1], reverse=True)
df_book_recommendation = pd.DataFrame(similar_book_items, columns=['ISBN', 'similarity'])
df_book_recommendation.head()

In [None]:
# Add the book metadata to each recommendation (left join on ISBN).
df_book_recommendation = df_book_recommendation.merge(df_bx_book, how='left', on='ISBN')
df_book_recommendation.head()

In [None]:
# Keep the display columns only.
# NOTE(review): assumes df_bx_book provides 'Book-Title', 'Year-Of-Publication' and 'shelves' — not visible here, confirm.
df_book_recommendation = df_book_recommendation[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'shelves', 'similarity']]
df_book_recommendation.head(10)

## Target recommendation filtering

#### Book to origin space

In [None]:
df_recommendation_book_to_origen = pd.read_csv('/home/ignacio/Datasets/Graph analysis/Recommendation/recommendation_book_to_origen_space.csv' )
df_recommendation_book_to_origen.head()

In [None]:
df_recommendation_book_to_origen = df_recommendation_book_to_origen.drop(['Unnamed: 0'], axis=1)
df_recommendation_book_to_origen.head()

In [None]:
# Remove the rows labelled 1 and 9 from the recommendation list.
# NOTE(review): the labels are hard-coded with no stated criterion; re-running the cell
# fails because the labels are already gone.
df_recommendation_book_to_origen = df_recommendation_book_to_origen.drop([1,9], axis=0)
df_recommendation_book_to_origen.head(10)

In [None]:
df_recommendation_book_to_origen.head(10).to_csv('/home/ignacio/Datasets/Graph analysis/Recommendation/recommendation_book_to_origen_space_cleaned.csv', index=False )

#### User to target space

In [None]:
df_recommendation_target_space = pd.read_csv('/home/ignacio/Datasets/Graph analysis/Recommendation/recommendation_user_to_target_space.csv' )
df_recommendation_target_space.head(10)

In [None]:
df_recommendation_target_space = df_recommendation_target_space.drop(['Unnamed: 0'], axis=1)
df_recommendation_target_space.head()

In [None]:
# Remove the rows labelled 1-6 and 11 from the recommendation list.
# NOTE(review): the labels are hard-coded with no stated criterion; re-running the cell
# fails because the labels are already gone.
df_recommendation_target_space = df_recommendation_target_space.drop([1,2,3,4,5,6,11], axis=0)
df_recommendation_target_space.head(10)

In [None]:
df_recommendation_target_space.head(10).to_csv('/home/ignacio/Datasets/Graph analysis/Recommendation/recommendation_user_to_target_space_cleaned.csv', index=False )