In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD 

In [2]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']

In [3]:
df = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)

In [4]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
          'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

In [6]:
movies = pd.read_csv('ml-100k/u.item', sep='|', names=columns, encoding='latin-1')
movie_names = movies[['item_id', 'movie title']]
movie_names.head()

Unnamed: 0,item_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
combined_movies = pd.merge(df, movie_names, on='item_id')

combined_movies.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [8]:
combined_movies.groupby('item_id')['rating'].count().sort_values(ascending=False).head()

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

In [9]:
combined_movies[combined_movies['item_id'] == 50]['movie title'].unique()

array(['Star Wars (1977)'], dtype=object)

# Building utility matrix

In [10]:
rating_crosstab = combined_movies.pivot_table(values='rating', index='user_id', columns='movie title', fill_value=0)

rating_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


In [12]:
rating_crosstab.shape

(943, 1664)

In [15]:
#transposing the utility matrix

X = rating_crosstab.values.T

X.shape

(1664, 943)

In [17]:
# decomposing the matrix

svd = TruncatedSVD(n_components=12, random_state=17)

resultant_matrix = svd.fit_transform(X)  #this compresses users to 12 

resultant_matrix.shape

(1664, 12)

In [20]:
corr_mat = np.corrcoef(resultant_matrix)  #generate the correlation matrix 

corr_mat.shape

(1664, 1664)

In [24]:
movie_names = rating_crosstab.columns # extract movie_names from crosstab variable

movies_list = list(movie_names)  # change movie names to numpy array

In [25]:
star_war = movies_list.index('Star Wars (1977)') # retrieving the index of star wars

In [26]:
print(star_war)

1398


In [27]:
corr_mat(star_war)

TypeError: 'numpy.ndarray' object is not callable