# En este ejemplo, crearemos un sistema de recomendaciones que recomiende películas a los usuarios
#### Utilizaremos el conjunto de datos Movielens. Un conjunto de datos gratuito que contiene miles de calificaciones de películas por parte de los usuarios.

### Filtro colaborativo con el dataset de movielens
#### Vamos a utilizar la correlación entre las puntuaciones de los usuarios junto con las características.

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
movies = pd.read_table('movies.dat', header=None, sep='::', 
                          names=['movie_id', 'movie_title', 'movie_genre'], engine='python')
movies = pd.concat([movies, movies.movie_genre.str.get_dummies(sep='|')], axis=1)

In [5]:
movie_categories = movies.columns[3:].tolist()

In [6]:
movies.head()

Unnamed: 0,movie_id,movie_title,movie_genre,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
movie_categories

['Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [8]:
movies.tail()

Unnamed: 0,movie_id,movie_title,movie_genre,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
3878,3948,Meet the Parents (2000),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3882,3952,"Contender, The (2000)",Drama|Thriller,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
ratings = pd.read_table('ratings.dat', header=None, sep='::', 
                          names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')

In [10]:
#eliminamos atributo timestamp
del ratings['timestamp']

### añadir el título de la película haciendo un join entre el movie_id de la tabla ratings y el movie_id de la tabla movies

In [11]:
ratings = pd.merge(ratings, movies, on='movie_id')[['user_id', 'movie_title', 'movie_id','rating', 'movie_genre']]

ratings.head()

ratings.tail()

Unnamed: 0,user_id,movie_title,movie_id,rating,movie_genre
1000204,5949,Modulations (1998),2198,5,Documentary
1000205,5675,Broken Vessels (1998),2703,3,Drama
1000206,5780,White Boys (1999),2845,1,Drama
1000207,5851,One Little Indian (1973),3607,5,Comedy|Drama|Western
1000208,5938,"Five Wives, Three Secretaries and Me (1998)",2909,4,Documentary


In [12]:
ratings_matrix = ratings.pivot_table(values='rating', index='user_id', columns='movie_title')
ratings_matrix

movie_title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kj�rlighetens kj�tere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,2.0
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,4.0,,,,,,3.0,...,,,,,,,,,,


### obtenemos matriz de ratings para cada user_id

### normalizamos la matriz de puntuaciones
#### si una celda no tiene valor es porque un determinado usuario no ha votado esa película
#### en esa celda la rellenamos con la calificación promedio de los usuarios para esa pelicula
#### obtener la puntuación de la película restando la calificación promedio global de la película

In [13]:
ratings_matrix = ratings.pivot_table(values='rating', index='user_id', columns='movie_title')  

ratings_matrix = ratings_matrix.apply(lambda x: x.fillna(x.mean()),axis=0)
ratings_matrix = ratings_matrix.apply(lambda x: x - x.mean(),axis=1)

movies_index = ratings_matrix.columns

ratings_matrix.head()

movie_title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kj�rlighetens kj�tere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.215674,0.128728,-0.550393,-0.33181,0.470867,-0.742701,0.180156,0.35376,-0.195997,1.052754,...,0.390211,0.147801,0.057299,0.133446,0.257299,0.171092,0.50813,0.257299,-0.720962,0.013397
2,-0.210064,0.134337,-0.544784,-0.3262,0.476476,-0.737091,0.185766,0.359369,-0.190388,1.058363,...,0.39582,0.15341,0.062909,0.139055,0.262909,0.176702,0.513739,0.262909,-0.715352,0.019006
3,-0.213476,0.130926,-0.548195,-0.329612,0.473065,-0.740503,0.182354,0.355957,-0.193799,1.054952,...,0.392409,0.149999,0.059497,0.135644,0.259497,0.17329,0.510328,0.259497,-0.718764,0.015595
4,-0.212675,0.131726,-0.547394,-0.328811,0.473866,-0.739702,0.183155,0.356758,-0.192999,1.055752,...,0.393209,0.150799,0.060298,0.136445,0.260298,0.174091,0.511128,0.260298,-0.717963,0.016395
5,-0.182296,0.162105,-0.517016,-0.298432,0.504244,-0.709323,0.213534,0.387137,-0.16262,1.086131,...,0.423588,0.181178,0.090677,0.166823,0.290677,0.20447,0.541507,0.290677,-0.687584,0.046774


### Para computar la similaridad entre películas, una manera de hacerlo es  calcular la correlación entre ellas en función de la puntuación que dan los usuarios.

#### transpuesta de la matrix de ratings
##### Para calcular la correlación se transpone la matriz para que las películas sean las filas y los usuarios las columnas 
#### Usamos la matriz traspuesta de ratings_matrix para que la función np.corrcoef nos devuelva la correlación entre películas.

In [14]:
ratings_matrix.T

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"$1,000,000 Duck (1971)",-0.215674,-0.210064,-0.213476,-0.212675,-0.182296,-0.217548,-0.216115,-0.221737,-0.208101,-0.262994,...,-0.210600,-0.217917,-0.213876,-0.212583,-0.164316,-0.143992,-0.197548,-0.210879,-0.209104,-0.184900
'Night Mother (1986),0.128728,0.134337,0.130926,0.131726,0.162105,0.126854,0.128286,0.122664,0.136300,0.081407,...,0.133802,0.126485,0.130525,0.131818,0.180086,-0.171019,0.146853,0.133523,0.135298,0.159501
'Til There Was You (1997),-0.550393,-0.544784,-0.548195,-0.547394,-0.517016,-0.552267,-0.550835,-0.556457,-0.542821,-0.597714,...,-0.545319,-0.552636,-0.548596,-0.547303,-0.499035,-0.478711,-0.532268,-0.545598,-0.543823,-0.519620
"'burbs, The (1989)",-0.331810,-0.326200,-0.329612,-0.328811,-0.298432,-0.333684,-0.332251,-0.337873,-0.324237,0.709979,...,-0.326736,-0.334053,-0.330012,-0.328719,-0.280452,-0.260128,-0.313684,-0.327015,-0.325240,-0.301036
...And Justice for All (1979),0.470867,0.476476,0.473065,0.473866,0.504244,0.468993,0.470426,0.464803,0.478439,0.423546,...,0.475941,0.468624,0.472664,0.473958,0.522225,0.542549,0.488992,0.475662,0.477437,0.501641
1-900 (1994),-0.742701,-0.737091,-0.740503,-0.739702,-0.709323,-0.744575,-0.743142,-0.748764,-0.735128,-0.790021,...,-0.737627,-0.744944,-0.740903,-0.739610,-0.691343,-0.671019,-0.724576,-0.737906,-0.736131,-0.711927
10 Things I Hate About You (1999),0.180156,0.185766,0.182354,0.183155,0.213534,0.178282,0.179715,0.174093,0.187729,0.132836,...,0.185230,0.177913,0.181954,0.183247,0.231514,-1.171019,0.198282,0.184951,0.186726,0.210930
101 Dalmatians (1961),0.353760,0.359369,0.355957,0.356758,0.387137,0.351885,0.353318,0.347696,0.361332,0.306439,...,0.358833,0.755056,0.355557,0.356850,0.405117,0.828981,0.371885,0.358554,0.360329,0.384533
101 Dalmatians (1996),-0.195997,-0.190388,-0.193799,-0.192999,-0.162620,-0.197872,-0.196439,-0.202061,-0.188425,-0.243318,...,-0.190924,-0.198241,-0.194200,-0.192907,-2.191343,-0.124316,-0.177872,-0.191203,-0.189428,-0.165224
12 Angry Men (1957),1.052754,1.058363,1.054952,1.055752,1.086131,1.050880,1.052312,1.046690,1.060326,-0.290021,...,1.057828,1.050511,1.054551,1.055844,1.104111,1.124436,0.775424,1.057549,1.059324,1.788073


### Ahora que tenemos una matriz normalizada de puntuaciones de usuarios, podemos calcular las relaciones entre películas calculando la correlación entre películas y puntuaciones.

### Utilizaremos el coeficiente de correlación producto-momento de Pearson (PPMC) para calcular las similitudes entre películas basadas en la relación de las puntuaciones que los usuarios les dan.

### Matriz de correlación o de similitud
### Aplicamos # Pearson product-moment correlation coefficient (PPMC)
#### https://en.wikipedia.org/wiki/Pearson_correlation_coefficient

### El PMCC tiene un valor entre -1 y 1 que mide cuán relacionadas están un par de variables cuantitativas.

### La matriz de correlación es una matriz de tamaño m x m, donde el elemento Mij representa la correlación entre el item i y el item j.


In [15]:
correlation_matrix = np.corrcoef(ratings_matrix.T)  
correlation_matrix.shape

(3706, 3706)

In [16]:
#El resultado de llamar al método corrcoef es una matriz con los coeficientes de correlación 
#o similitud de cada película con las demás
correlation_matrix

array([[  1.00000000e+00,   3.38790195e-02,   5.68837908e-02, ...,
          2.66382718e-01,   1.05257034e-01,  -2.48429774e-02],
       [  3.38790195e-02,   1.00000000e+00,  -1.32194687e-02, ...,
          1.00183847e-01,   3.26130706e-02,  -1.57568962e-02],
       [  5.68837908e-02,  -1.32194687e-02,   1.00000000e+00, ...,
          1.92041086e-01,   7.85162956e-02,   1.65494058e-04],
       ..., 
       [  2.66382718e-01,   1.00183847e-01,   1.92041086e-01, ...,
          1.00000000e+00,   3.15144002e-01,  -3.21763262e-02],
       [  1.05257034e-01,   3.26130706e-02,   7.85162956e-02, ...,
          3.15144002e-01,   1.00000000e+00,  -1.02228164e-02],
       [ -2.48429774e-02,  -1.57568962e-02,   1.65494058e-04, ...,
         -3.21763262e-02,  -1.02228164e-02,   1.00000000e+00]])

In [17]:
movies_index

Index(['$1,000,000 Duck (1971)', ''Night Mother (1986)',
       ''Til There Was You (1997)', ''burbs, The (1989)',
       '...And Justice for All (1979)', '1-900 (1994)',
       '10 Things I Hate About You (1999)', '101 Dalmatians (1961)',
       '101 Dalmatians (1996)', '12 Angry Men (1957)',
       ...
       'Young Poisoner's Handbook, The (1995)', 'Young Sherlock Holmes (1985)',
       'Young and Innocent (1937)', 'Your Friends and Neighbors (1998)',
       'Zachariah (1971)', 'Zed & Two Noughts, A (1985)', 'Zero Effect (1998)',
       'Zero Kelvin (Kj�rlighetens kj�tere) (1995)', 'Zeus and Roxanne (1997)',
       'eXistenZ (1999)'],
      dtype='object', name='movie_title', length=3706)

#### Una vez tenemos la matriz, si queremos encontrar películas similares a una concreta, 
#### solo tenemos que encontrar las películas con una correlación alta con ésta

### solo devolvemos las películas con la mayor correlación con Return with Honor (1998)


In [18]:
favourite_movie_title = 'Return with Honor (1998)'

favourite_movie_index = list(movies_index).index(favourite_movie_title)

P = correlation_matrix[favourite_movie_index]

list(movies_index[(P>0.15) &(P<1.0)])


['$1,000,000 Duck (1971)',
 "'Til There Was You (1997)",
 '1-900 (1994)',
 '24 7: Twenty Four Seven (1997)',
 '24-hour Woman (1998)',
 '3 Ninjas: High Noon On Mega Mountain (1998)',
 '3 Strikes (2000)',
 '301, 302 (1995)',
 '42 Up (1998)',
 'A Chef in Love (1996)',
 'Abominable Snowman, The (1957)',
 'About Adam (2000)',
 'Above the Rim (1994)',
 'Acid House, The (1998)',
 'Across the Sea of Time (1995)',
 "Actor's Revenge, An (Yukinojo Henge) (1963)",
 'Adrenalin: Fear the Rush (1996)',
 'Adventures of Elmo in Grouchland, The (1999)',
 'Adventures of Sebastian Cole, The (1998)',
 'Affair of Love, An (Une Liaison Pornographique) (1999)',
 'Afterglow (1997)',
 'Agnes Browne (1999)',
 'Aim�e & Jaguar (1999)',
 'Aiqing wansui (1994)',
 'Alan Smithee Film: Burn Hollywood Burn, An (1997)',
 'Alarmist, The (1997)',
 'Alaska (1996)',
 'Algiers (1938)',
 'Alice and Martin (Alice et Martin) (1998)',
 'Alien Escape (1995)',
 'All Over Me (1997)',
 'All Things Fair (1996)',
 "All the Rage (a.k.a.

In [19]:
def get_movie_correlations(movie_title):  
    '''Devuelve el vector de correlación para una determinada pelicula'''
    movie_index = list(movies_index).index(movie_title)
    return correlation_matrix[movie_index]

def get_similar_movies(movie_title, threshold=0.15):
    '''Devuelve peliculas similares tomando como umbral 0.15'''
    movie_correlations_array =  get_movie_correlations(movie_title)
    return movies_index[movie_correlations_array>threshold]

In [20]:
get_similar_movies('Mad Max (1979)')

Index(['Mad Max (1979)', 'Mad Max 2 (a.k.a. The Road Warrior) (1981)',
       'Mad Max Beyond Thunderdome (1985)', 'Terminator, The (1984)'],
      dtype='object', name='movie_title')

In [21]:
get_similar_movies('Puppet Master (1989)')

Index(['Children of the Corn IV: The Gathering (1996)',
       'Children of the Damned (1963)', 'Creepshow 2 (1987)',
       'Fright Night Part II (1989)', 'Hellraiser: Bloodline (1996)',
       'Phantasm III: Lord of the Dead (1994)',
       'Poltergeist II: The Other Side (1986)', 'Puppet Master (1989)',
       'Puppet Master 4 (1993)', 'Puppet Master 5: The Final Chapter (1994)',
       'Puppet Master II (1990)', 'Puppet Master III: Toulon's Revenge (1991)',
       'Robert A. Heinlein's The Puppet Masters (1994)'],
      dtype='object', name='movie_title')

#### Obtenemos la lista de películas que el usuario ha puntuado, 
### Creamos una matriz vacía que la rellenamos con cero de longitud el número de películas de la matriz de correlación
#### Para cada película de las películas del usuario, sumamos las correlaciones de esa película 
### Con la matriz que acabamos de crear. Luego ordenamos las similitudes resultantes por orden descendente.

In [22]:
def get_movie_recommendations(user_movies):  
    '''dada una lista de peliculas, devuelve todas las películas similares ordenadas por el índice de correlación'''
    movie_similarities = np.zeros(correlation_matrix.shape[0])
    for movie_id in user_movies:
        movie_similarities = movie_similarities + get_movie_correlations(movie_id)
    similar_movies = pd.DataFrame({
        'movie_title': movies_index,
        'sum_similarity': movie_similarities
        })
    similar_movies = similar_movies[~(similar_movies.movie_title.isin(user_movies))]
    similar_movies = similar_movies.sort_values(by=['sum_similarity'], ascending=False)
    return similar_movies

In [23]:
#obtenemos las recomendaciones que ha realizado un usuario
id_user = 2
ratings[ratings.user_id==id_user].sort_values(by=['rating'], ascending=False)

Unnamed: 0,user_id,movie_title,movie_id,rating,movie_genre
1,2,One Flew Over the Cuckoo's Nest (1975),1193,5,Drama
126902,2,Dances with Wolves (1990),590,5,Adventure|Drama|Western
87704,2,Stand by Me (1986),1259,5,Adventure|Comedy|Drama
89489,2,"Green Mile, The (1999)",3147,5,Drama|Thriller
91966,2,Gandhi (1982),1293,5,Drama
96090,2,Braveheart (1995),110,5,Action|Drama|War
100605,2,Close Encounters of the Third Kind (1977),3471,5,Drama|Sci-Fi
106150,2,On the Waterfront (1954),1945,5,Crime|Drama
110437,2,Amadeus (1984),1225,5,Drama
111819,2,"Remains of the Day, The (1993)",515,5,Drama


In [None]:
#obtenemos las recomendaciones para un usuario,

#obtener las peliculas puntuadas por un determinado usuario
id_user = 2
user_movies = ratings[ratings.user_id==id_user].sort_values(by=['rating'], ascending=False).movie_title.tolist() 
user_movies

["One Flew Over the Cuckoo's Nest (1975)",
 'Dances with Wolves (1990)',
 'Stand by Me (1986)',
 'Green Mile, The (1999)',
 'Gandhi (1982)',
 'Braveheart (1995)',
 'Close Encounters of the Third Kind (1977)',
 'On the Waterfront (1954)',
 'Amadeus (1984)',
 'Remains of the Day, The (1993)',
 'Jurassic Park (1993)',
 'Doctor Zhivago (1965)',
 'Die Hard 2 (1990)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Hunt for Red October, The (1990)',
 'Silence of the Lambs, The (1991)',
 'Rocky (1976)',
 'On Golden Pond (1981)',
 'Chariots of Fire (1981)',
 'True Lies (1994)',
 'October Sky (1999)',
 'Gone with the Wind (1939)',
 'Lethal Weapon 3 (1992)',
 'As Good As It Gets (1997)',
 'Shawshank Redemption, The (1994)',
 'Forrest Gump (1994)',
 'Simon Birch (1998)',
 'Graduate, The (1967)',
 'Few Good Men, A (1992)',
 'Shine (1996)',
 'Dead Poets Society (1989)',
 'Driving Miss Daisy (1989)',
 'Gladiator (2000)',
 'Hustler, The (1961)',
 'Shakespeare in Love (1998)',
 'Star Wars:

In [None]:
#obtener recomendaciones a partir de las peliculas favoritas de un usuario
recommendations = get_movie_recommendations(user_movies)

recommendations.movie_title.head

<bound method NDFrame.head of 1915                         Lethal Weapon (1987)
2814                             Rock, The (1996)
1916                       Lethal Weapon 2 (1989)
3394                               Top Gun (1986)
929                               Die Hard (1988)
1378                 Good Morning, Vietnam (1987)
80                           Air Force One (1997)
3113                                 Speed (1994)
1052                                Eraser (1996)
376                                    Big (1988)
1320                                 Ghost (1990)
301                                 Batman (1989)
1007            E.T. the Extra-Terrestrial (1982)
1684    Indiana Jones and the Last Crusade (1989)
195                              Apollo 13 (1995)
2725                                Ransom (1996)
2141                          Men in Black (1997)
2714                              Rain Man (1988)
1371                             GoldenEye (1995)
2879                