 ## Distancia Manhattan
 sean $x, y \in R^n$ definiremos la distancia Manhattan como:
 $$d(x,y)=|x_1 - y_1| + |x_2 - y_2| + \ldots + |x_n - y_n|$$
 su implementación en python sería:

In [1]:
def manhattan(x,y):
    distancia=0
    for key in x:
        if key in y:
            distancia+=abs(x[key] - y[key])
    return distancia

In [1]:
users={"Abraham":
            {'Batman':5,
            'Toy Story':3.5,
            'Jumanji':2,
            "Pulp Fiction":5,
            'El Padrino':4.5,
            "Boda en Boda":4,
            "Cars":4},
      "Veronica":
            {'Batman':1,
            'Toy Story':2,
            'Jumanji':4,
            "Pulp Fiction":3,
            'El Padrino':2,
            "Orgullo y Prejuicio":4,
            "Diario de Noa":5},
      "Cayetano":
            {'Batman':1,
            "Boda en Boda":0,
            'Jumanji':1,
            "Pulp Fiction":2,
            'El Padrino':5,
            "Diario de Noa":1,
            "Cars":2},
      "Margarita":
            {'Batman':5,
            'Toy Story':4,
            'Jumanji':5,
            "Diario de Noa":3,
            "Boda en Boda":2,
            "Orgullo y Prejuicio":1,
            "Cars":5},
      "Miguel":
            {"Boda en Boda":2,
            'Toy Story':1,
            'Jumanji':3,
            "Pulp Fiction":2,
            'El Padrino':1,
            "Orgullo y Prejuicio":4,
            "Cars":3},
      "Sara":
            {'Batman':2,
            'Toy Story':3,
            'Jumanji':1,
            "Pulp Fiction":4,
            'Diario de Noa':4,
            "Orgullo y Prejuicio":3,
            "Cars":1},
      "Jorge":
            {'Batman':1,
            'Diario de Noa':2,
            'Jumanji':2,
            "Pulp Fiction":1,
            'El Padrino':5,
            "Boda en Boda":4,
            "Cars":2},
      "Ana":
            {'Batman':2,
            'Toy Story':2.5,
            'Jumanji':3,
            "Pulp Fiction":5,
            'El Padrino':5,
            "Orgullo y Prejuicio":3,
            "Cars":3}}


In [None]:
users["Ana"]

In [3]:
manhattan(users["Ana"],users["Cayetano"])

7

In [4]:
manhattan(users["Ana"],users["Sara"])

5.5

In [5]:
def Vecino(Usuario,x):
    distancias=[]
    for usuario in x:
        if usuario!=Usuario:
            distancia=manhattan(x[Usuario],x[usuario])
            distancias.append((distancia,usuario))
    distancias.sort()
    return distancias

In [6]:
Vecino("Sara",users)

[(5.5, 'Ana'),
 (7, 'Cayetano'),
 (8, 'Jorge'),
 (8, 'Veronica'),
 (8.5, 'Abraham'),
 (9, 'Miguel'),
 (15, 'Margarita')]

In [12]:
def recomendacion(Usuario,x):
    cercano=Vecino(Usuario,x)[0][1]
    recomendaciones=[]
    puntuacionvecino=x[cercano]
    puntiacionusuario=x[Usuario]
    for pelicula in puntuacionvecino:
        recomendaciones.append((pelicula,puntuacionvecino[pelicula]))
    return sorted(recomendaciones, 
                 key= lambda pelicula: pelicula[1],
                 reverse = True) ## Reverse para que de de mayor puntiacion a menos 

In [10]:
recomendacion("Sara",users)

[('Pulp Fiction', 5),
 ('El Padrino', 5),
 ('Jumanji', 3),
 ('Orgullo y Prejuicio', 3),
 ('Cars', 3),
 ('Toy Story', 2.5),
 ('Batman', 2)]

In [None]:
users["Sara"]

In [None]:
recomendacion("Margarita",users)

Para implementar la función que me generalice:
$$d(x,y)=(\sum (|x_i - y_i|)^r)^{\frac{1}{r}}$$

In [None]:
def generalizado(x,y,r):
    distancia=0
    puntuacioncomun=False
    for key in x:
        if key in y:
            distancia+=pow(abs(x[key] - y[key]),r)
            puntuacioncomun=True
    if puntuacioncomun:
        return pow(distancia,1/r)
    else:
        return 0

In [None]:
generalizado(users["Ana"],users["Cayetano"],2)

In [None]:
def Vecino_G(Usuario,x,r):
    distancias=[]
    for usuario in x:
        if usuario!=Usuario:
            distancia=generalizado(x[Usuario],x[usuario],r)
            distancias.append((distancia,usuario))
    distancias.sort()
    return distancias

In [None]:
Vecino_G("Sara",users,2)

In [None]:
def recomendacion_G(Usuario,x,r):
    cercano=Vecino_G(Usuario,x,r)[0][1]
    recomendaciones=[]
    puntuacionvecino=x[cercano]
    puntiacionusuario=x[Usuario]
    for pelicula in puntuacionvecino:
        recomendaciones.append((pelicula,puntuacionvecino[pelicula]))
    return sorted(recomendaciones, 
                 key= lambda peliculaTuple: peliculaTuple[1],
                 reverse = True) ## Reverse para que de de mayor puntiacion a menos 

In [None]:
recomendacion_G("Margarita",users,2)

In [None]:
for i in range(1,3):
    print(i,")\t",Vecino_G("Sara",users,i),"\n")

## Coeficiente de Pearson

$$r=\frac{\sum{x_i y_i}-\frac{\sum{x_i}\sum{y_i}}{n}}
{\sqrt{\sum{x_i^2}-\frac{(\sum{x_i})^2}{n}}\sqrt{\sum{y_i^2}-\frac{(\sum{y_i})^2}{n}}}$$

In [2]:
import math
def pearson(U1,U2):
    sum_xy=0
    sum_x=0
    sum_y=0
    sum_x2=0
    sum_y2=0
    n=0
    for key in U1:
        if key in U2:
            n+=1
            x=U1[key]
            y=U2[key]
            sum_xy+=x*y
            sum_x+=x
            sum_y+=y
            sum_x2+=x**2
            sum_y2+=y**2
    if n==0:
        return 0
    denonimandor = pow(sum_x2 - sum_x**2/n,1/2)*pow(sum_y2 - sum_y**2/n,0.5)
    numerador = sum_xy -(sum_x*sum_y)/n
    if denonimandor==0:
        return 0
    else:
        return(numerador/denonimandor)

## Coseno
$$cos(u,v)=\frac{u\times v}{||u|| ||v||}=\frac{\sum{u_i v_i}}{\sqrt[2]{u_1^2+\ldots+u_n^2}\sqrt[2]{v_1^2+\ldots+v_n^2}}$$

In [13]:
def coseno(U1,U2):
    sumxx, sumxy, sumyy = 0, 0, 0
    for key in U1:
        if key in U2:
            x = U1[key]; 
            y = U2[key]
            sumxx += x*x
            sumyy += y*y
            sumxy += x*y
    if pow(sumxx*sumyy,0.5)==0:
        return 0
    else:
        return sumxy/pow(sumxx*sumyy,0.5)

In [14]:
coseno(users["Ana"],users["Cayetano"])

0.9163419338230352

In [3]:
pearson(users["Ana"],users["Cayetano"])

0.7257747386024228

### Ejercicio 

In [2]:
import pandas as pd

movies = pd.read_csv("./Data/Peliculas.csv", sep=',')

m_cols = ['movie_id', 'title']  
ratings = pd.read_csv('./Data/Clasificacion.csv', sep=',')

tags = pd.read_csv('./Data/tags.csv', sep=',')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [3]:
ratings = pd.merge(movies, ratings)
tags= pd.merge(movies, tags)

In [4]:
ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [5]:
movieRatings = ratings.pivot_table(index=['userId'],columns=['title'],values='rating')  
movieRatings.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


Sabemos que cada columna contiene todas las calificaciones de los usuarios para una película en particular.
Busquemos todas las calificaciones de los usuarios de la película "Toy Story" y busquemos películas similares. Elegimos esta película porque tiene el mayor número de calificaciones y queremos encontrar la correlación entre las películas que tienen un mayor número de calificaciones.

![](https://www.compramejor.es/wp-content/uploads/2020/12/Toy-Storyok.jpg)

Para encontrar las calificaciones de los usuarios de "Toy Story", 


In [14]:
toyStoryRatings = movieRatings['Toy Story (1995)']
# Correlamos el resto de peliculas (columnas) con la seleccionada (toy story)  
similarMovies = movieRatings.corrwith(toyStoryRatings)
corr_similarMovies = pd.DataFrame(similarMovies, columns=['Correlacion'])
corr_similarMovies = corr_similarMovies.dropna()  
corr_similarMovies.head()

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,Correlacion
title,Unnamed: 1_level_1
"'burbs, The (1989)",0.240563
(500) Days of Summer (2009),0.353833
*batteries not included (1987),-0.427425
10 Cent Pistol (2015),1.0
10 Cloverfield Lane (2016),-0.285732


In [15]:

corr_similarMovies.sort_values('Correlacion',ascending=False).head(10)  

Unnamed: 0_level_0,Correlacion
title,Unnamed: 1_level_1
Land Before Time III: The Time of the Great Giving (1995),1.0
Faster Pussycat! Kill! Kill! (1965),1.0
Amen. (2002),1.0
"Machine Girl, The (Kataude mashin gâru) (2008)",1.0
Waydowntown (2000),1.0
Brigadoon (1954),1.0
Project X (1987),1.0
Imitation of Life (1959),1.0
Terminal Velocity (1994),1.0
Washington Square (1997),1.0


De la salida se puede ver que las películas que tienen alta correlación con "Toy Story" no son muy conocidas. Esto muestra que la correlación por sí sola no es una buena métrica para la similitud porque puede haber un usuario que vio '"Toy Story" y solo otra película y calificó a ambas como 5.
![](https://images-na.ssl-images-amazon.com/images/I/51SbdHjnNkL._SY445_.jpg)
![](https://images.photowall.com/products/59973/faster-pussycat-kill-kill-ii.jpg?h=699&q=85)
![](https://pics.filmaffinity.com/Kataude_mashin_g_ru_The_Machine_Girl_The_One_Armed_Machine_Girl-294916294-large.jpg)
![](https://pics.filmaffinity.com/Am_n-851513422-mmed.jpg)

In [30]:
ratings_mean_count = pd.DataFrame(ratings.groupby('title')['rating'].mean())
ratings_mean_count['rating_counts'] = pd.DataFrame(ratings.groupby('title')['rating'].count())
corr_similarMovies.join(ratings_mean_count['rating_counts']).sort_values('Correlacion',ascending=False)

Unnamed: 0_level_0,Correlacion,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Land Before Time III: The Time of the Great Giving (1995),1.0,3
Faster Pussycat! Kill! Kill! (1965),1.0,5
Amen. (2002),1.0,3
"Machine Girl, The (Kataude mashin gâru) (2008)",1.0,2
Waydowntown (2000),1.0,3
Brigadoon (1954),1.0,3
Project X (1987),1.0,2
Imitation of Life (1959),1.0,2
Terminal Velocity (1994),1.0,6
Washington Square (1997),1.0,2


In [17]:
import numpy as np  
movieStats = ratings.groupby('title').agg({'rating': [np.size, np.mean]})

popularMovies = movieStats['rating']['size'] >= 100


movieStats[popularMovies].sort_values([('rating', 'mean')], ascending=False)[:15]  

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Shawshank Redemption, The (1994)",317.0,4.429022
"Godfather, The (1972)",192.0,4.289062
Fight Club (1999),218.0,4.272936
"Godfather: Part II, The (1974)",129.0,4.25969
"Departed, The (2006)",107.0,4.252336
Goodfellas (1990),126.0,4.25
Casablanca (1942),100.0,4.24
"Dark Knight, The (2008)",149.0,4.238255
"Usual Suspects, The (1995)",204.0,4.237745
"Princess Bride, The (1987)",142.0,4.232394


In [31]:

df = movieStats[popularMovies].join(pd.DataFrame(corr_similarMovies, columns=['Correlacion']))

In [32]:
df.sort_values(['Correlacion'], ascending=False)[:15] 

Unnamed: 0_level_0,"(rating, size)","(rating, mean)",Correlacion
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Toy Story (1995),215.0,3.92093,1.0
"Incredibles, The (2004)",125.0,3.836,0.643301
Finding Nemo (2003),141.0,3.960993,0.618701
Aladdin (1992),183.0,3.79235,0.611892
"Monsters, Inc. (2001)",132.0,3.871212,0.490231
Mrs. Doubtfire (1993),144.0,3.388889,0.446261
"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",120.0,4.183333,0.438237
American Pie (1999),103.0,3.378641,0.420117
Die Hard: With a Vengeance (1995),144.0,3.555556,0.410939
E.T. the Extra-Terrestrial (1982),122.0,3.766393,0.409216


In [20]:
userRatings = ratings.pivot_table(index=['userId'],columns=['title'],values='rating')  
userRatings.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [21]:
corrMatrix = userRatings.corr(method='pearson', min_periods=100)  
corrMatrix.head()  

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,


In [None]:

myRatings = userRatings.loc[105].dropna()


myRatings  

In [None]:
posiblesSimilares = pd.Series()


for i in range(0, len(myRatings.index)):  
    
   
    sims = corrMatrix[myRatings.index[i]].dropna()
   
    sims = sims.map(lambda x: x * myRatings[i])
    
   
    posiblesSimilares = posiblesSimilares.append(sims)

    posiblesSimilares = posiblesSimilares.groupby(posiblesSimilares.index).sum()

    filtered = posiblesSimilares.drop(myRatings.index,errors='ignore')  
    filtered.head(10) 
    print("Similares a " + myRatings.index[i] , filtered.head(10))