# INF391 - Tarea 9
El objetivo de esta tarea es construir un sistema de recomendación basado en contenido usando procesamiento del lenguaje natural.
- El conjunto de datos son las 250 películas mejor ranqueadas de IMDB.
- Las recomendaciones estarán basadas en información como directores, actores, género y descripción de la película.
- Realizar una limpieza de los datos para considerar solo las palabras más relevantes de la descripción de la película.
- Vectorizar cada película y calcular su similaridad con el resto.
- La entrada será el título de una película y la salida debe ser una lista con las 10 más similares (Top-10).
- ¿Cómo cambian las recomendaciones si están basadas únicamente en el título de la película?
- ¿Alguna otra *feature* del conjunto original sería interesante incluir en el análisis?

In [1]:
import pandas as pd
df = pd.read_csv('IMDB_Top250.csv')

In [2]:
df = df[['Title','Genre','Director','Actors','Plot']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [3]:
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Limpieza dataframe
def limpiar_df(df):
    ## A lower case y eliminar puntuación
    ## (Dejar nombres capitalizados)
    df['Title-clean'] = df['Title'].str.lower().str.replace('[,.:]', '', regex=True)
    df['Genre'] = df['Genre'].str.lower().str.replace(',', '', regex=True)
    df['Plot'] = df['Plot'].str.lower().str.replace('[,.:]', '', regex=True)
    df['Actors'] = df['Actors'].str.replace('[,.:]', '', regex=True)
    df['Director'] = df['Director'].str.replace('[,.:]', '', regex=True)
    ## Eliminar stopwords de plot
    sw = stopwords.words('english')
    df['Title-clean'] = df['Title-clean'].apply(lambda x: ' '.join([palabra for palabra in x.split() if palabra not in sw]))
    df['Plot'] = df['Plot'].apply(lambda x: ' '.join([palabra for palabra in x.split() if palabra not in sw]))
    # Agregar columna ID
    df['id'] = df.index
    
# Unir contenido en un string
# lista de features
def componerContenido(df, features):
    df_contenido = pd.DataFrame(index=df.index)
    df_contenido['id'] = df['id']
    # Concatenar
    total = ""
    for feat in features:
        parcial = df[feat]
        total += " "+parcial
    df_contenido['contenido'] = total
    df_contenido['contenido'].str.strip()
    return df_contenido

# Proceso de lemmatización
lemmatizer = WordNetLemmatizer()
lematizar = lambda string: lemmatizer.lemmatize(string)
    
def lemmatizarLista(lista):
    #print("ANTES: ",lista)
    lista=list(map(lematizar,lista))
    #print("DESPUES: ",lista)
    return lista

def lemmatizarContenido(df):
    df['lemma'] = df['contenido'].str.split().apply(lambda x: lemmatizarLista(x))
    
limpiar_df(df)
display(df.head())
features = ['Title', 'Genre', 'Director', 'Actors', 'Plot']
contenido = componerContenido(df, features)
lemmatizarContenido(contenido)
display(contenido.head())
palabras = sum(contenido['lemma'].tolist(), [])
palabras = list(set(palabras))
palabras

D = df.shape[0]
n_palabras = len(palabras)
frecuencias = np.zeros((n_palabras, D))
idf = np.zeros((n_palabras))
i = 0
for pal in palabras:
    D_actual = 0
    for j in range(250):
        freq = contenido['lemma'][j].count(pal)
        frecuencias[i][j] = freq
        if freq > 0:
            D_actual += 1
    idf[i] = np.log2(D/D_actual)
    i += 1
display(frecuencias)
display(idf)

tf = np.copy(frecuencias)
i = 0
for pal in palabras:
    for j in range(250):
        tf_j = frecuencias[i][j]
        tf_doc = np.sum(frecuencias[:][j])
        tf[i][j] = tf_j / tf_doc
    i += 1
display(tf)


Unnamed: 0,Title,Genre,Director,Actors,Plot,Title-clean,id
0,The Shawshank Redemption,crime drama,Frank Darabont,Tim Robbins Morgan Freeman Bob Gunton William ...,two imprisoned men bond number years finding s...,shawshank redemption,0
1,The Godfather,crime drama,Francis Ford Coppola,Marlon Brando Al Pacino James Caan Richard S C...,aging patriarch organized crime dynasty transf...,godfather,1
2,The Godfather: Part II,crime drama,Francis Ford Coppola,Al Pacino Robert Duvall Diane Keaton Robert De...,early life career vito corleone 1920s new york...,godfather part ii,2
3,The Dark Knight,action crime drama,Christopher Nolan,Christian Bale Heath Ledger Aaron Eckhart Mich...,menace known joker emerges mysterious past wre...,dark knight,3
4,12 Angry Men,crime drama,Sidney Lumet,Martin Balsam John Fiedler Lee J Cobb EG Marshall,jury holdout attempts prevent miscarriage just...,12 angry men,4


Unnamed: 0,id,contenido,lemma
0,0,The Shawshank Redemption crime drama Frank Da...,"[The, Shawshank, Redemption, crime, drama, Fra..."
1,1,The Godfather crime drama Francis Ford Coppol...,"[The, Godfather, crime, drama, Francis, Ford, ..."
2,2,The Godfather: Part II crime drama Francis Fo...,"[The, Godfather:, Part, II, crime, drama, Fran..."
3,3,The Dark Knight action crime drama Christophe...,"[The, Dark, Knight, action, crime, drama, Chri..."
4,4,12 Angry Men crime drama Sidney Lumet Martin ...,"[12, Angry, Men, crime, drama, Sidney, Lumet, ..."


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

array([7.96578428, 7.96578428, 7.96578428, ..., 7.96578428, 7.96578428,
       5.96578428])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
tf_idf = (tf.T * idf).T
display(tf_idf)
similitud = cosine_similarity(tf_idf.T)
print(similitud)
similitud_sin_diag = similitud - np.eye(250)
print(similitud_sin_diag)
df_similitud = pd.DataFrame(similitud_sin_diag)
display(df_similitud)
df_similitud[103]#.argmax()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

[[1.00000000e+00 1.12517436e-02 1.01515094e-02 ... 2.60040464e-04
  3.26889255e-03 2.29727216e-04]
 [1.12517436e-02 1.00000000e+00 1.81322601e-01 ... 2.75429873e-04
  3.46234830e-03 2.43322661e-04]
 [1.01515094e-02 1.81322601e-01 1.00000000e+00 ... 2.48497393e-04
  3.12378798e-03 2.19529735e-04]
 ...
 [2.60040464e-04 2.75429873e-04 2.48497393e-04 ... 1.00000000e+00
  2.64770447e-04 2.25937516e-04]
 [3.26889255e-03 3.46234830e-03 3.12378798e-03 ... 2.64770447e-04
  1.00000000e+00 2.33905818e-04]
 [2.29727216e-04 2.43322661e-04 2.19529735e-04 ... 2.25937516e-04
  2.33905818e-04 1.00000000e+00]]
[[-3.33066907e-16  1.12517436e-02  1.01515094e-02 ...  2.60040464e-04
   3.26889255e-03  2.29727216e-04]
 [ 1.12517436e-02  0.00000000e+00  1.81322601e-01 ...  2.75429873e-04
   3.46234830e-03  2.43322661e-04]
 [ 1.01515094e-02  1.81322601e-01  0.00000000e+00 ...  2.48497393e-04
   3.12378798e-03  2.19529735e-04]
 ...
 [ 2.60040464e-04  2.75429873e-04  2.48497393e-04 ... -3.33066907e-16
   2.64770

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,-3.330669e-16,0.011252,0.010152,0.005978,0.003966,0.000252,0.032139,0.106044,0.000212,0.005526,...,0.018686,0.016782,0.023904,0.022051,0.000299,3.455735e-03,0.000224,2.600405e-04,0.003269,0.000230
1,1.125174e-02,0.000000,0.181323,0.009725,0.008122,0.000267,0.005733,0.008002,0.000225,0.005854,...,0.000000,0.036187,0.000000,0.021919,0.000316,3.660248e-03,0.000238,2.754299e-04,0.003462,0.000243
2,1.015151e-02,0.181323,0.000000,0.008774,0.007328,0.000241,0.005173,0.016083,0.007436,0.005281,...,0.000000,0.018054,0.000000,0.029751,0.045001,1.299696e-02,0.000214,2.484974e-04,0.003124,0.000220
3,5.978339e-03,0.009725,0.008774,0.000000,0.003428,0.000218,0.016487,0.003377,0.012968,0.034465,...,0.013678,0.021805,0.000000,0.000253,0.010506,2.986875e-03,0.000194,2.247593e-04,0.002825,0.000199
4,3.966083e-03,0.008122,0.007328,0.003428,0.000000,0.000252,0.000232,0.010657,0.000212,0.000237,...,0.000000,0.000000,0.006635,0.000293,0.028187,2.842691e-04,0.035506,2.597390e-04,0.000269,0.000229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,3.455735e-03,0.003660,0.012997,0.002987,0.000284,0.000271,0.005826,0.010263,0.008375,0.022707,...,0.000000,0.003249,0.000000,0.031094,0.000321,-2.220446e-16,0.000241,1.864088e-02,0.036833,0.000247
246,2.242607e-04,0.000238,0.000214,0.000194,0.035506,0.010605,0.009777,0.005956,0.000180,0.000201,...,0.000000,0.000000,0.005634,0.000248,0.012556,2.413912e-04,0.000000,2.205611e-04,0.000228,0.000195
247,2.600405e-04,0.000275,0.000248,0.000225,0.000260,0.025765,0.000229,0.000256,0.000209,0.015545,...,0.004645,0.000000,0.037781,0.000288,0.000294,1.864088e-02,0.000221,-3.330669e-16,0.000265,0.000226
248,3.268893e-03,0.003462,0.003124,0.002825,0.000269,0.009099,0.017012,0.000265,0.010720,0.028228,...,0.000000,0.003074,0.029919,0.000298,0.010773,3.683308e-02,0.000228,2.647704e-04,0.000000,0.000234


0      0.000000
1      0.011148
2      0.016677
3      0.000000
4      0.000000
         ...   
245    0.007949
246    0.000000
247    0.003827
248    0.000000
249    0.000000
Name: 103, Length: 250, dtype: float64

In [43]:
# Armar lista con las top N películas dado un título dentro del listado de las top 250 de imdb
def topN_peliculas(df_peliculas, pelicula, similitud, Ntop=10):
    size = similitud.shape[0]
    # Quitar diagonal (1 siempre es el más alto)
    similitud_sin_diag = similitud - np.eye(size)
    # Obtener id según el título de la película
    index_pelicula = df_peliculas[df_peliculas['Title'] == pelicula]['id'].item()
    top_peliculas = []
    for i in range(Ntop):
        # Calcular máximo
        index_top = similitud_sin_diag[index_pelicula].argmax()
        # Quitar top de lista actual
        similitud_sin_diag[index_pelicula, index_top] = -1
        # Extraer pelicula top
        pelicula_top = df_peliculas[(df_peliculas['id'] == index_top)]['Title'].item()
        top_peliculas.append(pelicula_top)
    return top_peliculas

titulo = "Pulp Fiction"
topN_peliculas(df, pelicula=titulo, similitud=similitud)

['Reservoir Dogs',
 'The Shawshank Redemption',
 'Inglourious Basterds',
 'Some Like It Hot',
 'Django Unchained',
 'Goodfellas',
 'Rope',
 'Raging Bull',
 'Sin City',
 'A Beautiful Mind']

In [7]:
df

Unnamed: 0,Title,Genre,Director,Actors,Plot,Title-clean,id
0,The Shawshank Redemption,crime drama,Frank Darabont,Tim Robbins Morgan Freeman Bob Gunton William ...,two imprisoned men bond number years finding s...,shawshank redemption,0
1,The Godfather,crime drama,Francis Ford Coppola,Marlon Brando Al Pacino James Caan Richard S C...,aging patriarch organized crime dynasty transf...,godfather,1
2,The Godfather: Part II,crime drama,Francis Ford Coppola,Al Pacino Robert Duvall Diane Keaton Robert De...,early life career vito corleone 1920s new york...,godfather part ii,2
3,The Dark Knight,action crime drama,Christopher Nolan,Christian Bale Heath Ledger Aaron Eckhart Mich...,menace known joker emerges mysterious past wre...,dark knight,3
4,12 Angry Men,crime drama,Sidney Lumet,Martin Balsam John Fiedler Lee J Cobb EG Marshall,jury holdout attempts prevent miscarriage just...,12 angry men,4
...,...,...,...,...,...,...,...
245,The Lost Weekend,drama film-noir,Billy Wilder,Ray Milland Jane Wyman Phillip Terry Howard Da...,desperate life chronic alcoholic followed four...,lost weekend,245
246,Short Term 12,drama,Destin Daniel Cretton,Brie Larson John Gallagher Jr Stephanie Beatri...,20-something supervising staff member resident...,short term 12,246
247,His Girl Friday,comedy drama romance,Howard Hawks,Cary Grant Rosalind Russell Ralph Bellamy Gene...,newspaper editor uses every trick book keep ac...,girl friday,247
248,The Straight Story,biography drama,David Lynch,Sissy Spacek Jane Galloway Heitz Joseph A Carp...,old man makes long journey lawn-mover tractor ...,straight story,248
