In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform

In [3]:
# Load the movies table and index it by movie title so rows can be looked up by name.
data = pd.read_csv("../../10/tmdb_5000_movies.csv").set_index("title")
# Keep only the genre indicator columns, from gen_Action through gen_Western
# (a label slice, so both end columns are included).
genre_columns = slice("gen_Action", "gen_Western")
data_genres = data.loc[:, genre_columns]
data_genres.head()

Unnamed: 0_level_0,gen_Action,gen_Adventure,gen_Animation,gen_Comedy,gen_Crime,gen_Documentary,gen_Drama,gen_Family,gen_Fantasy,gen_Foreign,gen_History,gen_Horror,gen_Music,gen_Mystery,gen_Romance,gen_Science Fiction,gen_TV Movie,gen_Thriller,gen_War,gen_Western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Avatar,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Pirates of the Caribbean: At World's End,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spectre,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Dark Knight Rises,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
John Carter,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [4]:
# Show only the genres flagged for "The Fugitive" (entries equal to 1).
fugitive_genres = data_genres.loc["The Fugitive"]
fugitive_genres[fugitive_genres == 1]

gen_Action       1.0
gen_Adventure    1.0
gen_Crime        1.0
gen_Mystery      1.0
gen_Thriller     1.0
Name: The Fugitive, dtype: float64

In [11]:
# Show only the genres flagged for "Witness" (entries equal to 1).
witness_genres = data_genres.loc["Witness"]
witness_genres[witness_genres == 1]

gen_Crime       1.0
gen_Drama       1.0
gen_Romance     1.0
gen_Thriller    1.0
Name: Witness, dtype: float64

In [5]:
# Jaccard score between the genre vectors of two movies ("The Fugitive" and "Witness").
# jaccard_score is given the two vectors of genre indicators as its arguments.
fugitive_vector = data_genres.loc["The Fugitive"]
witness_vector = data_genres.loc["Witness"]
jaccard_score(fugitive_vector, witness_vector)

0.2857142857142857

In [6]:
# Jaccard score between the genre vectors of two movies ("The Fugitive" and "Titanic").
fugitive_vector = data_genres.loc["The Fugitive"]
titanic_vector = data_genres.loc["Titanic"]
jaccard_score(fugitive_vector, titanic_vector)

0.14285714285714285

In [12]:
# Jaccard distances between every pair of rows (movies) in `data_genres`.
# Unlike jaccard_score, which compares two vectors, pdist evaluates the chosen
# metric for all row pairs at once and returns them as a condensed 1-D vector.
condensed_distances = pdist(data_genres, metric="jaccard")
# Expand the condensed vector into a square movie-by-movie distance matrix.
jaccard_distances = squareform(condensed_distances)
# Turn distances into Jaccard scores (similarities): score = 1 - distance.
jaccard_score_data = 1 - jaccard_distances

In [14]:
# Wrap the Jaccard score matrix in a DataFrame whose rows and columns are both
# labelled with the index of the original `data_genres` table.
movie_titles = data_genres.index
jaccard_score_df = pd.DataFrame(
    jaccard_score_data,
    index=movie_titles,
    columns=movie_titles,
)

In [15]:
# Take the row of Jaccard scores for "The Fugitive" and order it from highest to lowest;
# the top entries are the movies whose genres are most similar to "The Fugitive".
fugitive_similarities = jaccard_score_df.loc["The Fugitive"]
fugitive_similarities.sort_values(ascending=False)

title
The Losers                 1.000000
The Fugitive               1.000000
Nancy Drew                 0.833333
The Negotiator             0.833333
xXx: State of the Union    0.833333
                             ...   
Hot Tub Time Machine 2     0.000000
Quills                     0.000000
Invictus                   0.000000
Downfall                   0.000000
My Date with Drew          0.000000
Name: The Fugitive, Length: 4803, dtype: float64

### Bonus

In [66]:
# Convert the 'release_date' column to datetime.
data['release_date'] = pd.to_datetime(data['release_date'])

# Extract the year from 'release_date' into a new 'release_year' column.
data['release_year'] = data['release_date'].dt.year

# Derive the decade range from the data instead of hard-coding 1980-2029:
# with fixed bins starting at 1979, every movie released before 1980 fell
# outside all bins and got NaN as its decade, so it could never be matched
# by a "same decade" filter.
first_decade = int(data['release_year'].min() // 10 * 10)
last_decade = int(data['release_year'].max() // 10 * 10)

# Bin edges: pd.cut uses right-closed intervals, so the edge pair (1979, 1989]
# holds exactly the years 1980..1989, and likewise for every other decade.
bins = list(range(first_decade - 1, last_decade + 10, 10))

# One label per decade, e.g. '1990s' (always one fewer label than bin edges).
labels = [f"{decade_start}s" for decade_start in range(first_decade, last_decade + 10, 10)]

# Categorise each release year into its decade and store it in a new 'decade' column.
# Rows with a missing release year stay NaN.
data['decade'] = pd.cut(data['release_year'], bins=bins, labels=labels)

In [70]:
# Attach each movie's decade label to the similarity table as an extra "decade" column.
# NOTE(review): from this point on jaccard_score_df is no longer purely numeric —
# selecting a whole row with .loc[title] now also returns the "decade" entry, so
# cells that sort a full row should run before this one (or exclude that column).
jaccard_score_df["decade"] = data["decade"]

Pokud se někomu líbí spíše starší filmy, bude pravděpodobně lepší doporučit mu také nějaký starší film. Upravíme tedy náš systém tak, aby doporučoval pouze filmy ze stejné dekády.

In [72]:
# Look up the decade that "The Fugitive" belongs to.
decade = jaccard_score_df.loc["The Fugitive", "decade"]
# Keep only the movies (rows) whose decade matches that of "The Fugitive".
is_same_decade = jaccard_score_df["decade"] == decade
movies_from_decade = jaccard_score_df[is_same_decade]
# Rank those movies by their Jaccard score against "The Fugitive" and show the top 20.
fugitive_scores_in_decade = movies_from_decade["The Fugitive"]
fugitive_scores_in_decade.sort_values(ascending=False).head(20)


title
The Fugitive                       1.000000
The Negotiator                     0.833333
The River Wild                     0.800000
The Corruptor                      0.800000
Switchback                         0.800000
Ronin                              0.800000
The Jackal                         0.800000
Assassins                          0.800000
The Long Kiss Goodnight            0.800000
Dick Tracy                         0.666667
The Net                            0.666667
The Limey                          0.666667
The Adventures of Ford Fairlane    0.666667
The Glimmer Man                    0.666667
RoboCop 3                          0.666667
Smilla's Sense of Snow             0.666667
Lethal Weapon 4                    0.666667
Lethal Weapon 3                    0.666667
8MM                                0.600000
Se7en                              0.600000
Name: The Fugitive, dtype: float64