## Systemy rekomendacji
1. Zbiorowej filtracji  
2. Filtracji treści

### Przygotowanie danych

In [1]:
import pandas as pd
missing_values = ['na','--','?','-','None','none','non','n/a']
movies = pd.read_csv("DF_st/ml-latest-small/movies.csv",na_values=missing_values)
ratings = pd.read_csv("DF_st/ml-latest-small/ratings.csv",na_values=missing_values)

In [2]:
# Połączenie movies z rating po movieId (nazwa: movies_final)
movies_final = pd.merge(ratings, movies, on='movieId')
movies_final

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [3]:
# stworzenie macierzy rzadkiej 
# 1. rows: movieId, columns: userId, values: ratings 
# 2. macierz rzadka 

movie_features = ratings.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)
movie_features #df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
from scipy.sparse import csr_matrix
mat_movie_features = csr_matrix(movie_features.values)
mat_movie_features

<9724x610 sparse matrix of type '<class 'numpy.float64'>'
	with 100836 stored elements in Compressed Sparse Row format>

### 1. Zbiorowa filtracja

#### 1a. Podejście oparte na pamięci, Item2Item (z algorytmem kNN)

In [5]:
#model
from sklearn.neighbors import NearestNeighbors 
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_jobs=-1)
model_knn.fit(mat_movie_features)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1)

In [6]:
#rekomendacja
import numpy as np
query_index = np.random.choice(movie_features.shape[0])
distances, indices = model_knn.kneighbors(movie_features.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 10)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print ('Recommendations for {0}:\n'.format(movie_features.index[query_index]))
    else:
        print ('{0}: {1}, with distance of {2}:'.format(i, movie_features.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for 2871:

1: 1663, with distance of 0.4856971491405485:
2: 2420, with distance of 0.5143974653420524:
3: 2110, with distance of 0.5152419408727082:
4: 1129, with distance of 0.5195217209156638:
5: 3698, with distance of 0.531319081129253:
6: 2641, with distance of 0.5345534316655983:
7: 2989, with distance of 0.5467002062767286:
8: 2668, with distance of 0.5488252183638176:
9: 2533, with distance of 0.5502757780276801:


In [7]:
from fuzzywuzzy import fuzz

def fuzzy_matching(mapper, fav_movie, verbose=True):
    """
    return the closest match via fuzzy ratio. If no match found, return None
    Parameters
    ----------    
    mapper: dict, map movie title name to index of the movie in data
    fav_movie: str, name of user input movie
    verbose: bool, print log if True
    Return
    ------
    index of the closest match
    """
    match_tuple = []
    # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), fav_movie.lower())
        if ratio >= 70:
            match_tuple.append((title, idx, ratio))
    # sort
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]

def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """
    return top n similar movie recommendations based on user's input movie
    Parameters
    ----------
    model_knn: sklearn model, knn model
    data: movie-user matrix
    mapper: dict, map movie title name to index of the movie in data
    fav_movie: str, name of user input movie
    n_recommendations: int, top n recommendations
    Return
    ------
    list of top n similar movie recommendations
    """
    # fit
    model_knn.fit(data)
    # get input movie index
    print('Selected movie:', fav_movie)
    idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    # inference
    print('Recommendation system start to make inference')
    print('......\n')
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
    # get list of raw idx of recommendations
    raw_recommends = \
        sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[idx], dist))

In [8]:
# tworzymy mapper: movie title -> index
movie_to_idx = {
    movie: i for i, movie in
    enumerate(list(movies.set_index('movieId').loc[movie_features.index].title))
}

In [9]:
my_favorite = 'Pulp Fictiom'

make_recommendation(
    model_knn=model_knn,
    data=mat_movie_features,
    fav_movie=my_favorite,
    mapper=movie_to_idx,
    n_recommendations=5)

Selected movie: Pulp Fictiom
Found possible matches in our database: ['Pulp Fiction (1994)']

Recommendation system start to make inference
......

Recommendations for Pulp Fictiom:
1: Usual Suspects, The (1995), with distance of 0.3273840967732101
2: Forrest Gump (1994), with distance of 0.3144563273687092
3: Seven (a.k.a. Se7en) (1995), with distance of 0.3023463819521529
4: Shawshank Redemption, The (1994), with distance of 0.2976337826022786
5: Silence of the Lambs, The (1991), with distance of 0.2906176295563828


### 1b. Podejście oparte na modelu (SVD)

In [10]:
# tworzymy macierz rzadką (z tytułami w kolumnach)
from scipy.sparse import csr_matrix
movie_features2 = movies_final.pivot_table(
    index='userId',
    columns='title',
    values='rating'
).fillna(0)
movie_features2

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# transpozycja
X = movie_features2.values.T
type(X)

numpy.ndarray

In [49]:
# dekompozycja macierzy
import sklearn
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components = 10, random_state=12)
matrix = SVD.fit_transform(X)
matrix

array([[ 5.55459727e-01,  8.08734418e-01, -3.70793965e-01, ...,
        -8.00225664e-01, -8.42661961e-01,  2.42123598e-01],
       [ 2.34857544e-01,  4.86889653e-02,  2.83475683e-02, ...,
         4.97851060e-02, -9.49132170e-02, -4.72952424e-02],
       [ 2.51207089e-01, -4.87102852e-03, -1.36790265e-02, ...,
        -4.17391918e-02, -1.07263521e-01, -4.35679625e-03],
       ...,
       [ 9.35396268e-01,  9.31479738e-01, -1.29421918e-02, ...,
        -3.79982363e-01, -3.95640490e-01,  5.22502851e-01],
       [ 6.50333281e+00, -2.71447567e+00, -2.10390579e+00, ...,
         1.73805046e+00,  2.12883255e+00,  2.58937792e+00],
       [ 3.13533559e-02, -4.36113440e-02,  3.32840985e-03, ...,
        -2.03026447e-02, -2.58252139e-02,  5.07842163e-03]])

In [13]:
# analiza korelacji między filmami
corr = np.corrcoef(matrix)
corr.shape

(9719, 9719)

In [14]:
# z tabeli wyjściowej wyciągamy listę tytulów filmów
movie_names = movie_features2.columns
movies_list = list(movie_names)
movies_list

["'71 (2014)",
 "'Hellboy': The Seeds of Creation (2004)",
 "'Round Midnight (1986)",
 "'Salem's Lot (2004)",
 "'Til There Was You (1997)",
 "'Tis the Season for Love (2015)",
 "'burbs, The (1989)",
 "'night Mother (1986)",
 '(500) Days of Summer (2009)',
 '*batteries not included (1987)',
 '...All the Marbles (1981)',
 '...And Justice for All (1979)',
 '00 Schneider - Jagd auf Nihil Baxter (1994)',
 '1-900 (06) (1994)',
 '10 (1979)',
 '10 Cent Pistol (2015)',
 '10 Cloverfield Lane (2016)',
 '10 Items or Less (2006)',
 '10 Things I Hate About You (1999)',
 '10 Years (2011)',
 '10,000 BC (2008)',
 '100 Girls (2000)',
 '100 Streets (2016)',
 '101 Dalmatians (1996)',
 '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
 "101 Dalmatians II: Patch's London Adventure (2003)",
 '101 Reykjavik (101 Reykjavík) (2000)',
 '102 Dalmatians (2000)',
 '10th & Wolf (2006)',
 '10th Kingdom, The (2000)',
 '10th Victim, The (La decima vittima) (1965)',
 '11\'09"01 - September 11 (2002)',
 '11:14 (2

In [15]:
# wyciągamy jeden film, by zobaczyć w kolejnym etapie do jakich filmów jest podobny
selected_film = movies_list.index('American Beauty (1999)')
selected_film

405

In [50]:
corr_selected_film = corr[selected_film]
corr_selected_film

array([0.21962356, 0.68142802, 0.76257211, ..., 0.2904888 , 0.57126283,
       0.02769168])

In [51]:
list(movie_names[(corr_selected_film<1.0)&(corr_selected_film>0.9)])

['Being John Malkovich (1999)',
 'Clockwork Orange, A (1971)',
 'Eyes Wide Shut (1999)',
 'Magnolia (1999)',
 'Reservoir Dogs (1992)',
 'Run Lola Run (Lola rennt) (1998)',
 'Trainspotting (1996)']

### 2. Filtracja treści (jakie filmy możesz jeszcze polubić, na podstawie tego co już lubisz)

In [18]:
# rozdzielanie title i year
movies['year'] = movies.title.str.extract('(\(\d\d\d\d\))',expand=False)
movies['year'] = movies.year.str.extract('(\d\d\d\d)',expand=False)
movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))', '')
movies['title'] = movies['title'].apply(lambda x: x.strip())

In [19]:
movies

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017
9738,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,2017
9739,193585,Flint,Drama,2017
9740,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,2018


In [20]:
movies.isna().sum()
movies.year.fillna(0, inplace=True)

In [21]:
#kluczowa jest kolumna genres
movies['genres'] = movies.genres.str.split('|')
movies

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",2017
9738,193583,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",2017
9739,193585,Flint,[Drama],2017
9740,193587,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",2018


In [22]:
movies_genres = movies.copy(deep=True)
x = []
for index, row in movies.iterrows():
    x.append(index)
    for genre in row['genres']:
        movies_genres.at[index, genre] = 1

In [23]:
movies_genres = movies_genres.fillna(0)
movies_genres

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",2017,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9738,193583,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",2017,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9739,193585,Flint,[Drama],2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9740,193587,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",2018,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Utworzenie swojego profilu
Ja = [
{'title':'Godfather, The', 'rating':3.5},
{'title':'Pulp Fiction', 'rating':5}
]
Ja_movies = pd.DataFrame(Ja)
Ja_movies

Unnamed: 0,title,rating
0,"Godfather, The",3.5
1,Pulp Fiction,5.0


In [25]:
# Łączymy swoj profil z movieId 
Ja_moviesId = movies_genres[movies_genres['title'].isin(Ja_movies['title'].tolist())]
Ja_moviesId

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
257,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
659,858,"Godfather, The","[Crime, Drama]",1972,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
Ja_genres = pd.merge(Ja_movies, Ja_moviesId)
Ja_genres

Unnamed: 0,title,rating,movieId,genres,year,Adventure,Animation,Children,Comedy,Fantasy,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,"Godfather, The",3.5,858,"[Crime, Drama]",1972,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Pulp Fiction,5.0,296,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
Ja_genres_only = Ja_genres.drop(['genres','year','title','rating','movieId'], 1)
Ja_genres_only

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
print('Shape of profile genres_only is:',Ja_genres_only.shape)
print('Shape of profile movies is:',Ja_genres.shape)

Shape of profile genres_only is: (2, 20)
Shape of profile movies is: (2, 25)


In [52]:
moj_profil = Ja_genres_only.T.dot(Ja_genres.rating)
moj_profil

Adventure             0.0
Animation             0.0
Children              0.0
Comedy                5.0
Fantasy               0.0
Romance               0.0
Drama                 8.5
Action                0.0
Crime                 8.5
Thriller              5.0
Horror                0.0
Mystery               0.0
Sci-Fi                0.0
War                   0.0
Musical               0.0
Documentary           0.0
IMAX                  0.0
Western               0.0
Film-Noir             0.0
(no genres listed)    0.0
dtype: float64

In [30]:
movies_genres = movies_genres.set_index(movies_genres.movieId)
movies_genres

Unnamed: 0_level_0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,193581,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",2017,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,193583,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",2017,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,193585,Flint,[Drama],2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,193587,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",2018,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
movies_genres.drop(['movieId','title','genres','year'], axis=1, inplace=True)
movies_genres.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
recommendation = (movies_genres.dot(moj_profil)) / moj_profil.sum()
recommendation

movieId
1         0.185185
2         0.000000
3         0.185185
4         0.500000
5         0.185185
            ...   
193581    0.185185
193583    0.185185
193585    0.314815
193587    0.000000
193609    0.185185
Length: 9742, dtype: float64

In [33]:
recommendation.sort_values(ascending=False, inplace=True)

In [34]:
copy = movies.copy(deep=True)
copy = copy.set_index('movieId', drop=True) 
top_20_index = recommendation.index[:10].tolist() 
top_20_index

[1912, 81132, 1432, 5628, 3893, 20, 608, 6705, 27674, 57669]

In [35]:
recommended_movies = copy.loc[top_20_index, :]

In [36]:
recommended_movies

Unnamed: 0_level_0,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1912,Out of Sight,"[Comedy, Crime, Drama, Romance, Thriller]",1998
81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010
1432,Metro,"[Action, Comedy, Crime, Drama, Thriller]",1997
5628,Wasabi,"[Action, Comedy, Crime, Drama, Thriller]",2001
3893,Nurse Betty,"[Comedy, Crime, Drama, Romance, Thriller]",2000
20,Money Train,"[Action, Comedy, Crime, Drama, Thriller]",1995
608,Fargo,"[Comedy, Crime, Drama, Thriller]",1996
6705,Party Monster,"[Comedy, Crime, Drama, Thriller]",2003
27674,11:14,"[Comedy, Crime, Drama, Mystery, Thriller]",2003
57669,In Bruges,"[Comedy, Crime, Drama, Thriller]",2008


### ALS

In [37]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('MovieLens').getOrCreate()

In [55]:
movies = spark.read.csv("DF_st/ml-latest-small/movies.csv",header=True)
ratings = spark.read.csv("DF_st/ml-latest-small/ratings.csv",header=True)

In [40]:
ratings.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [41]:
from pyspark.sql.types import *
from pyspark.sql.functions import col

ratings = ratings.drop('timestamp').withColumn("userId", col("userId").cast(IntegerType()))\
                                      .withColumn("movieId", col("movieId").cast(IntegerType()))\
                                      .withColumn("rating", col("rating").cast(DoubleType()))

movies = movies.withColumn("movieId", col("movieId").cast(IntegerType()))

In [42]:
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [43]:
# dzielimy zbiór na treningowy i testowy (0.7 i 0.3)
(ratings_train, ratings_test) = ratings.randomSplit([0.7,0.3], seed = 1000)

print(' training: {0}\n test: {1}\n'\
  .format(ratings_train.count(),
          ratings_test.count()
         ))

 training: 70540
 test: 30296



In [44]:
from pyspark.ml.recommendation import ALS
#https://spark.apache.org/docs/2.1.0/ml-collaborative-filtering.html
ratings_als = ALS(userCol = "userId",
                  itemCol = "movieId",
                  ratingCol = "rating",
                  rank = 10, #latent components
                  maxIter = 10,
                  regParam = 0.1, #regularization
                  implicitPrefs = False, 
                  coldStartStrategy = "drop", 
                  )

In [45]:
#predykcje
from pyspark.ml.evaluation import RegressionEvaluator
ratings_model = ratings_als.fit(ratings_train)

ratings_pred = ratings_model.transform(ratings_test)
ratings_pred.show(10)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   362|   1645|   5.0| 3.5550983|
|   597|   1959|   4.0|  3.861864|
|    34|   1580|   2.5| 3.7310483|
|   368|   1580|   3.0|  2.923243|
|   368|   3918|   2.0| 1.6487323|
|    28|   1645|   2.5| 3.1952186|
|    27|   1580|   3.0|  3.471271|
|    27|   2142|   3.0|  2.897929|
|   332|   1645|   3.5|  2.819389|
|   577|   1959|   4.0| 3.2291894|
+------+-------+------+----------+
only showing top 10 rows



In [46]:
#ocena jakości
from pyspark.ml.evaluation import RegressionEvaluator
ratings_eval = RegressionEvaluator(metricName = "rmse", 
                                    labelCol = "rating",
                                    predictionCol = "prediction")
ratings_rmse = ratings_eval.evaluate(ratings_pred)
print("RMSE = " + str(ratings_rmse))

RMSE = 0.90223866137914


In [47]:
ratings_pred.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   362|   1645|   5.0| 3.5550983|
|   597|   1959|   4.0|  3.861864|
|    34|   1580|   2.5| 3.7310483|
|   368|   1580|   3.0|  2.923243|
|   368|   3918|   2.0| 1.6487323|
|    28|   1645|   2.5| 3.1952186|
|    27|   1580|   3.0|  3.471271|
|    27|   2142|   3.0|  2.897929|
|   332|   1645|   3.5|  2.819389|
|   577|   1959|   4.0| 3.2291894|
|   577|   2366|   3.0| 3.3609138|
|   384|   1959|   4.0| 3.4827023|
|   159|   1088|   4.0|  2.637545|
|   606|   1645|   3.5| 3.1825113|
|   606|   1829|   3.5| 2.1991036|
|   223|   1342|   1.0| 2.0416958|
|   602|    471|   4.0|  3.305328|
|   330|   1580|   4.0| 3.1064224|
|   372|    471|   3.0| 2.8834252|
|   122|   3175|   4.5| 4.1901436|
+------+-------+------+----------+
only showing top 20 rows



In [48]:
# rekomendacja 
user_recs=ratings_model.recommendForAllUsers(5).show(truncate=False)

+------+------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                       |
+------+------------------------------------------------------------------------------------------------------+
|1     |[{177593, 5.7812905}, {3379, 5.6521482}, {971, 5.6396947}, {3451, 5.567501}, {2563, 5.538487}]        |
|2     |[{131724, 4.887288}, {7842, 4.7945824}, {7841, 4.594096}, {60943, 4.594096}, {80906, 4.4865355}]      |
|3     |[{70946, 5.09276}, {5746, 4.8968515}, {6835, 4.8968515}, {5181, 4.8968515}, {4518, 4.8968515}]        |
|4     |[{82, 5.265686}, {89904, 5.0828247}, {7700, 5.043691}, {1262, 4.949155}, {1046, 4.9397445}]           |
|5     |[{177593, 4.881306}, {1207, 4.7950826}, {82, 4.7709575}, {3677, 4.7587695}, {1262, 4.7553368}]        |
|6     |[{26133, 5.0361314}, {42730, 5.0273943}, {4857, 4.984055}, {2563, 4.8659825}, {85774, 4.792117}]