In [1]:
%%html

<style>    
    @import url("css/custom_styles.css")
</style>

<center>
    <h1>
    Transformation Et Manipulation Des Données<br>
    </h1>
    MovieLens - Système de recommandations de films par regroupement<br>
    <br>
    <b>Jean-Francois Gagnon</b><br>
    <br>
    420-A56<br>
    <br>
</center>

# Introduction

<font class="answer">
    
J'ai choisi la base de données [MovieLens Small](https://tinyurl.com/bdhmcfht). Elle se compose de 100836 notes représentant 9742 films. Elle contient également les métadonnées sur 19 genres et des liens vers [TMDB](https://www.themoviedb.org) qui permettront d'augmenter son contenu.
    
Plus spécifiquement, (Voir https://files.grouplens.org/datasets/movielens/ml-latest-small-README.html et décrire en détails le format)
    
    
    
L'objectif de ce projet est de construire un système de recommandation de films en utilisant les techniques de regroupement vues dans le cours. Les détails du système seront abordés plus loin dans ce notebook.

In [2]:
#
# imports utilitaires
#

%matplotlib inline

import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import requests
import seaborn as sns
import time

from bs4 import BeautifulSoup
from imblearn.under_sampling import RandomUnderSampler
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD

from tqdm.notebook import tqdm

#
# imports faisant partie de nos propres modules
#

import helpers as hlp
import helpers.dataset.MovieLens as mvl
import helpers.WebScraping as scrap
import helpers.Clustering as clstr

from helpers.jupyter import display_html

# Prétraitement

<font class="answer">

Description ici?

In [3]:
#
# parameters driving the processing steps, obtained from hlp.get_configs
# (presumably "config_overrides.json" overrides default values - confirm in helpers)
#
configs = hlp.get_configs("config_overrides.json")

#
# load the dataset described by configs.dataset
#
mvl_dataset = mvl.load(configs.dataset)

## links.csv

<font class="answer">
    
links.csv n'est pas utilisé directement pour le clustering. Cependant, il le sera pour compléter l'information des autres bases de données. Il m'apparaît donc important d'en faire un survol rapide.

In [4]:
# quick overview of links: shape, first rows, then the helpers.Clustering reports
# (show_na / show_types presumably list missing values and column dtypes - confirm)
print("Links", mvl_dataset.links.shape)
print("Head")
display(mvl_dataset.links.head())
clstr.show_na(mvl_dataset.links)
clstr.show_types(mvl_dataset.links)

Links (9742, 3)
Head


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


Valeur manquante 8 (0.1%)


Unnamed: 0,movieId,imdbId,tmdbId
624,791,113610,
843,1107,102336,
2141,2851,81454,
3027,4051,56600,
5532,26587,92337,
5854,32600,377059,
6059,40697,105946,
7382,79299,874957,


Types


Unnamed: 0,movieId,imdbId,tmdbId
Type,int64,string[python],float64


<font class="answer">
    
Il manque quelques liens vers [TMDB](https://www.themoviedb.org/) (ce qui explique le type float64). Les liens [IMDB](https://www.imdb.com/) seront par conséquent privilégiés pour le web scraping.

## movies.csv

<font class="answer">
    
Mettre description ici?

In [5]:
# quick overview of movies: shape, first rows, then the helpers.Clustering reports
# (show_na / show_types presumably list missing values and column dtypes - confirm)
print("Movies", mvl_dataset.movies.shape)
print("Head")
display(mvl_dataset.movies.head())
clstr.show_na(mvl_dataset.movies)
clstr.show_types(mvl_dataset.movies)

Movies (9742, 3)
Head


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Valeur manquante 0 (0.0%)
Types


Unnamed: 0,movieId,title,genres
Type,int64,object,object


<font class="answer">

Tel que décrit par MovieLens, *title* contient l'année de parution. Nous allons l'extraire afin d'obtenir une nouvelle variable. *title* sera traité un peu plus bas.

In [6]:
def split_title_year(title, regex):
    """Split a raw title into its text and its release year.

    Parameters
    ----------
    title : str
        Raw title to analyse.
    regex : compiled pattern
        Pattern searched in ``title``; group 1 is taken as the title text
        and group 2 as the year digits.

    Returns
    -------
    pandas.Series
        Entries ``"title"`` and ``"year"``. When the pattern does not match,
        the title is returned untouched and the year is ``pd.NA``. When it
        matches but group 2 is empty or missing, the year is ``pd.NA``.
    """
    match = regex.search(title)
    if match is None:
        # nothing recognised: keep the title as received
        return pd.Series({"title": title, "year": pd.NA})

    raw_year = match.group(2)
    if raw_year:
        parsed_year = int(raw_year)
    else:
        parsed_year = pd.NA

    return pd.Series({"title": match.group(1), "year": parsed_year})

# compile the title pattern coming from the configuration, then build a
# (title, year) frame by applying split_title_year to every movies.title value
title_year_re = re.compile(configs.dataset.title_regex, flags=0)        
title_year = mvl_dataset.movies.title.apply(split_title_year, args=(title_year_re,))

#
# check the extraction
#
print("Informations extraites", title_year.shape)
print("Head")
display(title_year.head())
clstr.show_na(title_year)
clstr.show_types(title_year)

Informations extraites (9742, 2)
Head


Unnamed: 0,title,year
0,Toy Story,1995
1,Jumanji,1995
2,Grumpier Old Men,1995
3,Waiting to Exhale,1995
4,Father of the Bride Part II,1995


Valeur manquante 12 (0.1%)


Unnamed: 0,title,year
6059,Babylon 5,
9031,Ready Player One,
9091,Hyena Road,
9138,The Adventures of Sherlock Holmes and Doctor W...,
9179,Nocturnal Animals,
9259,Paterson,
9367,Moonlight,
9448,The OA,
9514,Cosmos,
9515,Maria Bamford: Old Baby,


Types


Unnamed: 0,title,year
Type,object,object


<font class="answer">
   
On peut remarquer qu'il manque quelques années de parution. Il est probablement possible de les corriger par web scraping en utilisant *links.imdbId*.

In [7]:
def imdb_scap_year(response, index, imdbId):
    """Extract a release year from an IMDB page response.

    Two embedded JSON payloads are tried in order: the
    ``application/ld+json`` script (``datePublished``), then the
    ``application/json`` script
    (``props.pageProps.mainColumnData.releaseYear.year``). The second one is
    only parsed when the first did not yield a year, and a missing script tag
    is treated as "no information" instead of raising.

    Parameters
    ----------
    response : object exposing ``ok``, ``text`` and ``url``
        HTTP response for the movie page.
    index : hashable
        Row label, echoed back so the caller can apply the result.
    imdbId : str
        Unused here; presumably part of the callback signature expected by
        ``scrap.imdb_requests_parallel`` - confirm in helpers.WebScraping.

    Returns
    -------
    tuple
        ``(index, year, response.url)`` where ``year`` is ``None`` on failure.
    """
    year = None
    if response.ok:
        bs = BeautifulSoup(response.text, "html.parser")

        # first source: JSON-LD metadata; find() returns None when the tag is absent
        imdb_info = bs.find("script", attrs={"type": "application/ld+json"})
        if imdb_info is not None:
            imdb_json = json.loads(imdb_info.text)
            # presumably returns a falsy value when the path is absent - confirm
            date_published = scrap.get_nested_property(imdb_json, ["datePublished"])
            if date_published:
                year = pd.to_datetime(date_published).year

        # fallback source, consulted only when the first one gave nothing
        if year is None:
            imdb_info = bs.find("script", attrs={"type": "application/json"})
            if imdb_info is not None:
                imdb_json = json.loads(imdb_info.text)
                release_year = scrap.get_nested_property(imdb_json,
                                                         ["props",
                                                          "pageProps",
                                                          "mainColumnData",
                                                          "releaseYear",
                                                          "year"])
                if release_year:
                    year = release_year

    return (index, year, response.url)

def imdb_scap_year_apply_results(final_results):
    """Write scraped years into the notebook-level ``title_year`` frame.

    Parameters
    ----------
    final_results : iterable of (index, year, url)
        Tuples as produced by ``imdb_scap_year``. A ``None`` year is reported
        as a failure on stdout; any other value is stored in
        ``title_year.year`` at ``index``.
    """
    for index, year, url in final_results:
        if year is None:
            print(url, title_year.title[index], "Failed")
        else:
            # .at writes into the frame itself; the chained form
            # title_year.year[index] = ... may assign to a temporary copy
            title_year.at[index, "year"] = year

#    
# fix year through web scraping of imdb
#
# NOTE(review): the boolean mask is built from title_year and applied to
# links.imdbId, which assumes both share the same row index - confirm
year_na = title_year.year.isna()
year_link = mvl_dataset.links.imdbId[ year_na ]

count = year_link.shape[0]
if count > 0:
    # the duration of the scraping is measured by hlp.Profile and printed afterwards
    with hlp.Profile() as year_profile:
        results = scrap.imdb_requests_parallel(year_link,
                                               configs.web_scraping,
                                               imdb_scap_year,
                                               executor=configs.executor)
        imdb_scap_year_apply_results(results)
    print(f"Web scraping year: {year_profile.round_duration(2)}s")

# check the scraping result
clstr.show_na(title_year)

  0%|          | 0/12 [00:00<?, ?it/s]

Web scraping year: 2.04s
Valeur manquante 0 (0.0%)


In [8]:
# update movies with the extracted year; the cast to np.int64 expects that
# no missing year is left in title_year
mvl_dataset.movies["year"] = title_year.year.astype(np.int64)

In [9]:
#
# check the effect of extracting the year out of title
#
# NOTE(review): the element-wise comparison of links.movieId with movies.movieId
# assumes both frames are row-aligned on the same index - confirm
imdb_ids = mvl_dataset.links.imdbId[mvl_dataset.links.movieId == mvl_dataset.movies.movieId]

# working frame: movieId, year, the title without its year, and the imdb id
title = mvl_dataset.movies[["movieId", "year"]].copy()
title["title"] = title_year.title.copy()
title["imdbId"] = imdb_ids.copy()

def title_agregate(dataframe):
    """Summarise one group of rows sharing the same title.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows of a group; must expose ``imdbId``, ``year`` and ``movieId``.

    Returns
    -------
    pandas.Series
        ``"imdbIds"`` and ``"years"`` hold the distinct values of the group,
        ``"counts"`` the number of distinct ``movieId``.
    """
    summary = {}
    summary["imdbIds"] = dataframe.imdbId.unique()
    summary["years"] = dataframe.year.unique()
    summary["counts"] = dataframe.movieId.nunique()
    return pd.Series(summary)

# one summary row per bare title, most frequently shared titles first
groups = title.groupby("title").apply(title_agregate)
groups.sort_values(by="counts", ascending=False, inplace=True)

display(groups.head())

Unnamed: 0_level_0,imdbIds,years,counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Hamlet,"[0116477, 0040416, 0058175, 0171359, 0099726]","[1996, 1948, 1964, 2000, 1990]",5
"Christmas Carol, A","[0029992, 1067106, 0216621, 0188504]","[1938, 2009, 1999, 1977]",4
"Misérables, Les","[0113828, 0119683, 1707386, 0230534]","[1995, 1998, 2012, 2000]",4
Jane Eyre,"[0116684, 0036969, 0065911, 1229822]","[1996, 1944, 1970, 2011]",4
"Three Musketeers, The","[0108333, 0040876, 0072281, 1509767]","[1993, 1948, 1973, 2011]",4


<font class="answer">
    
Une inspection manuelle de [Hamlet 1996](http://www.imdb.com/title/tt0116477) et [Hamlet 1948](http://www.imdb.com/title/tt0040416) via IMDB permet de voir que l'année juxtaposée au titre est un identifiant unique. Je dois donc garder *title* original.

In [10]:
#
#
# check for duplicates: print the shape before and after dropping duplicated
# rows (movies is modified in place)
#
print("Vérifier doublons")
print("Avant:", mvl_dataset.movies.shape)
mvl_dataset.movies.drop_duplicates(inplace=True)
print("Après:", mvl_dataset.movies.shape)

Vérifier doublons
Avant: (9742, 4)
Après: (9742, 4)


<font class="answer">
Aucun doublon

In [11]:
# genres is categorical but stored as one string holding every modality;
# turn that string into an array ahead of pandas.get_dummies()
def genres_str_to_array(genres, splitter):
    """Split a genres string into a numpy array of modalities.

    Parameters
    ----------
    genres : str or any
        Value to convert; anything that is not a ``str`` is returned as is.
    splitter : str
        Separator between modalities inside the string.

    Returns
    -------
    numpy.ndarray or the input itself
        Array of the pieces of ``genres`` for a string input.
    """
    if not isinstance(genres, str):
        return genres

    pieces = genres.split(splitter)
    return np.array(pieces)

# assigning through .loc matters here (author's note);
# every genres string is replaced by the array built by genres_str_to_array
mvl_dataset.movies.loc[:, "genres"] = mvl_dataset.movies.genres.apply(genres_str_to_array, 
                                                                      args=(configs.dataset.genre_splitter,))
# check the change of type
display(mvl_dataset.movies.head())

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",1995
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II (1995),[Comedy],1995


In [12]:
# validation modalites genres
def gather_genres(genres, final_set):
    """Add the modalities of one movie to ``final_set``.

    Parameters
    ----------
    genres : array-like or scalar
        Genres of one movie. Only values with at least one dimension are
        collected; scalars (0-d arrays, NaN, plain strings) are ignored.
    final_set : set
        Accumulator, updated in place.
    """
    # np.ndim also accepts values without a .shape attribute (e.g. a float NaN
    # left untouched by genres_str_to_array), which would otherwise raise
    if np.ndim(genres) > 0:
        final_set.update(genres)
    
def show_unique_genres():
    """Print how many distinct genre modalities movies holds and display them.

    Reads the notebook-level ``mvl_dataset.movies.genres`` column and feeds
    every value to ``gather_genres``.
    """
    modalities = set()
    for movie_genres in mvl_dataset.movies.genres:
        gather_genres(movie_genres, modalities)

    print("Genres:", len(modalities), "modalités")
    display(modalities)
    
# print the count and the set of distinct genre modalities found in movies
show_unique_genres()

Genres: 20 modalités


{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

<font class="answer">

Une seule modalité semble étrange: IMAX. Aussi, (no genres listed) semble en fait représenter des valeurs manquantes. Ces deux points sont traités un peu plus loin dans le notebook.

In [13]:
# valider genres IMAX
def find_in_list(haystack, needle):
    """Return the result of the membership test ``needle in haystack``."""
    is_present = needle in haystack
    return is_present

def list_size(list_):
    """Return ``len(list_)``."""
    size = len(list_)
    return size

# copy of the movies whose genres contain "IMAX", with the number of genres
# of each one, to see whether IMAX ever appears alone
imax = mvl_dataset.movies[ mvl_dataset \
                              .movies \
                              .genres \
                              .apply(find_in_list, needle="IMAX") ].copy()
imax["counts"] = imax.genres.apply(list_size)
print("IMAX", imax.shape[0], "- min genres count", imax.counts.min())
display(imax.head())

IMAX 158 - min genres count 2


Unnamed: 0,movieId,title,genres,year,counts
123,150,Apollo 13 (1995),"[Adventure, Drama, IMAX]",1995,3
322,364,"Lion King, The (1994)","[Adventure, Animation, Children, Drama, Musica...",1994,6
512,595,Beauty and the Beast (1991),"[Animation, Children, Fantasy, Musical, Romanc...",1991,6
1328,1797,Everest (1998),"[Documentary, IMAX]",1998,2
2381,3159,Fantasia 2000 (1999),"[Animation, Children, Musical, IMAX]",1999,4


<font class="answer">
IMAX est en fait utilisé comme un "attribut"; il n'est jamais utilisé seul.

In [14]:
#
# examiner (no genres listed)
#    
def show_no_genres():
    """Report the movies whose genres contain "(no genres listed)".

    Prints their count, displays the first rows and returns their index.
    Reads the notebook-level ``mvl_dataset.movies`` frame.
    """
    movies = mvl_dataset.movies
    has_placeholder = movies.genres.apply(find_in_list, needle="(no genres listed)")
    no_genres = movies[has_placeholder]

    print("(no genres listed)", no_genres.shape[0])
    display(no_genres.head())
    return no_genres.index
    
# keep the index of the movies flagged "(no genres listed)"; it selects the
# imdb ids to scrape further down
no_genres_index = show_no_genres()

(no genres listed) 34


Unnamed: 0,movieId,title,genres,year
8517,114335,La cravate (1957),[(no genres listed)],1957
8684,122888,Ben-hur (2016),[(no genres listed)],2016
8687,122896,Pirates of the Caribbean: Dead Men Tell No Tal...,[(no genres listed)],2017
8782,129250,Superfast! (2015),[(no genres listed)],2015
8836,132084,Let It Be Me (1995),[(no genres listed)],1995


<font class="answer">
Je vais utiliser le web scraping pour tenter de remplacer (no genres listed).

In [21]:
def imdb_scap_genres(response, index, imdbId):
    """Extract the genres from an IMDB page response.

    The genres are read from the ``genre`` property of the page's
    ``application/ld+json`` script. A missing script tag is reported as an
    error instead of raising.

    Parameters
    ----------
    response : object exposing ``ok``, ``text``, ``url`` and ``reason``
        HTTP response for the movie page.
    index : hashable
        Row label, echoed back so the caller can apply the result.
    imdbId : str
        Unused here; presumably part of the callback signature expected by
        ``scrap.imdb_requests_parallel`` - confirm in helpers.WebScraping.

    Returns
    -------
    tuple
        ``(index, genres, response.url, None)`` on success, otherwise
        ``(index, None, response.url, error)`` where ``error`` is the HTTP
        reason, a note that the script was not found, or the dumped JSON
        payload that carried no genre.
    """
    genres = None
    error = None

    if response.ok:
        bs = BeautifulSoup(response.text, "html.parser")

        # find() returns None when the tag is absent
        imdb_info = bs.find("script", attrs={"type": "application/ld+json"})
        if imdb_info is None:
            error = "application/ld+json script not found"
        else:
            imdb_json = json.loads(imdb_info.text)
            # presumably returns a falsy value when the key is absent - confirm
            genres = scrap.get_nested_property(imdb_json, ["genre"])
            if not genres:
                genres = None
                error = json.dumps(imdb_json, indent=4)
    else:
        error = response.reason

    return (index, genres, response.url, error)

def imdb_scap_genres_apply_results(final_results, verbose=False):
    """Write scraped genres into the notebook-level ``mvl_dataset.movies`` frame.

    Parameters
    ----------
    final_results : iterable of (index, genres, url, error)
        Tuples as produced by ``imdb_scap_genres``. ``None`` genres are
        reported as a failure on stdout; otherwise the genres are stored as
        an array in ``movies.genres`` at ``index``.
    verbose : bool, default False
        When True, also print the error detail of each failure.
    """
    for index, genres, url, error in final_results:
        if genres is None:
            if verbose:
                print()
                print(error)
                print()

            print(url, mvl_dataset.movies.title[index], "Failed")
        else:
            # .at matters here: a whole array is assigned to a single cell.
            # atleast_1d keeps a lone genre string as a 1-element array
            # instead of a 0-d array
            mvl_dataset.movies.at[index, "genres"] = np.atleast_1d(genres)


# fix genres through web scraping of imdb: only the movies flagged
# "(no genres listed)" (no_genres_index) are requested
genres_imdbIds = mvl_dataset.links.imdbId[no_genres_index]

if genres_imdbIds.shape[0] > 0:
    # the duration of the scraping is measured by hlp.Profile and printed afterwards
    with hlp.Profile() as genres_profile:
        results = scrap.imdb_requests_parallel(genres_imdbIds,
                                               configs.web_scraping,
                                               imdb_scap_genres,
                                               executor=configs.executor)
        imdb_scap_genres_apply_results(results)
    print(f"Web scraping genres: {genres_profile.round_duration(2)}s")

# check the result: modalities now present, and movies still lacking genres
show_unique_genres()
show_no_genres();

  0%|          | 0/34 [00:00<?, ?it/s]

Web scraping genres: 4.45s
Genres: 24 modalités


{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'IMAX',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Short',
 'Thriller',
 'War',
 'Western'}

(no genres listed) 0


Unnamed: 0,movieId,title,genres,year


In [24]:
#
# save the processing done on movies.csv
#
# os.path.splitext removes only the final extension, so dots elsewhere in the
# path (e.g. "./dataset/movies.csv") do not truncate it
csv_path = os.path.splitext(mvl_dataset.movies_path)[0]
csv_path = "".join([csv_path, "_pretraitement", ".csv"])

print("Sauvegarde", csv_path)
mvl_dataset.movies.to_csv(csv_path, index=False)

Sauvegarde dataset/movies_pretraitement.csv


## ratings.csv

<font class="answer">
    
Mettre description ici?

In [None]:
# quick overview of ratings: shape, first rows, then the helpers.Clustering reports
# (show_na / show_types presumably list missing values and column dtypes - confirm)
print("Ratings", mvl_dataset.ratings.shape)
print("Head")
display(mvl_dataset.ratings.head())
clstr.show_na(mvl_dataset.ratings)
clstr.show_types(mvl_dataset.ratings)

In [None]:
# creer des variables pour movies: avg rating, vote_count, avg_vote_count
# montrer les duplicats userId et movieId