# Sens Critique
### Pourquoi un film fonctionne ?

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re

link = 'https://www.senscritique.com'

## Collecting data

Saving best films and their ratings in the Top

In [2]:
url_sc = link + '/films/tops/top111'
page_sc = requests.get(url_sc)
soup = BeautifulSoup(page_sc.content, "html.parser")

# collectiong data from page with top films

ranking_data = {}

films = soup.find_all('div', class_="ProductListItem__Wrapper-sc-1jkxxpj-1 kusRkg")

for film in films:
    print(film)
    place = film.find('span', attrs={'data-testid': 'product-title-wrapper'}).find('span').text.strip()
    film_link = film.find('a', attrs={'data-testid': 'product-title'}).get('href')
    ratings = film.find('div', attrs={'data-testid': 'Rating'}).text
    ranking_data[place] = (film_link, ratings)
    
print(ranking_data)

<div class="ProductListItem__Wrapper-sc-1jkxxpj-1 kusRkg"><div class="ProductListItem__WrapperPoster-sc-1jkxxpj-2 gYYbKm"><div class="ProductActionsOnHover__Content-sc-1kgwlhz-0 fVFIuP" data-testid="product-actions-hover"><div class="Poster__Container-sc-yale2-1 jAoPdr" data-testid="poster" height="200" width="150"><a class="Poster__SubLink-sc-yale2-2 jhmgpI" data-testid="poster" height="200" href="/film/threat_level_midnight/449273" width="150"><span class="Poster__WrapperImage-sc-yale2-9 ZhZbK" data-srcname="https://media.senscritique.com/media/000018640834/300/threat_level_midnight.jpg" data-testid="poster-img-wrapper"><span style="box-sizing:border-box;display:inline-block;overflow:hidden;width:initial;height:initial;background:none;opacity:1;border:0;margin:0;padding:0;position:relative;max-width:100%"><span style="box-sizing:border-box;display:block;width:initial;height:initial;background:none;opacity:1;border:0;margin:0;padding:0;max-width:100%"><img alt="" aria-hidden="true" sr

In [3]:
df_ranking = pd.DataFrame.from_dict(ranking_data, 
                                      orient='index', 
                                      columns=[
                                          'film_link',
                                          'ratings'
                                          ]
                                          )

df_ranking.tail()

Unnamed: 0,film_link,ratings
46.0,/film/eve/448394,8.2
47.0,/film/les_sentiers_de_la_gloire/415777,8.2
48.0,/film/andrei_roublev/372818,8.2
49.0,/film/fenetre_sur_cour/407292,8.1
50.0,/film/m_le_maudit/380190,8.1


For films in the Top, scrape their:
- title,
- category,
- how many people liked it _(likes)_

In [4]:
film_overall_data = {}

film_links_to_scrap = np.unique(df_ranking['film_link'])

for film in film_links_to_scrap:
    print(film)
    url_film = link + film
    page_film = requests.get(url_film)
    soup_film = BeautifulSoup(page_film.content, "html.parser")
    
    genres = []

    title = soup_film.find('h1', class_='Text__SCTitle-sc-1aoldkr-1 CoverProductInfos__Title-sc-1un0kh1-1 iTBZrv UiItd').text
    category = soup_film.find('span', attrs={'data-testid': 'creators-category'}).text
    likes = soup_film.find('p', class_='Text__SCText-sc-1aoldkr-0 Stats__Text-sc-1u6v943-2 gATBvI irORIr').text
    technical_info_link = soup_film.find('a', string='Fiche technique').get('href')
    
    film_overall_data[film] = (
        title,
        category,
        likes,
        technical_info_link
    )
    
print(film_overall_data)

/film/andrei_roublev/372818
/film/apocalypse_now/488421
/film/barberousse/368097
/film/blade_runner_the_final_cut/42244431
/film/boulevard_du_crepuscule/465238
/film/dersou_ouzala/479434
/film/douze_hommes_en_colere/370894
/film/entre_le_ciel_et_l_enfer/1324688
/film/eve/448394
/film/fenetre_sur_cour/407292
/film/harakiri/402373
/film/il_etait_une_fois_dans_l_ouest/440893
/film/il_etait_une_fois_en_amerique/804173
/film/jeux_dangereux/465400
/film/l_aurore/451178
/film/la_condition_de_l_homme_1_il_n_y_a_pas_de_plus_grand_amour/480277
/film/la_femme_des_sables/409997
/film/la_vie_est_belle/434210
/film/le_bon_la_brute_et_le_truand/368376
/film/le_chagrin_et_la_pitie/434213
/film/le_dictateur/478657
/film/le_parrain/408443
/film/le_parrain_2e_partie/378648
/film/le_tombeau_des_lucioles/486492
/film/le_trou/440101
/film/le_voyage_de_chihiro/1367079
/film/les_affranchis/447001
/film/les_enfants_du_paradis/384635
/film/les_lumieres_de_la_ville/449464
/film/les_sentiers_de_la_gloire/415777
/

In [5]:
df_films_overall = pd.DataFrame.from_dict(film_overall_data, 
                                  orient='index', 
                                      columns=[
                                          'title',
                                          'category', 
                                          'likes',
                                          'technical_info_link'
                                          ]
                                          )
df_films_overall.tail()

Unnamed: 0,title,category,likes,technical_info_link
/film/sherlock_junior/494192,Sherlock Junior,Moyen-métrage,580,/film/sherlock_junior/494192/details
/film/soy_cuba/390558,Soy Cuba,Film,488,/film/soy_cuba/390558/details
/film/threat_level_midnight/449273,Threat Level Midnight,Téléfilm,123,/film/threat_level_midnight/449273/details
/film/vol_au_dessus_d_un_nid_de_coucou/447958,Vol au-dessus d'un nid de coucou,Film,6.6K,/film/vol_au_dessus_d_un_nid_de_coucou/447958/...
/film/voyage_au_bout_de_l_enfer/376439,Voyage au bout de l'enfer,Film,2.5K,/film/voyage_au_bout_de_l_enfer/376439/details


For films in the Top, scrape additional information about their:
- Technical information:
    - original title _(original_title)_
    - other titles _(also_known_as)_
    - genres	
    - release year	
    - countries of origin	
    - duration	
    - release date in the country of origin (release_date_orig)
    - release date in France _(release_date_france)_
    - budget	
    - synopsis
- People involved in the production:
    - directors _(directors_link)_
    - writers _(writers_link)_
    - producers _(producers_link)_
    - distributors _(distributors_link)_
    - actors _(actors_link)_

In [6]:
film_technical_data = {}

film_tech_links_to_scrap = np.unique(df_films_overall['technical_info_link'])

for film in film_tech_links_to_scrap:
    url_film = link + film
    page_film = requests.get(url_film)
    soup_film = BeautifulSoup(page_film.content, "html.parser")

    print(film)
    # title = soup_film.find('h1', class_='Text__SCTitle-sc-1aoldkr-1 iTBZrv').text if soup_film.find('h1', class_='Text__SCTitle-sc-1aoldkr-1 iTBZrv') else None
    original_title = soup_film.find("span", string="Titre original : ").find_next_sibling(string=True) if soup_film.find("span", string="Titre original : ") else None
    aka = [aka.get_text(strip=True) for aka in soup_film.find("span", string="Aussi connu sous le nom de : ").find_next_siblings('span')] if soup_film.find("span", string="Aussi connu sous le nom de : ") else None
    genres = [genre.get_text(strip=True) for genre in soup_film.find("span", string=re.compile(r"Genres? : ")).find_next_siblings("a")] if soup_film.find("span", string=re.compile(r"Genres? : ")) else None
    year = soup_film.find('span', string='Année : ').find_next_sibling(string=True) if soup_film.find('span', string='Année : ') else None
    countries = [country.get_text(strip=True) for country in soup_film.find('span', string='Pays d\'origine : ').find_parent().find_next_siblings()] if soup_film.find('span', string='Pays d\'origine : ') else None
    duration = soup_film.find('span', string='Durée : ').find_next_sibling(string=True) if soup_film.find('span', string='Durée : ') else None
    release_pattern = re.compile(r"Date de sortie \((?!France)([^)]+)\) :")
    release_orig = soup_film.find('span', string=release_pattern).find_next_sibling(string=True) if soup_film.find('span', string=release_pattern) else None
    release_france = soup_film.find('span', string='Date de sortie (France) : ').find_next_sibling(string=True) if soup_film.find('span', string='Date de sortie (France) : ') else None
    directors_link = soup_film.find('span', string=re.compile(r"Réalisateurs? : ")).find_parent().find_next_sibling().get('href') if soup_film.find('span', string=re.compile(r"Réalisateurs? : ")) else None
    writers_link = [writers.get('href') for writers in soup_film.find('span', string=re.compile(r"Scénaristes? : ")).find_parent().find_next_siblings("a")] if soup_film.find('span', string=re.compile(r"Scénaristes? : ")) else None
    producers_link = [producers.get('href') for producers in soup_film.find('span', string=re.compile(r"Producteurs? : ")).find_parent().find_next_siblings("a")] if soup_film.find('span', string=re.compile(r"Producteurs? : ")) else None
    distributors_link = [distributors.get('href') for distributors in soup_film.find('span', string=re.compile(r"Distributeurs? : ")).find_parent().find_next_siblings("a")] if soup_film.find('span', string=re.compile(r"Distributeurs? : ")) else None
    actors_link = [actors.get('href') for actors in soup_film.find_all('a', class_= 'Text__SCText-sc-1aoldkr-0 Link__SecondaryLink-sc-1v081j9-1 gATBvI jacWTu')] if soup_film.find_all('a', class_= 'Text__SCText-sc-1aoldkr-0 Link__SecondaryLink-sc-1v081j9-1 gATBvI jacWTu') else None
    budget = soup_film.find('span', string='Budget : ').find_next_sibling(string=True) if soup_film.find('span', string='Budget : ') else None
    synopsis = soup_film.find('span', string='Synopsis : ').find_next_sibling(string=True) if soup_film.find('span', string='Synopsis : ') else None




    film_technical_data[film] = (
        # title,
        original_title,
        aka,
        genres,
        year,
        countries,
        duration,
        release_orig,
        release_france,
        directors_link,
        writers_link,
        producers_link,
        distributors_link,
        actors_link,
        budget,
        synopsis

    )
    
print(film_technical_data)

/film/andrei_roublev/372818/details
/film/apocalypse_now/488421/details
/film/barberousse/368097/details
/film/blade_runner_the_final_cut/42244431/details
/film/boulevard_du_crepuscule/465238/details
/film/dersou_ouzala/479434/details
/film/douze_hommes_en_colere/370894/details
/film/entre_le_ciel_et_l_enfer/1324688/details
/film/eve/448394/details
/film/fenetre_sur_cour/407292/details
/film/harakiri/402373/details
/film/il_etait_une_fois_dans_l_ouest/440893/details
/film/il_etait_une_fois_en_amerique/804173/details
/film/jeux_dangereux/465400/details
/film/l_aurore/451178/details
/film/la_condition_de_l_homme_1_il_n_y_a_pas_de_plus_grand_amour/480277/details
/film/la_femme_des_sables/409997/details
/film/la_vie_est_belle/434210/details
/film/le_bon_la_brute_et_le_truand/368376/details
/film/le_chagrin_et_la_pitie/434213/details
/film/le_dictateur/478657/details
/film/le_parrain/408443/details
/film/le_parrain_2e_partie/378648/details
/film/le_tombeau_des_lucioles/486492/details
/film/

In [7]:
df_films_tech = pd.DataFrame.from_dict(film_technical_data, 
                                      orient='index', 
                                      columns=[
                                        #   'title',
                                          'original_title',
                                          'also_known_as',
                                          'genres',
                                          'release_year',
                                          'countries_of_origin',
                                          'duration',
                                          'release_date_orig',
                                          'release_date_france',
                                          'directors_link',
                                          'writers_link',
                                          'producers_link',
                                          'distributors_link',
                                          'actors_link',
                                          'budget',
                                          'synopsis'
                                          ]
                                          )

df_films_tech.tail()


Unnamed: 0,original_title,also_known_as,genres,release_year,countries_of_origin,duration,release_date_orig,release_date_france,directors_link,writers_link,producers_link,distributors_link,actors_link,budget,synopsis
/film/sherlock_junior/494192/details,Sherlock Jr.,,"[Muet, Comédie, Action, Romance]",1924,[États-Unis],45 min,11 mai 1924,28 octobre 1924,/contact/Buster_Keaton/5696,,"[/contact/Buster_Keaton/5696, /contact/Joseph_...",[/contact/Splendor_Films/422618],"[/contact/buster_keaton/5696, /contact/kathryn...",,"Un projectionniste, qui rêve d'être détective,..."
/film/soy_cuba/390558/details,,"[I Am Cuba,, Ya Kuba,, Я - Куба]",[Drame],1964,"[Cuba,, Union Soviétique]",2 h 21 min,26 octobre 1964,16 juillet 2003,/contact/Mikhail_Kalatozov/10915,"[/contact/Enrique_Pineda_Barnet/123498, /conta...",,"[/contact/MK2_Diffusion/422298, /contact/Potem...","[/contact/Sergio_Corrieri/21087, /contact/Salv...",,"A travers quatre histoires, ""Soy Cuba"" décrit ..."
/film/threat_level_midnight/449273/details,,,[Comédie],2011,[États-Unis],30 min,,17 février 2011,/contact/Tucker_Gates/1110101,"[/contact/greg_daniels/275178, /contact/b_j_no...",,,"[/contact/steve_carell/72401, /contact/rainn_w...",,"Après 11 ans de préparation, Michael Scott nou..."
/film/vol_au_dessus_d_un_nid_de_coucou/447958/details,One Flew Over the Cuckoo's Nest,,"[Drame, Comédie dramatique]",1975,[États-Unis],2 h 13 min,21 novembre 1975,1 mars 1976,/contact/Milos_Forman/614,"[/contact/Lawrence_Hauben/35309, /contact/Bo_G...","[/contact/Michael_Douglas/525, /contact/Saul_Z...",[/contact/United_Artists/556452],"[/contact/jack_nicholson/2, /contact/louise_fl...",$4 400 000,"Pour échapper à la prison, Randall P. McMurphy..."
/film/voyage_au_bout_de_l_enfer/376439/details,The Deer Hunter,,"[Drame, Guerre]",1978,"[États-Unis,, Royaume-Uni]",3 h 03 min,8 décembre 1978,7 mars 1979,/contact/Michael_Cimino/5885,[/contact/Deric_Washburn/1535256],"[/contact/Michael_Cimino/5885, /contact/Michae...",[/contact/Carlotta_Films/422349],"[/contact/robert_de_niro/13, /contact/john_caz...",15 000 000 $,Une analyse en profondeur de la façon dont la ...


For People involved in the production, save their:
- Technical information:
    - name _(person_name)_
    - description
    - how many people liked them _(likes)_

In [8]:
people_data = {}
unique_values = set()

for column in ['directors_link', 'writers_link', 'producers_link', 'distributors_link', 'actors_link']:
    for cell in df_films_tech[column]:
        if isinstance(cell, list):
            unique_values.update(cell)
        if isinstance(cell, str):
            unique_values.add(cell)

people_links_to_scrape = list(unique_values)

for person in people_links_to_scrape:
    print(person)
    url_person = link + person
    page_person = requests.get(url_person)
    soup_person = BeautifulSoup(page_person.content, "html.parser")

    person_name = soup_person.find('h1', class_="Text__SCTitle-sc-1aoldkr-1").text
    description = soup_person.find('p', class_="CoverContactInfos__Description-sc-1402al5-8").text
    likes = soup_person.find('p', class_=lambda x: x and x.startswith("Text__SCText-sc-1aoldkr-0 Stats__Text-sc-1u6v943-2 gATBvI")).text

    people_data[person] = (person_name, description, likes)
    
print(people_data)

/contact/nicholas_pileggi/96787


/contact/Emmanuel_d_Astier_de_la_Vigerie/4892844
/contact/Jean_Keraudy/7054
/contact/Saiichiro_Ujie/5446136
/contact/Albert_S_Ruddy/9255
/contact/Ernst_Lubitsch/38
/contact/george_davis/5973
/contact/Ivan_Bykov/1375427
/contact/Martin_Balsam/3627
/contact/john_krasinski/84703
/contact/harrison_ford/994
/contact/henry_daniell/486
/contact/erwin_connelly/1355845
/contact/Splendor_Films/422618
/contact/Melchior_Lengyel/501151
/contact/kathryn_mc_guire/8734
/contact/tim_roth/17693
/contact/Marc_Michel/3919
/contact/Anatoli_Solonitsyne/5072
/contact/Liubomiras_Lauciavicius/1179265
/contact/james_stewart/970
/contact/brad_dourif/1589
/contact/Irma_Raush/1984026
/contact/akira_ishihama/98233
/contact/tetsuro_tamba/505977
/contact/Action_Cinemas_Theatre_du_Temple/422327
/contact/sean_young/1591
/contact/celeste_holm/9067
/contact/ari_aster/896473
/contact/Ivan_Lapikov/11061
/contact/rutger_hauer/4463
/contact/erich_von_stroheim/2913
/contact/Akemi_Yamaguchi/31652
/contact/Jean_Paul_Coquelin/11

In [9]:
df_people = pd.DataFrame.from_dict(people_data, 
                                      orient='index', 
                                      columns=['person_name', 'description', 'likes']
                                          )

df_people.tail()

Unnamed: 0,person_name,description,likes
/contact/Nobuo_Nakamura/6480,Nobuo Nakamura,acteur japonais né le 13 septembre 1908 à Otar...,4 aiment
/contact/Hiroyuki_Nishimoto/26790,Hiroyuki Nishimoto,acteur.,0
/contact/Charles_de_Lauzirika/543812,Charles de Lauzirika,"acteur, réalisateur, producteur et scénariste.",5 aiment
/contact/Ginzo_Sekiguchi/26788,Ginzô Sekiguchi,acteur.,0
/contact/Bino_Cicogna/51656,Bino Cicogna,producteur.,0


## Cleaning

Chack what we have now

In [10]:
# we have
print('df_ranking')
print(df_ranking.dtypes)
display(df_ranking.tail(1))

print('df_films_overall')
print(df_films_overall.dtypes)
display(df_films_overall.tail(1))

print('df_films_tech')
print(df_films_tech.dtypes)
display(df_films_tech.tail(1))

print('df_people')
print(df_people.dtypes)
display(df_people.tail(1))

df_ranking
film_link    object
ratings      object
dtype: object


Unnamed: 0,film_link,ratings
50.0,/film/m_le_maudit/380190,8.1


df_films_overall
title                  object
category               object
likes                  object
technical_info_link    object
dtype: object


Unnamed: 0,title,category,likes,technical_info_link
/film/voyage_au_bout_de_l_enfer/376439,Voyage au bout de l'enfer,Film,2.5K,/film/voyage_au_bout_de_l_enfer/376439/details


df_films_tech
original_title         object
also_known_as          object
genres                 object
release_year           object
countries_of_origin    object
duration               object
release_date_orig      object
release_date_france    object
directors_link         object
writers_link           object
producers_link         object
distributors_link      object
actors_link            object
budget                 object
synopsis               object
dtype: object


Unnamed: 0,original_title,also_known_as,genres,release_year,countries_of_origin,duration,release_date_orig,release_date_france,directors_link,writers_link,producers_link,distributors_link,actors_link,budget,synopsis
/film/voyage_au_bout_de_l_enfer/376439/details,The Deer Hunter,,"[Drame, Guerre]",1978,"[États-Unis,, Royaume-Uni]",3 h 03 min,8 décembre 1978,7 mars 1979,/contact/Michael_Cimino/5885,[/contact/Deric_Washburn/1535256],"[/contact/Michael_Cimino/5885, /contact/Michae...",[/contact/Carlotta_Films/422349],"[/contact/robert_de_niro/13, /contact/john_caz...",15 000 000 $,Une analyse en profondeur de la façon dont la ...


df_people
person_name    object
description    object
likes          object
dtype: object


Unnamed: 0,person_name,description,likes
/contact/Bino_Cicogna/51656,Bino Cicogna,producteur.,0


In DataFrames, change saved *links* to *indexes* to connect tables on them

In [11]:
# deleting links and saving indexes


def extract_index(url):
    match = re.search(r'/(\d+)$', url) # Use regular expression to extract the index (digits) from the URL
    if match:
        return int(match.group(1))  # Convert to integer
    else:
        return None

def url_to_index_values(df):
    for col in df.columns:
        df[col] = df[col].apply(lambda cell: [extract_index(item) if isinstance(item, str) and re.search(r'/(\d+)$', item)
                                            else item for item in cell] if isinstance(cell, list) 
                                            else extract_index(cell) if isinstance(cell, str) and re.search(r'/(\d+)$', cell) 
                                            else cell)
    return df

def change_column_names(df):
    renamed_columns = {}
    for col in df.columns:
        if col.endswith('_link'):
            new_col_name = col[:-5] + '_index'
            renamed_columns[col] = new_col_name

    return df.rename(columns=renamed_columns)

def url_to_index(df):
    df = url_to_index_values(df)
    df = change_column_names(df)
    return df

    

df_ranking_cleaned = df_ranking.copy()
df_ranking_cleaned.reset_index(inplace=True)
df_ranking_cleaned = url_to_index(df_ranking_cleaned)

df_films_cleaned = pd.concat([df_films_overall, df_films_tech.rename(index=lambda x: x.replace('/details', ''))], axis=1, ignore_index=False).drop(columns=['technical_info_link'])
df_films_cleaned.reset_index(inplace=True)
df_films_cleaned = url_to_index(df_films_cleaned)

df_people_cleaned = df_people.copy()
df_people_cleaned.reset_index(inplace=True)
df_people_cleaned = url_to_index(df_people_cleaned)

      
display(df_ranking_cleaned.tail())
display(df_films_cleaned.tail())
display(df_people_cleaned.tail())


Unnamed: 0,index,film_index,ratings
45,46.0,448394,8.2
46,47.0,415777,8.2
47,48.0,372818,8.2
48,49.0,407292,8.1
49,50.0,380190,8.1


Unnamed: 0,index,title,category,likes,original_title,also_known_as,genres,release_year,countries_of_origin,duration,release_date_orig,release_date_france,directors_index,writers_index,producers_index,distributors_index,actors_index,budget,synopsis
45,494192,Sherlock Junior,Moyen-métrage,580,Sherlock Jr.,,"[Muet, Comédie, Action, Romance]",1924,[États-Unis],45 min,11 mai 1924,28 octobre 1924,5696,,"[5696, 4864983]",[422618],"[5696, 8734, 8831, 1355845, 12334, 5855205, 59...",,"Un projectionniste, qui rêve d'être détective,..."
46,390558,Soy Cuba,Film,488,,"[I Am Cuba,, Ya Kuba,, Я - Куба]",[Drame],1964,"[Cuba,, Union Soviétique]",2 h 21 min,26 octobre 1964,16 juillet 2003,10915,"[123498, 3015618]",,"[422298, 1126663]","[21087, 60003, 74774, 3226987, 74773, 932, 518...",,"A travers quatre histoires, ""Soy Cuba"" décrit ..."
47,449273,Threat Level Midnight,Téléfilm,123,,,[Comédie],2011,[États-Unis],30 min,,17 février 2011,1110101,"[275178, 91341]",,,"[72401, 54016, 84703, 105750, 106764, 91341, 8...",,"Après 11 ans de préparation, Michael Scott nou..."
48,447958,Vol au-dessus d'un nid de coucou,Film,6.6K,One Flew Over the Cuckoo's Nest,,"[Drame, Comédie dramatique]",1975,[États-Unis],2 h 13 min,21 novembre 1975,1 mars 1976,614,"[35309, 496203]","[525, 625, 11018]",[556452],"[2, 11016, 11003, 7307, 1589, 32102, 7, 527, 1...",$4 400 000,"Pour échapper à la prison, Randall P. McMurphy..."
49,376439,Voyage au bout de l'enfer,Film,2.5K,The Deer Hunter,,"[Drame, Guerre]",1978,"[États-Unis,, Royaume-Uni]",3 h 03 min,8 décembre 1978,7 mars 1979,5885,[1535256],"[5885, 5446312, 59968, 31541]",[422349],"[13, 9182, 1867, 759, 14, 22461, 59966, 20425,...",15 000 000 $,Une analyse en profondeur de la façon dont la ...


Unnamed: 0,index,person_name,description,likes
657,6480,Nobuo Nakamura,acteur japonais né le 13 septembre 1908 à Otar...,4 aiment
658,26790,Hiroyuki Nishimoto,acteur.,0
659,543812,Charles de Lauzirika,"acteur, réalisateur, producteur et scénariste.",5 aiment
660,26788,Ginzô Sekiguchi,acteur.,0
661,51656,Bino Cicogna,producteur.,0


For numbers, change their type to numerical

In [12]:
# fixing wrong format. ex: '46.' -> '46'

df_ranking_cleaned['index'] = df_ranking_cleaned['index'].str.rstrip('.').astype(int) 

# display(df_ranking_cleaned.tail())

In [13]:
def custom_converter(value): # Define a custom function to convert values to integers
    try:
        # Try to convert to integer directly
        return int(value)
    except ValueError:
        # If it's not a plain integer, check for 'K' and 'M'
        if value.endswith('K'):
            return int(float(value[:-1]) * 1000)
        elif value.endswith('M'):
            return int(float(value[:-1]) * 1000000)
        else:
            # Handle other cases as needed
            return None

df_films_cleaned['likes'] = df_films_cleaned['likes'].apply(custom_converter)
        
# display(df_films_cleaned.tail())

In [14]:
def clean_likes(value):
    try:
        return int(value)
    except ValueError:
        if 'aiment' in value:
            val = value.replace('aiment', '')
            try:
                return int(val)
            except ValueError:
                return custom_converter(val)
        else:
            return None


df_people_cleaned['likes'] = df_people_cleaned['likes'].apply(clean_likes)

# display(df_people_cleaned.tail())

If DataFrame will be used in SQL, creating new connections DataFrames for columns with lists, since SQL cannot read lists 

In [15]:
# handling lists
# creating new tables with connections 
# bc sql can't read lists

# there are only lists in df_films_cleaned

def seperate_lists(df):
    exploded = {}
    for column in df.columns:
        if isinstance(df[column].iloc[0], list):  # Check if cell in the column contains a list
            exploded[column[:-6] if '_index' in column else column] = df[['index', column]].explode(column).dropna()
            df = df.drop(column, axis=1)
    df_exploded = {}
    for key, value in exploded.items():
        df_exploded[key] = pd.DataFrame(value)
    return(df, df_exploded)

df_films_cleaned_shortened, df_film_extra = seperate_lists(df_films_cleaned)


display(df_films_cleaned_shortened.tail())
for key, value in df_film_extra.items():
    display(df_film_extra[key].tail())



Unnamed: 0,index,title,category,likes,original_title,release_year,duration,release_date_orig,release_date_france,directors_index,budget,synopsis
45,494192,Sherlock Junior,Moyen-métrage,580,Sherlock Jr.,1924,45 min,11 mai 1924,28 octobre 1924,5696,,"Un projectionniste, qui rêve d'être détective,..."
46,390558,Soy Cuba,Film,488,,1964,2 h 21 min,26 octobre 1964,16 juillet 2003,10915,,"A travers quatre histoires, ""Soy Cuba"" décrit ..."
47,449273,Threat Level Midnight,Téléfilm,123,,2011,30 min,,17 février 2011,1110101,,"Après 11 ans de préparation, Michael Scott nou..."
48,447958,Vol au-dessus d'un nid de coucou,Film,6600,One Flew Over the Cuckoo's Nest,1975,2 h 13 min,21 novembre 1975,1 mars 1976,614,$4 400 000,"Pour échapper à la prison, Randall P. McMurphy..."
49,376439,Voyage au bout de l'enfer,Film,2500,The Deer Hunter,1978,3 h 03 min,8 décembre 1978,7 mars 1979,5885,15 000 000 $,Une analyse en profondeur de la façon dont la ...


Unnamed: 0,index,also_known_as
43,404608,"Requiem pour un Maϟϟacre,"
43,404608,Иди и смотри
46,390558,"I Am Cuba,"
46,390558,"Ya Kuba,"
46,390558,Я - Куба


Unnamed: 0,index,genres
47,449273,Comédie
48,447958,Drame
48,447958,Comédie dramatique
49,376439,Drame
49,376439,Guerre


Unnamed: 0,index,countries_of_origin
46,390558,Union Soviétique
47,449273,États-Unis
48,447958,États-Unis
49,376439,"États-Unis,"
49,376439,Royaume-Uni


Unnamed: 0,index,writers_index
47,449273,275178
47,449273,91341
48,447958,35309
48,447958,496203
49,376439,1535256


Unnamed: 0,index,producers_index
48,447958,11018
49,376439,5885
49,376439,5446312
49,376439,59968
49,376439,31541


Unnamed: 0,index,distributors_index
45,494192,422618
46,390558,422298
46,390558,1126663
48,447958,556452
49,376439,422349


Unnamed: 0,index,actors_index
49,376439,22461
49,376439,59966
49,376439,20425
49,376439,22894
49,376439,1109124


In [16]:
# deleing commas at the end of variables
for key, value in df_film_extra.items():
    try:
        df_film_extra[key].iloc[:, 1] = df_film_extra[key].iloc[:, 1].str.replace(',', '')
    except:
        pass
    display(df_film_extra[key].tail())

Unnamed: 0,index,also_known_as
43,404608,Requiem pour un Maϟϟacre
43,404608,Иди и смотри
46,390558,I Am Cuba
46,390558,Ya Kuba
46,390558,Я - Куба


Unnamed: 0,index,genres
47,449273,Comédie
48,447958,Drame
48,447958,Comédie dramatique
49,376439,Drame
49,376439,Guerre


Unnamed: 0,index,countries_of_origin
46,390558,Union Soviétique
47,449273,États-Unis
48,447958,États-Unis
49,376439,États-Unis
49,376439,Royaume-Uni


Unnamed: 0,index,writers_index
47,449273,275178
47,449273,91341
48,447958,35309
48,447958,496203
49,376439,1535256


Unnamed: 0,index,producers_index
48,447958,11018
49,376439,5885
49,376439,5446312
49,376439,59968
49,376439,31541


Unnamed: 0,index,distributors_index
45,494192,422618
46,390558,422298
46,390558,1126663
48,447958,556452
49,376439,422349


Unnamed: 0,index,actors_index
49,376439,22461
49,376439,59966
49,376439,20425
49,376439,22894
49,376439,1109124


## Result

Show seperate DataFrames with connections

In [17]:
print('df_films_cleaned_shortened')
display(df_films_cleaned_shortened.tail())

for key, value in df_film_extra.items():
    print('df_films_to_' + key)
    display(df_film_extra[key].tail())

df_films_cleaned_shortened


Unnamed: 0,index,title,category,likes,original_title,release_year,duration,release_date_orig,release_date_france,directors_index,budget,synopsis
45,494192,Sherlock Junior,Moyen-métrage,580,Sherlock Jr.,1924,45 min,11 mai 1924,28 octobre 1924,5696,,"Un projectionniste, qui rêve d'être détective,..."
46,390558,Soy Cuba,Film,488,,1964,2 h 21 min,26 octobre 1964,16 juillet 2003,10915,,"A travers quatre histoires, ""Soy Cuba"" décrit ..."
47,449273,Threat Level Midnight,Téléfilm,123,,2011,30 min,,17 février 2011,1110101,,"Après 11 ans de préparation, Michael Scott nou..."
48,447958,Vol au-dessus d'un nid de coucou,Film,6600,One Flew Over the Cuckoo's Nest,1975,2 h 13 min,21 novembre 1975,1 mars 1976,614,$4 400 000,"Pour échapper à la prison, Randall P. McMurphy..."
49,376439,Voyage au bout de l'enfer,Film,2500,The Deer Hunter,1978,3 h 03 min,8 décembre 1978,7 mars 1979,5885,15 000 000 $,Une analyse en profondeur de la façon dont la ...


df_films_to_also_known_as


Unnamed: 0,index,also_known_as
43,404608,Requiem pour un Maϟϟacre
43,404608,Иди и смотри
46,390558,I Am Cuba
46,390558,Ya Kuba
46,390558,Я - Куба


df_films_to_genres


Unnamed: 0,index,genres
47,449273,Comédie
48,447958,Drame
48,447958,Comédie dramatique
49,376439,Drame
49,376439,Guerre


df_films_to_countries_of_origin


Unnamed: 0,index,countries_of_origin
46,390558,Union Soviétique
47,449273,États-Unis
48,447958,États-Unis
49,376439,États-Unis
49,376439,Royaume-Uni


df_films_to_writers


Unnamed: 0,index,writers_index
47,449273,275178
47,449273,91341
48,447958,35309
48,447958,496203
49,376439,1535256


df_films_to_producers


Unnamed: 0,index,producers_index
48,447958,11018
49,376439,5885
49,376439,5446312
49,376439,59968
49,376439,31541


df_films_to_distributors


Unnamed: 0,index,distributors_index
45,494192,422618
46,390558,422298
46,390558,1126663
48,447958,556452
49,376439,422349


df_films_to_actors


Unnamed: 0,index,actors_index
49,376439,22461
49,376439,59966
49,376439,20425
49,376439,22894
49,376439,1109124


Show DataFrames with all data

In [18]:
print('df_ranking_cleaned')
display(df_ranking_cleaned.tail())

print('df_films_cleaned')
display(df_films_cleaned.tail())

print('df_people_cleaned')
display(df_people_cleaned.tail())


df_ranking_cleaned


Unnamed: 0,index,film_index,ratings
45,46,448394,8.2
46,47,415777,8.2
47,48,372818,8.2
48,49,407292,8.1
49,50,380190,8.1


df_films_cleaned


Unnamed: 0,index,title,category,likes,original_title,also_known_as,genres,release_year,countries_of_origin,duration,release_date_orig,release_date_france,directors_index,writers_index,producers_index,distributors_index,actors_index,budget,synopsis
45,494192,Sherlock Junior,Moyen-métrage,580,Sherlock Jr.,,"[Muet, Comédie, Action, Romance]",1924,[États-Unis],45 min,11 mai 1924,28 octobre 1924,5696,,"[5696, 4864983]",[422618],"[5696, 8734, 8831, 1355845, 12334, 5855205, 59...",,"Un projectionniste, qui rêve d'être détective,..."
46,390558,Soy Cuba,Film,488,,"[I Am Cuba,, Ya Kuba,, Я - Куба]",[Drame],1964,"[Cuba,, Union Soviétique]",2 h 21 min,26 octobre 1964,16 juillet 2003,10915,"[123498, 3015618]",,"[422298, 1126663]","[21087, 60003, 74774, 3226987, 74773, 932, 518...",,"A travers quatre histoires, ""Soy Cuba"" décrit ..."
47,449273,Threat Level Midnight,Téléfilm,123,,,[Comédie],2011,[États-Unis],30 min,,17 février 2011,1110101,"[275178, 91341]",,,"[72401, 54016, 84703, 105750, 106764, 91341, 8...",,"Après 11 ans de préparation, Michael Scott nou..."
48,447958,Vol au-dessus d'un nid de coucou,Film,6600,One Flew Over the Cuckoo's Nest,,"[Drame, Comédie dramatique]",1975,[États-Unis],2 h 13 min,21 novembre 1975,1 mars 1976,614,"[35309, 496203]","[525, 625, 11018]",[556452],"[2, 11016, 11003, 7307, 1589, 32102, 7, 527, 1...",$4 400 000,"Pour échapper à la prison, Randall P. McMurphy..."
49,376439,Voyage au bout de l'enfer,Film,2500,The Deer Hunter,,"[Drame, Guerre]",1978,"[États-Unis,, Royaume-Uni]",3 h 03 min,8 décembre 1978,7 mars 1979,5885,[1535256],"[5885, 5446312, 59968, 31541]",[422349],"[13, 9182, 1867, 759, 14, 22461, 59966, 20425,...",15 000 000 $,Une analyse en profondeur de la façon dont la ...


df_people_cleaned


Unnamed: 0,index,person_name,description,likes
657,6480,Nobuo Nakamura,acteur japonais né le 13 septembre 1908 à Otar...,4.0
658,26790,Hiroyuki Nishimoto,acteur.,0.0
659,543812,Charles de Lauzirika,"acteur, réalisateur, producteur et scénariste.",5.0
660,26788,Ginzô Sekiguchi,acteur.,0.0
661,51656,Bino Cicogna,producteur.,0.0
