In [4]:
import requests, csv
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
from collections import OrderedDict
import tqdm


In [3]:
def movie_imdb(imdb_id):

    #SELECT SYNOPSIS, GENRES, RATING, NUMBER RATE, YEAR, TITLE, ORIGINAL TITLE, COUNTRY,
    #SELECT LANGUAGE, BUDGET

    r1 = requests.get("https://www.imdb.com/title/"+imdb_id)
    bs = BeautifulSoup(r1.text, "html.parser")

    # Dictionnary for title, orginal_title, rating, num_rate

    info_dict = bs.find('script', {'type': 'application/ld+json'})
    info_dict = info_dict.text
    info_dict = OrderedDict(eval(info_dict))

    original_title = info_dict["name"]

    if original_title.find("&apos;") != -1:
        original_title = original_title.replace("&apos;", "'")

    if original_title.find("&amp") != -1:
        original_title = original_title.replace("&amp", "&")

    try :
        title = info_dict["alternateName"]
    except KeyError:
        title = info_dict["name"]

    if title.find("&amp") != -1:
        title = title.replace("&amp", "&")

    if title.find("&apos;") != -1:
        title = title.replace("&apos;", "'")

    try :
        rating = info_dict['aggregateRating']['ratingValue']
        num_rate = int(info_dict['aggregateRating']['ratingCount'])
    except KeyError:
        rating = ""
        num_rate = ""

    # Info budget bud_currency

    try:
        info_bud = bs.findAll('section', {'data-testid': 'BoxOffice'})
        info_bud = [text.text for text in info_bud][0]
        price = info_bud.split("EditBudget")[1].split(" ")[0]
        budget = int(price[1:].replace(',',''))
        bud_currency = price[0]
    except (IndexError, ValueError):
        budget = ""
        bud_currency = ""


    # Info YEAR

    year = bs.findAll('a', {'class': re.compile('ipc-link')})
    year = int([text.text for text in year][0].strip("– "))

    # Info genres
    genre_bs = bs.findAll('a', {'class': re.compile('ipc-metadata-list-item')})
    genre_bs = [text.text for text in genre_bs]

    genre_list = ["Action", "Adult", "Adventure", "Animation", "Biography", "Comedy", "Crime", "Documentary",
              "Drama", "Family", "Fantasy", "Film-Noir", "History", "Horror", "Music", "Musical", "Mystery",
              "News", "Reality-TV", "Romance", "Sci-Fi", "Short", "Sport", "Thriller", "War", "Western"]

    genres = [genre for genre in genre_bs if genre in genre_list]

    # Info SYNOPSIS
    try :
        syn = bs.findAll('div', {'class': 'ipc-html-content ipc-html-content--base'})[0]
        syn = [text.text for text in syn]
        synopsis = syn[0].split(" —")[0]
    except IndexError:
        synopsis = ""


    countries = bs.findAll('a', {'href': re.compile('country_of_origin')})
    if len(countries) == 0:
        country = ['None']
    else:
        country = ["USA" if text.text == "United States" else "UK" if text.text == "United Kingdom" \
                    else text.text for text in countries]


    language = bs.findAll('a', {'href': re.compile('primary_language')})
    if len(language) == 0:
        language = ["No Info"]
    else :
        language = [text.text for text in language]


    #SELECT CASTING, DIRECTOR, WRITERS

    r2 = requests.get("https://www.imdb.com/title/"+imdb_id+"/fullcredits")
    bs2 = BeautifulSoup(r2.text, "html.parser")

    #select casting
    cast_list = bs2.find_all('table', {'class': 'cast_list'})
    if cast_list == None or cast_list == []:
        cast = [""]
        cast_code = [""]
        cast_voice = [""]
    else :
        cast_list_test = bs2.find('table', {'class': 'cast_list'}).find_all('td', {'class':'primary_photo'})
        cast_list_char = bs2.find('table', {'class': 'cast_list'}).find_all('td', {'class':'character'})
        cast_char = [text.text.strip().replace(' \n', '') for text in cast_list_char]
        if len(cast_list_test) >= 30 :
            cast = [str(cast_list).split('img alt="')[i].split('" class="')[0] for i in range(1,31)]
            cast_code = [str(cast_list_test).split('/name/')[i].split('/">')[0] for i in range(1,31)]
            cast_voice = ["V" if "(voice)" in text else "U" if "uncredited" in text else "" \
                          for text in cast_char[:30]]
        else :
            cast = [str(cast_list).split('img alt="')[i].split('" class="')[0] \
                    for i in range(1,len(cast_list_test)+1)]
            cast_code = [str(cast_list_test).split('/name/')[i].split('/">')[0] \
                          for i in range(1,len(cast_list_test)+1)]
            cast_voice = ["V" if "(voice)" in text else "U" if "uncredited" in text else "" \
                          for text in cast_char[:len(cast_list_test)]]


    #select director
    test_dir = bs2.find('h4', {'id':'director'})
    if test_dir == None:
        director = [""]
        dir_code = [""]
    else :
        direct = bs2.find('table', {'class': 'simpleTable simpleCreditsTable'}) \
                .find_all('td', {'class': 'name'})
        direct_t = [text.text for text in direct]
        director = [i.strip() for i in direct_t]
        dir_code = [(str(direct).split('href="/name/')[i+1]).split('/"> ')[0] for i in range(len(director))]

    #SELECT RUNTIME

    r3 = requests.get("https://www.imdb.com/title/"+imdb_id+"/technical")
    bs3 = BeautifulSoup(r3.text, "html.parser")

    time = bs3.find('table', {'class': 'dataTable labelValueTable'}).find_all('tr', {'class': 'odd'})
    runtime = [text.text for text in time]
    try:
        if ("min (" in (runtime[0].strip())) and (" min)" in (runtime[0].strip())):
            duration = int((runtime[0].strip().split("min (")[1]).split(" min)")[0])
        elif "hr (" in (runtime[0].strip()):
            duration = int((runtime[0].strip().split("hr (")[1]).split(" min)")[0])
        else :
            duration = int((runtime[0].strip().split("Runtime")[1]).split(" min")[0])
    except (ValueError, IndexError):
        duration = ""


    #SELECT PRODUCTION

    r4 = requests.get("https://www.imdb.com/title/"+imdb_id+"/companycredits")
    bs4 = BeautifulSoup(r4.content, "html.parser")
    test = bs4.find('h4', {'id': 'production'})
    if test == None:
        production = [""]
        prod_code = [""]
    else :
        prod_list = bs4.find('ul', {'class': 'simpleList'}).find_all("li")
        prod_t = [text.text.strip() for text in prod_list]
        production = [pro.split("   ")[0] if "   " in pro else pro for pro in prod_t]

        prod_code = [(str(i).split("pany/")[1]).split('">')[0] for i in prod_list]

    if rating != "":
        return imdb_id, title, original_title, year, director, dir_code, cast, cast_code, cast_voice, genres, duration, \
        country, language, production, prod_code, synopsis, rating, num_rate, budget, bud_currency
    else:
        return imdb_id, title,"","","","","","","","","","","","","","","","","",""

In [5]:
df = pd.read_csv('movies_to_scrap/scrap_it.csv')

In [6]:
imdb_id = df["imdb"].values

In [5]:
db_df = pd.read_csv('db_backup/movies_db.csv')

In [6]:
imdb_db = db_df["imdb_id"].values

In [7]:
final_imdb = [i for i in imdb_id if i not in imdb_db]

In [8]:
final_imdb

['tt16579354', 'tt6013920', 'tt11286314']

In [8]:
movies = []

In [9]:
for movie in tqdm.tqdm(imdb_id):
    movies.append(movie_imdb(movie))

100%|██████████| 18/18 [01:02<00:00,  3.45s/it]


In [10]:
movies

[('tt13723064',
  'Tous mes amis sont morts',
  'Wszyscy moi przyjaciele nie zyja',
  2020,
  ['Jan Belcl'],
  ['nm5282441'],
  ['Michal Meyer',
   'Adam Woronowicz',
   'Julia Wieniawa-Narkiewicz',
   'Adam Turczyk',
   'Nikodem Rozbicki',
   'Monika Krzywkowska',
   'Szymon Roszak',
   'Michal Sikorski',
   'Adam Bobik',
   'Tomasz Karolak',
   'Mateusz Wieclawek',
   'Yassine Fadel',
   'Bartlomiej Firlet',
   'Wojciech Lozowski',
   'Aleksandra Pisula',
   'Paulina Galazka',
   'Magdalena Perlinska',
   'Konrad Zygadlo',
   'Katarzyna Chojnacka',
   'Kamil Piotrowski',
   'Dominika Sakowicz',
   'Barbara Garstka',
   'Mattia Rosinski',
   'Jaroslaw Boberek',
   'Ewa Kania',
   'Elie Rosinski',
   'Ewa Grygo',
   'Izabela Grygo',
   'Rafal Rosiak',
   'Maciej Dabrowski'],
  ['nm3130314',
   'nm0941575',
   'nm8709489',
   'nm8432916',
   'nm5279667',
   'nm0473102',
   'nm6379052',
   'nm9300957',
   'nm5216205',
   'nm0439878',
   'nm5851110',
   'nm4475248',
   'nm3654338',
   'nm

In [11]:
movie_df = pd.DataFrame(movies)

In [12]:
movie_keep = movie_df.drop(movie_df[movie_df[3] == ""].index)
too_soon = pd.DataFrame(movie_df[[0,1]].drop(movie_df[movie_df[3] != ""].index).values, \
            columns= ['imdb_id', 'title'])

In [164]:
if len(too_soon) > 0:
    f_scrap = pd.read_csv('movies_to_scrap/too_soon.csv').set_index("imdb_id")
    f_scrap_df = pd.DataFrame(f_scrap, columns= ['imdb_id', 'title'])
    too_soon = pd.concat([f_scrap_df, too_soon]).set_index("imdb_id")
    too_soon.to_csv('movies_to_scrap/too_soon.csv')

In [13]:
movie_keep

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,tt13723064,Tous mes amis sont morts,Wszyscy moi przyjaciele nie zyja,2020,[Jan Belcl],[nm5282441],"[Michal Meyer, Adam Woronowicz, Julia Wieniawa...","[nm3130314, nm0941575, nm8709489, nm8432916, n...","[, , , , , , , , , V, , , , , , , , , , , , , ...","[Comedy, Crime, Drama]",96,[Poland],"[Polish, French]",[Aurum Film],[co0517033],During the New Year's Eve party of a bunch of ...,5.8,5921,,
1,tt8493970,L'ennemi,L'ennemi,2020,[Stephan Streker],[nm1437690],"[Jérémie Renier, Alma Jodorowsky, Emmanuelle B...","[nm0753737, nm2300160, nm0073384, nm9029751, n...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[Crime, Drama]",105,"[Belgium, France, Luxembourg]","[French, Flemish]","[Daylight Films, Formosa Productions, BAC Ciné...","[co0592341, co0097625, co0847566, co0011677, c...",Prominent politician Louis Durieux is accused ...,6.7,81,,
2,tt10986222,Nos âmes d'enfants,C'mon C'mon,2021,[Mike Mills],[nm0590122],"[Joaquin Phoenix, Gaby Hoffmann, Woody Norman,...","[nm0001618, nm0000451, nm7035760, nm1058940, n...","[, , , , , , , , , , , , , , , , , , , , , , ,...",[Drama],109,[USA],[English],"[A24, Be Funny When You Can]","[co0390816, co0879142]",When his sister asks him to look after her son...,7.6,12610,,
3,tt11956066,Souterrain,Souterrain,2020,[Sophie Dupuis],[nm2876079],"[Joakim Robillard, Jean-François Boudreau, Gui...","[nm8873841, nm1047757, nm2280428, nm0284785, n...","[, , , , , , , , , , , , ]",[Drama],97,[Canada],[French],"[Bravo Charlie, Bord Cadre Films, SODEC, Sover...","[co0664701, co0149598, co0299307, co0137361, c...",Maxime is a miner with a troubled past. After ...,7.1,208,,
4,tt11087960,Un monde,Un monde,2021,[Laura Wandel],[nm3218491],"[Maya Vanderbeque, Günter Duret, Lena Girard V...","[nm11042810, nm10108593, nm9708142, nm6497350,...","[, , , , , , , , , , , , , , , , , , , V]",[Drama],72,[Belgium],[French],"[Dragons Films, Lunanime, Centre du Cinéma et ...","[co0211102, co0310366, co0009031, co0109106, c...",When Nora witnesses Abel being bullied by othe...,7.3,847,1000000.0,€
5,tt15067124,Municipale,Municipale,2021,[Thomas Paulot],[nm12766683],"[Laurent Papot, Milan Alfonsi, Ferdinand Flame]","[nm2099990, nm12766685, nm12766684]","[, , ]",[Drama],110,[France],[French],"[L'Heure d'été, Indéfilms 9, Région Grand Est,...","[co0893536, co0868170, co0627173, co0090770, c...","The small town of Revin, in the French Ardenne...",7.0,8,,
6,tt13553662,Presque,Presque,2021,"[Bernard Campan, Alexandre Jollien]","[nm0132064, nm4056825]","[Bernard Campan, Alexandre Jollien, Tiphaine D...","[nm0132064, nm4056825, nm4401484, nm0712208, n...","[, , , , , , , , , , , , , , , , , , , , , , ,...",[Comedy],92,"[France, Switzerland]",[French],"[Pan Européenne, France 3 Cinéma, Apollo Films...","[co0060537, co0044140, co0678648, co0023544, c...","Two men, why vastly different life stories and...",7.4,70,,
7,tt13492256,Adieu Paris,Adieu Paris,2021,[Edouard Baer],[nm0046347],"[Pierre Arditi, Jackie Berroyer, Gérard Daguer...","[nm0034079, nm0077449, nm1065025, nm1460782, n...","[, , , , , , , , , , , , , , , , , , , , , , , ]",[Comedy],96,"[France, Belgium]",[French],"[Cinéfrance Studios, Les Productions en Cabine...","[co0726043, co0073012, co0230609, co0036163, c...",Eight men in an old Parisian bar. They were th...,5.2,34,,
8,tt13846472,Les promesses,Les promesses,2021,[Thomas Kruithof],[nm5535941],"[Isabelle Huppert, Reda Kateb, Naidra Ayadi, J...","[nm0001376, nm3024530, nm1929699, nm0096195, n...","[, , , , , , , , , , , , , , , , , , , , , , ,...",[Drama],98,[France],[French],"[2425 Films, Wild Bunch, France 2 Cinéma, Les ...","[co0358196, co0024845, co0056755, co0046236, c...",A story that revolves around Clemence who's in...,6.1,167,,
9,tt11299998,Sunless Shadows,Sunless Shadows,2019,[Mehrdad Oskouei],[nm1964867],[],[],[],[Documentary],74,"[Iran, Norway]",[Persian],"[Film E-Emrooz, Indie Film as, Oskouei Film Pr...","[co0051478, co0252212, co0402128]","In an Iranian juvenile detention center, a gro...",7.4,130,,


## Lists of data

In [38]:
    imdb = list(movie_keep[0].values)
    title = list(movie_keep[1].values)
    original_title = list(movie_keep[2].values)
    year = list(movie_keep[3].values)
    director = list(movie_keep[4].values)
    dir_code = list(movie_keep[5].values)
    actor = list(movie_keep[6].values)
    actor_code = list(movie_keep[7].values)
    actor_status = list(movie_keep[8].values)
    genres = list(movie_keep[9].values)
    duration = list(movie_keep[10].values)
    country = list(movie_keep[11].values)
    language = list(movie_keep[12].values)
    production = list(movie_keep[13].values)
    prod_code = list(movie_keep[14].values)
    synopsis = list(movie_keep[15].values)
    rating = list(movie_keep[16].values)
    num_rate = list(movie_keep[17].values)
    budget = list(movie_keep[18].values)
    bud_currency = list(movie_keep[19].values)
    saw = ["" for i in range(len(imdb))]
    db_saw = [False for i in range(len(imdb))]

## Update country_sql

In [431]:
state_df = pd.read_csv('db_backup/country_db.csv')

In [432]:
code_state = list(state_df.country_id.values)
countries = list(state_df.name.values)

In [433]:
max_country_id = max(code_state)

In [434]:
update_state = []
update_state_code = []

In [435]:
for i in range(len(country)):
    for j in range(len(country[i])):
        if country[i][j] not in countries and country[i][j] != "":
            update_state.append(country[i][j])
            update_state_code.append(max_country_id + 1)
            max_country_id += 1

In [436]:
state_data = {"country_id": update_state_code, "name": update_state}

In [437]:
up_country = pd.DataFrame(state_data).set_index("country_id")
country_df = pd.concat([state_df, up_country])

In [143]:
for i in range(len(category)):    
    for j in range(len(imdb)):
        for j in range(len(category[k][i])):
            if ids == imdb[i]:
                temp_cat.append(category[k][i][j])
            elif ids != imdb[i]:
                dict_cat[ids] = " | ".join(temp_cat)
                ids = imdb[i]
                temp_cat = []
                temp_cat.append(category[k][i][j])
        dict_cat[ids] = " | ".join(temp_cat)

In [232]:
country_code = [int(country_df.country_id.values[list(country_df.name.values).index(country[i][j])]) \
                for i in range(len(country)) for j in range(len(country[i]))]

In [29]:
if len(up_country) > 0:
    up_country.to_csv("update_db/up_country.csv")

## Update language_sql

In [233]:
lan_df = pd.read_csv('db_backup/language_db.csv')

In [234]:
code_lan = list(lan_df.language_id.values)
lan = list(lan_df.name.values)

In [235]:
max_language_id = max(code_lan)

In [236]:
update_lan = []
update_lan_code = []

In [237]:
for i in range(len(language)):
    for j in range(len(language[i])):
        if language[i][j] not in lan and language[i][j] != "":
            update_lan.append(language[i][j])
            update_lan_code.append(max_language_id + 1)
            max_language_id += 1

In [238]:
lan_data = {"language_id": update_lan_code, "name": update_lan}

In [239]:
up_language = pd.DataFrame(lan_data)

In [240]:
language_df = pd.concat([lan_df, up_language])

In [241]:
language_code = [int(language_df.language_id.values[list(language_df.name.values).index(language[i][j])]) \
                for i in range(len(language)) for j in range(len(language[i]))]

In [184]:
up_language

Unnamed: 0,language_id,name


In [70]:
if len(up_language) > 0:
    up_language.to_csv("update_db/up_language.csv")

## Creation of genre_code, country_code and language_code for data

In [327]:
genre_df = pd.read_csv("db_backup/genre_db.csv")
genre_code = [genre_df.genre_id.values[list(genre_df.name.values).index(genres[i][j])] \
            for i in range(len(genres)) for j in range(len(genres[i]))]

## Make change of currencies

In [15]:
changer = pd.read_excel("currency_change.xlsx")

In [17]:
changer = {"$":1, "€":1.17, "£":1.37}
def_budget = [changer[bud_currency[i]]*budget[i] if budget[i] != "" else "" for i in range(len(budget))]

KeyError: ''

## Update selection_film

#### List to string for directors, actors, genres, countries, language, production

In [39]:
dict_dir = {}
dict_cast = {}
dict_genre = {}
dict_country = {}
dict_language = {}
dict_prod = {}

In [40]:
temp_dir = []
temp_cast = []
temp_genre = []
temp_country = []
temp_language = []
temp_prod = []

In [41]:
ids = imdb[0]

In [42]:
for i in range(len(imdb)):
    for j in range(len(director[i])):
        if ids == imdb[i]:
            temp_dir.append(director[i][j]) 
        elif ids != imdb[i]:
            dict_dir[ids] = " | ".join(temp_dir)
            ids = imdb[i]
            temp_dir = []
            temp_dir.append(director[i][j])
    dict_dir[ids] = " | ".join(temp_dir)

In [43]:
director

[['Jan Belcl'],
 ['Stephan Streker'],
 ['Mike Mills'],
 ['Sophie Dupuis'],
 ['Laura Wandel'],
 ['Thomas Paulot'],
 ['Bernard Campan', 'Alexandre Jollien'],
 ['Edouard Baer'],
 ['Thomas Kruithof'],
 ['Mehrdad Oskouei'],
 ['Sandrine Kiberlain'],
 ['Andreas Koefoed'],
 ['Markos Gastin'],
 ['Rithy Panh'],
 ['Kenji Nagasaki']]

In [44]:
ids = imdb[0]

In [45]:
for i in range(len(imdb)):
    for j in range(len(actor[i])):
        if ids == imdb[i]:
            temp_cast.append(actor[i][j])
        elif ids != imdb[i]:
            dict_cast[ids] = " | ".join(temp_cast)
            ids = imdb[i]
            temp_cast = []
            temp_cast.append(actor[i][j])
    dict_cast[ids] = " | ".join(temp_cast)

In [46]:
ids = imdb[0]

In [47]:
for i in range(len(imdb)):
    for j in range(len(genres[i])):
        if ids == imdb[i]:
            temp_genre.append(genres[i][j])
        elif ids != imdb[i]:
            dict_genre[ids] = " | ".join(temp_genre)
            ids = imdb[i]
            temp_genre = []
            temp_genre.append(genres[i][j])
    dict_genre[ids] = " | ".join(temp_genre)

In [48]:
ids = imdb[0]

In [49]:
country

[['Poland'],
 ['Belgium', 'France', 'Luxembourg'],
 ['USA'],
 ['Canada'],
 ['Belgium'],
 ['France'],
 ['France', 'Switzerland'],
 ['France', 'Belgium'],
 ['France'],
 ['Iran', 'Norway'],
 ['France'],
 ['Denmark', 'France'],
 ['Greece', 'France'],
 ['Cambodia', 'France'],
 ['Japan']]

In [50]:
for i in range(len(imdb)):
    for j in range(len(country[i])):
        if ids == imdb[i]:
            temp_country.append(country[i][j])
        elif ids != imdb[i]:
            dict_country[ids] = " | ".join(temp_country)
            ids = imdb[i]
            temp_country = []
            temp_country.append(country[i][j])
    dict_country[ids] = " | ".join(temp_country)

In [51]:
ids = imdb[0]

In [52]:
for i in range(len(imdb)):
    for j in range(len(language[i])):
        if ids == imdb[i]:
            temp_language.append(language[i][j])
        elif ids != imdb[i]:
            dict_language[ids] = " | ".join(temp_language)
            ids = imdb[i]
            temp_language = []
            temp_language.append(language[i][j])
    dict_language[ids] = " | ".join(temp_language)

In [53]:
ids = imdb[0]

In [54]:
for i in range(len(imdb)):
    for j in range(len(production[i])):
        if ids == imdb[i]:
            temp_prod.append(production[i][j])
        elif ids != imdb[i]:
            dict_prod[ids] = " | ".join(temp_prod)
            ids = imdb[i]
            temp_prod = []
            temp_prod.append(production[i][j])
    dict_prod[ids] = " | ".join(temp_prod)

#### Put data in a dataframe

In [56]:
select_data = {"imdb_id": imdb, "vu": saw, "à voir": saw, "title": title, "original_title": original_title, \
               "year": year, "director": list(dict_dir.values()), "casting": list(dict_cast.values()), \
               "genres": list(dict_genre.values()), "duration": duration, \
               "country": list(dict_country.values()), "language": list(dict_language.values()), \
               "production": list(dict_prod.values()), "synopsis": synopsis, "rating": rating, \
               "num_rate": num_rate}

In [57]:
select = pd.DataFrame(select_data).set_index("imdb_id")

In [58]:
select

Unnamed: 0_level_0,vu,à voir,title,original_title,year,director,casting,genres,duration,country,language,production,synopsis,rating,num_rate
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
tt13723064,,,Tous mes amis sont morts,Wszyscy moi przyjaciele nie zyja,2020,Jan Belcl,Michal Meyer | Adam Woronowicz | Julia Wieniaw...,Comedy | Crime | Drama,96,Poland,Polish | French,Aurum Film,During the New Year's Eve party of a bunch of ...,5.8,5921
tt8493970,,,L'ennemi,L'ennemi,2020,Stephan Streker,Jérémie Renier | Alma Jodorowsky | Emmanuelle ...,Crime | Drama,105,Belgium | France | Luxembourg,French | Flemish,Daylight Films | Formosa Productions | BAC Cin...,Prominent politician Louis Durieux is accused ...,6.7,81
tt10986222,,,Nos âmes d'enfants,C'mon C'mon,2021,Mike Mills,Joaquin Phoenix | Gaby Hoffmann | Woody Norman...,Drama,109,USA,English,A24 | Be Funny When You Can,When his sister asks him to look after her son...,7.6,12610
tt11956066,,,Souterrain,Souterrain,2020,Sophie Dupuis,Joakim Robillard | Jean-François Boudreau | Gu...,Drama,97,Canada,French,Bravo Charlie | Bord Cadre Films | SODEC | Sov...,Maxime is a miner with a troubled past. After ...,7.1,208
tt11087960,,,Un monde,Un monde,2021,Laura Wandel,Maya Vanderbeque | Günter Duret | Lena Girard ...,Drama,72,Belgium,French,Dragons Films | Lunanime | Centre du Cinéma et...,When Nora witnesses Abel being bullied by othe...,7.3,847
tt15067124,,,Municipale,Municipale,2021,Thomas Paulot,Laurent Papot | Milan Alfonsi | Ferdinand Flame,Drama,110,France,French,L'Heure d'été | Indéfilms 9 | Région Grand Est...,"The small town of Revin, in the French Ardenne...",7.0,8
tt13553662,,,Presque,Presque,2021,Bernard Campan | Alexandre Jollien,Bernard Campan | Alexandre Jollien | Tiphaine ...,Comedy,92,France | Switzerland,French,Pan Européenne | France 3 Cinéma | Apollo Film...,"Two men, why vastly different life stories and...",7.4,70
tt13492256,,,Adieu Paris,Adieu Paris,2021,Edouard Baer,Pierre Arditi | Jackie Berroyer | Gérard Dague...,Comedy,96,France | Belgium,French,Cinéfrance Studios | Les Productions en Cabine...,Eight men in an old Parisian bar. They were th...,5.2,34
tt13846472,,,Les promesses,Les promesses,2021,Thomas Kruithof,Isabelle Huppert | Reda Kateb | Naidra Ayadi |...,Drama,98,France,French,2425 Films | Wild Bunch | France 2 Cinéma | Le...,A story that revolves around Clemence who's in...,6.1,167
tt11299998,,,Sunless Shadows,Sunless Shadows,2019,Mehrdad Oskouei,,Documentary,74,Iran | Norway,Persian,Film E-Emrooz | Indie Film as | Oskouei Film P...,"In an Iranian juvenile detention center, a gro...",7.4,130


In [59]:
select.to_csv("movies_scraped.csv")

## Update movie_sql

In [60]:
movie_data = {"imdb_id": imdb, "title": title, "original_title": original_title, "year": year, \
               "duration": duration, "rating": rating, "num_rate": num_rate, "budget": def_budget, \
               "synopsis": synopsis}

In [61]:
up_movie = pd.DataFrame(movie_data).set_index("imdb_id")

In [62]:
up_movie.to_csv("update_sql/up_movies.csv")

## Update user_db

In [None]:
user_data = {"imdb_id": imdb, "saw": db_saw, "wishlist": db_saw}

In [None]:
up_user = pd.DataFrame(user_data).set_index("imdb_id")

In [None]:
up_user.to_csv("update_db/up_user.csv")

## Update actor_sql

In [63]:
actor_df = pd.read_csv('new_sql/actor_sql.csv')

In [64]:
code_actor = list(actor_df.code.values)

In [65]:
update_cast_code = []
update_cast = []

In [66]:
for i in range(len(cast_code)): 
    for j in range(len(cast_code[i])):
        if cast_code[i][j] not in code_actor:
            update_cast_code.append(cast_code[i][j])
            update_cast.append(cast[i][j])
            code_actor.append(cast_code[i][j])

In [67]:
actor_data = {"actor_id": update_cast_code, "name": update_cast}

In [68]:
up_actor = pd.DataFrame(actor_data)

In [69]:
up_actor = up_actor.drop(up_actor[up_actor["actor_id"] == ""].index).set_index("actor_id")

In [70]:
if len(update_cast) > 0:
    up_actor.to_csv("update_sql/up_actor.csv")

## Update director_sql

In [71]:
dir_df = pd.read_csv('new_sql/director_sql.csv')

In [72]:
code_dir = list(dir_df.code_name.values)

In [73]:
update_dir_code = []
update_dir = []

In [74]:
for i in range(len(dir_code)): 
    for j in range(len(dir_code[i])):
        if dir_code[i][j] not in code_dir:
            update_dir_code.append(dir_code[i][j])
            update_dir.append(director[i][j])
            code_dir.append(dir_code[i][j])

In [75]:
dir_data = {"director_id": update_dir_code, "name": update_dir}

In [76]:
up_dir = pd.DataFrame(dir_data)

In [77]:
up_dir = up_dir.drop(up_dir[up_dir["director_id"] == ""].index).set_index("director_id")

In [78]:
if len(update_dir) > 0:
    up_dir.to_csv("update_sql/up_dir.csv")

## Update production_sql

In [79]:
prod_df = pd.read_csv('new_sql/production_sql.csv')

In [80]:
code_prod = list(prod_df.prod_code.values)

In [81]:
update_prod_code = []
update_prod = []

In [82]:
for i in range(len(prod_code)): 
    for j in range(len(prod_code[i])):
        if prod_code[i][j] not in code_prod:
            update_prod_code.append(prod_code[i][j])
            update_prod.append(production[i][j])
            code_prod.append(prod_code[i][j])

In [83]:
prod_data = {"prod_id": update_prod_code, "name": update_prod}

In [84]:
up_prod = pd.DataFrame(prod_data)

In [85]:
up_prod = up_prod.drop(up_prod[up_prod["prod_id"] == ""].index).set_index("prod_id")

In [86]:
if len(update_prod) > 0:
    up_prod.to_csv("update_sql/up_prod.csv")

## Update movie_dir_sql

In [87]:
dir_imdb = [imdb[i] for i in range(len(imdb)) for j in range(len(dir_code[i]))]
all_dir = [dir_code[i][j] for i in range(len(imdb)) for j in range(len(dir_code[i]))]

In [88]:
movie_dir_data = {"movie_id": dir_imdb, "director_id": all_dir}

In [89]:
up_movie_dir = pd.DataFrame(movie_dir_data)

In [90]:
up_movie_dir = up_movie_dir.drop(up_movie_dir[up_movie_dir["director_id"] == ""].index).set_index("movie_id")

In [91]:
up_movie_dir.to_csv("update_sql/up_movie_dir.csv")

## Update movie_actor_sql

In [92]:
cast_imdb = [imdb[i] for i in range(len(imdb)) for j in range(len(cast_code[i]))]
all_actor = [cast_code[i][j] for i in range(len(imdb)) for j in range(len(cast_code[i]))]

In [93]:
movie_cast_data = {"movie_id": cast_imdb, "actor_id": all_actor}

In [94]:
up_movie_cast = pd.DataFrame(movie_cast_data)

In [95]:
up_movie_cast = up_movie_cast.drop(up_movie_cast[up_movie_cast["actor_id"] == ""].index).set_index("movie_id")

In [96]:
up_movie_cast.to_csv("update_sql/up_movie_cast.csv")

## Update movie_prod_sql

In [97]:
prod_imdb = [imdb[i] for i in range(len(imdb)) for j in range(len(prod_code[i]))]
all_prod = [prod_code[i][j] for i in range(len(imdb)) for j in range(len(prod_code[i]))]

In [98]:
movie_prod_data = {"movie_id": prod_imdb, "prod_id": all_prod}

In [99]:
up_movie_prod = pd.DataFrame(movie_prod_data)

In [100]:
up_movie_prod = up_movie_prod.drop(up_movie_prod[up_movie_prod["prod_id"] == ""].index).set_index("movie_id")

In [101]:
up_movie_prod.to_csv("update_sql/up_movie_prod.csv")

## Update movie_genre_sql

In [102]:
genre_imdb = [imdb[i] for i in range(len(imdb)) for j in range(len(genres[i]))]

In [103]:
movie_genre_data = {"movie_id": genre_imdb, "genre_id": genre_code}

In [104]:
up_movie_genre = pd.DataFrame(movie_genre_data)

In [105]:
up_movie_genre = up_movie_genre.drop(up_movie_genre[up_movie_genre["genre_id"] == ""].index).set_index("movie_id")

In [106]:
up_movie_genre.to_csv("update_sql/up_movie_genre.csv")

## Update movie_country_sql

In [475]:
country_imdb = [imdb[i] for i in range(len(imdb)) for j in range(len(country[i]))]

In [476]:
country_imdb

['tt6013920', 'tt6013920', 'tt11286314']

In [108]:
movie_country_data = {"movie_id": country_imdb, "country_id": country_code}

In [109]:
up_movie_country = pd.DataFrame(movie_country_data)

In [110]:
up_movie_country = up_movie_country.drop(up_movie_country[up_movie_country["country_id"] == ""].index).set_index("movie_id")

In [111]:
up_movie_country.to_csv("update_sql/up_movie_country.csv")

## Update movie_language_sql

In [112]:
language_imdb = [imdb[i] for i in range(len(imdb)) for j in range(len(language[i]))]

In [113]:
movie_language_data = {"movie_id": language_imdb, "language_id": language_code}

In [114]:
up_movie_language = pd.DataFrame(movie_language_data)

In [115]:
up_movie_language = up_movie_language.drop(up_movie_language[up_movie_language["language_id"] == ""].index).set_index("movie_id")

In [116]:
up_movie_language.to_csv("update_sql/up_movie_language.csv