In [3]:
import csv
import html
import re
from collections import OrderedDict
from datetime import date, datetime

import numpy as np
import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

In [11]:
# Endpoint of IMDb's search page; the query text is sent through `params`.
_IMDB_FIND_URL = "https://www.imdb.com/find"
# Seconds to wait for IMDb before giving up instead of blocking the notebook forever.
_IMDB_TIMEOUT = 10
# Typographic variants folded into the form that is compared against IMDb titles.
_QUERY_REPLACEMENTS = ((" !", "!"), (" ?", "?"), (" :", ":"), ("’", "'"))
# Result rows carrying one of these labels are never returned.
_SKIPPED_LABELS = ("(TV Episode)", "(TV Series)", "(in development)")
# First title link of a result cell: captures the IMDb id and the link text.
_TITLE_LINK = re.compile(r'href="/title/(tt\d+)/[^"]*"[^>]*>(.*?)</a>', re.DOTALL)
# A parenthesised four-digit year such as "(2021)".
_YEAR = re.compile(r"\((\d{4})\)")


def _normalize_query(query):
    """Strip the query and apply every replacement in _QUERY_REPLACEMENTS."""
    query = query.strip()
    for old, new in _QUERY_REPLACEMENTS:
        query = query.replace(old, new)
    return query


def _parse_result(cell_html):
    """Return (imdb_id, title, year) for one result cell, or None if the row is unusable."""
    link = _TITLE_LINK.search(cell_html)
    if link is None:
        return None
    if any(label in cell_html for label in _SKIPPED_LABELS):
        return None
    # Search after the link only, so digits inside the title itself are ignored and
    # disambiguation markers such as "(I)" or "(IV)" are simply stepped over.
    year = _YEAR.search(cell_html, link.end())
    if year is None:
        return None
    # The serialized cell escapes characters such as "&"; undo that before comparing.
    return link.group(1), html.unescape(link.group(2)), int(year.group(1))


def movie_query(query, date_min = 1900, date_max = int(date.today().year)):
    """Search IMDb for `query` and return the ids of exact-title matches.

    Parameters
    ----------
    query : str
        Title to look for. It is stripped, and " !", " ?", " :" and the
        typographic apostrophe are normalised before the search.
    date_min, date_max : int or str
        Inclusive range of release years to accept. The `date_max` default is
        the current year at the time this function was defined.

    Returns
    -------
    tuple of (str, str)
        The normalised query, and the matching IMDb ids joined with ", "
        (an empty string when nothing matches).

    Notes
    -----
    A row is kept when its title equals the query (case-insensitive) and its
    year lies in the range. Rows labelled "(TV Episode)", "(TV Series)" or
    "(in development)", and rows with no parsable title link or year, are
    skipped one by one, so a single odd row no longer discards the rows after it.
    The HTTP status is not checked: an error page yields no matches.
    """
    query = _normalize_query(query)

    # Let requests build the query string so every reserved character is escaped.
    response = requests.get(_IMDB_FIND_URL, params={"q": query}, timeout=_IMDB_TIMEOUT)
    # Name the parser so the result does not depend on which parsers are installed.
    soup = BeautifulSoup(response.text, "html.parser")

    first_year, last_year = int(date_min), int(date_max)
    wanted = query.lower()

    ids = []
    for cell in soup.find_all('td', {'class': 'result_text'}):
        parsed = _parse_result(str(cell))
        if parsed is None:
            continue
        imdb_id, title, year = parsed
        if title.lower() == wanted and first_year <= year <= last_year:
            ids.append(imdb_id)

    return query, ", ".join(ids)

In [5]:
# Load the release list for this batch; the following cells read its `title` column.
df = pd.read_csv("feb_movies.csv")

In [6]:
# Plain Python list of the titles to search, in file order.
queries = df["title"].tolist()

In [7]:
# Display the titles that will be searched.
queries

['After Blue (Paradis sale)',
 'Un autre monde',
 'Maison de Retraite',
 'La Vraie famille',
 'Uncharted',
 'The Power',
 'Nous',
 'Hopper et le hamster des ténèbres',
 'Piccolo corpo',
 'King',
 'La Jungle est mon jardin',
 'La Mouette et le chat',
 'Les Affluents',
 'Media Crash - qui a tué le débat public ?',
 'Tueurs de dames',
 'La Nuit des femmes',
 'Maternité éternelle',
 "La Lune s'est levée",
 'Mademoiselle Ogin',
 'La Princesse errante',
 "Lettre d'amour",
 'Aaraattu',
 'Maigret',
 'Compagnons',
 'Ils sont vivants',
 'Le Chêne',
 'Selon La Police',
 'La Légende du roi crabe',
 'Les Poings desserrés',
 'Zaï Zaï Zaï Zaï',
 'Blacklight',
 'Le Parrain',
 'A nos enfants',
 'Sous le ciel de Koutaïssi',
 'Un peuple',
 'Valimai',
 'Visages de femmes',
 'Pas pareil et pourtant',
 'La Nature',
 "Les Graines que l'on sème",
 'Une blonde émoustillante',
 'Trains étroitement surveillés',
 'Alouettes, le fil à la patte',
 'Cluny Brown (La Folle ingénue)']

In [12]:
# Accumulator for the (title, imdb ids) tuples returned by movie_query.
ids = []

In [13]:
# Start from an empty list every time so that re-running this cell replaces the
# results instead of stacking a second copy of every row onto `ids`.
# A plain loop (rather than a comprehension) keeps the rows already fetched
# available in `ids` if a request fails part-way through.
ids = []
for movie in queries[:23]:
    ids.append(movie_query(movie, 2019, 2022))

In [14]:
# Display the scraped (title, imdb ids) pairs; an empty string means no match was found.
ids

[('After Blue (Paradis sale)', 'tt10243676'),
 ('Un autre monde', 'tt15115102, tt9784586, tt13900840'),
 ('Maison de Retraite', 'tt12847680'),
 ('La Vraie famille', 'tt13282344'),
 ('Uncharted', 'tt1464335'),
 ('The Power', 'tt9314984'),
 ('Nous', 'tt14038600'),
 ('Hopper et le hamster des ténèbres', 'tt12532368'),
 ('Piccolo corpo', 'tt13926310'),
 ('King', 'tt14073780'),
 ('La Jungle est mon jardin', 'tt17093164'),
 ('La Mouette et le chat', ''),
 ('Les Affluents', 'tt12474932'),
 ('Media Crash - qui a tué le débat public?', ''),
 ('Tueurs de dames', ''),
 ('La Nuit des femmes', ''),
 ('Maternité éternelle', ''),
 ("La Lune s'est levée", ''),
 ('Mademoiselle Ogin', ''),
 ('La Princesse errante', ''),
 ("Lettre d'amour", ''),
 ('Aaraattu', 'tt13468976'),
 ('Maigret', '')]

In [15]:
# Tabulate the (title, imdb) pairs returned by movie_query.
# NOTE(review): only the first 22 pairs are kept although 23 titles were queried,
# so the last scraped title never reaches any of the CSV files — confirm this is intended.
tot_df = pd.DataFrame(ids[:22], columns=["title", 'imdb'])

In [16]:
# Display the combined table of titles and IMDb ids.
tot_df

Unnamed: 0,title,imdb
0,After Blue (Paradis sale),tt10243676
1,Un autre monde,"tt15115102, tt9784586, tt13900840"
2,Maison de Retraite,tt12847680
3,La Vraie famille,tt13282344
4,Uncharted,tt1464335
5,The Power,tt9314984
6,Nous,tt14038600
7,Hopper et le hamster des ténèbres,tt12532368
8,Piccolo corpo,tt13926310
9,King,tt14073780


In [17]:
# Row masks over tot_df, keyed on whether the imdb column is the empty string.
filter_1 = tot_df["imdb"].eq("")  # no id found
filter_2 = tot_df["imdb"].ne("")  # at least one id found

In [18]:
# Titles without any IMDb id, indexed by title (the empty imdb column is dropped).
# `columns=` replaces the positional axis argument, which pandas deprecated and
# no longer accepts in 2.x.
no_id_df = tot_df[filter_1].drop(columns="imdb").set_index("title")

  """Entry point for launching an IPython kernel.


In [19]:
# Titles whose imdb cell holds several ids (joined with ", "), indexed by title.
multiple_id = (
    tot_df
    .loc[lambda frame: frame["imdb"].str.contains(", tt")]
    .set_index("title")
)

In [20]:
# Titles with exactly one IMDb id, indexed by title.
# Both conditions are combined into a single mask on tot_df: the previous form
# filtered first and then indexed the result with a mask built on the unfiltered
# frame, and negated that mask with unary minus instead of `~`.
scrap_df = (
    tot_df
    .loc[filter_2 & ~tot_df["imdb"].str.contains(", tt")]
    .set_index("title")
)

  """Entry point for launching an IPython kernel.


In [21]:
# Display the titles for which no IMDb id was found.
no_id_df

La Mouette et le chat
Media Crash - qui a tué le débat public?
Tueurs de dames
La Nuit des femmes
Maternité éternelle
La Lune s'est levée
Mademoiselle Ogin
La Princesse errante
Lettre d'amour


In [22]:
# Display the titles that matched several IMDb ids.
multiple_id

Unnamed: 0_level_0,imdb
title,Unnamed: 1_level_1
Un autre monde,"tt15115102, tt9784586, tt13900840"


In [23]:
# Display the titles that matched exactly one IMDb id.
scrap_df

Unnamed: 0_level_0,imdb
title,Unnamed: 1_level_1
After Blue (Paradis sale),tt10243676
Maison de Retraite,tt12847680
La Vraie famille,tt13282344
Uncharted,tt1464335
The Power,tt9314984
Nous,tt14038600
Hopper et le hamster des ténèbres,tt12532368
Piccolo corpo,tt13926310
King,tt14073780
La Jungle est mon jardin,tt17093164


In [24]:
# Previously saved single-id titles, loaded with the title column as index.
good = pd.read_csv("scrap_it.csv", index_col="title")

In [25]:
# Display the rows already stored in scrap_it.csv.
good

Unnamed: 0_level_0,imdb
title,Unnamed: 1_level_1
Vengeance aux poings,tt14158554


In [26]:
# Append this run's single-id titles to the rows loaded from scrap_it.csv.
# Exact (title, imdb) duplicates are dropped so that re-running the notebook,
# or this cell alone, does not keep adding the same rows to the saved file.
scrap_df = (
    pd.concat([good, scrap_df])
    .reset_index()
    .drop_duplicates()
    .set_index("title")
)

In [27]:
# Write scrap_df to scrap_it.csv, overwriting the file (title is saved as the index column).
scrap_df.to_csv("scrap_it.csv")

In [28]:
# Previously saved titles without an IMDb id, loaded with the title column as index.
nope = pd.read_csv("future_scrap.csv", index_col="title")

In [29]:
# Append this run's unmatched titles to the rows loaded from future_scrap.csv.
# Exact duplicates are dropped so that re-running the notebook, or this cell
# alone, does not keep adding the same titles to the saved file.
no_id_df = (
    pd.concat([nope, no_id_df])
    .reset_index()
    .drop_duplicates()
    .set_index("title")
)

In [30]:
# Write no_id_df to future_scrap.csv, overwriting the file (title is saved as the index column).
no_id_df.to_csv("future_scrap.csv")

In [31]:
# Previously saved titles with several candidate ids, loaded with the title column as index.
bad = pd.read_csv("multiple_id.csv", index_col="title")

In [32]:
# Append this run's ambiguous titles to the rows loaded from multiple_id.csv.
# Exact (title, imdb) duplicates are dropped so that re-running the notebook,
# or this cell alone, does not keep adding the same rows to the saved file.
multiple_id = (
    pd.concat([bad, multiple_id])
    .reset_index()
    .drop_duplicates()
    .set_index("title")
)

In [33]:
# Write multiple_id to multiple_id.csv, overwriting the file (title is saved as the index column).
multiple_id.to_csv("multiple_id.csv")