In [1]:
import requests, csv
from bs4 import BeautifulSoup
import re
import pandas as pd
import tqdm
import numpy as np
from collections import OrderedDict
from datetime import datetime, date

In [2]:
def movie_query(query, date_min = 1900, date_max = int(date.today().year)):
    """Search IMDb for a title and return the IDs of the exact-title matches.

    The query is normalised, sent to IMDb's ``/find`` page, and every result
    whose displayed title equals the query (case-insensitively) and whose
    release year lies within ``[date_min, date_max]`` is kept.

    Parameters
    ----------
    query : str
        Movie title to look up.
    date_min : int, optional
        Earliest accepted release year (inclusive). Defaults to 1900.
    date_max : int, optional
        Latest accepted release year (inclusive). Defaults to the calendar
        year at the time this function was defined.

    Returns
    -------
    tuple of (str, str)
        The normalised query, and the matching IMDb title IDs joined with
        ``", "`` (an empty string when nothing matched).

    Raises
    ------
    requests.exceptions.RequestException
        If the HTTP request fails or exceeds the timeout.
    """
    # Normalise French-style spacing before punctuation and curly apostrophes.
    # Every replacement is applied in turn, so a title that needs several of
    # these fixes gets all of them instead of only the first one that matches.
    query = query.strip()
    for found, wanted in ((" !", "!"), (" ?", "?"), (" :", ":"), ("’", "'")):
        query = query.replace(found, wanted)

    # Let requests build and percent-encode the query string; this covers
    # every reserved character, not just a hand-maintained subset.
    r = requests.get("https://www.imdb.com/find", params={"q": query}, timeout=30)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bs = BeautifulSoup(r.text, "html.parser")

    # NOTE(review): assumes the search page marks each result with
    # <td class="result_text"> — confirm against the markup currently served.
    year_min, year_max = int(date_min), int(date_max)
    target = query.lower()
    ids = []
    for cell in bs.find_all('td', {'class': 'result_text'}):
        link = cell.find('a', href=True)
        if link is None:
            continue

        # Only links to /title/tt... pages are movies; skip anything else.
        title_id = re.search(r'/title/(tt\d+)', link['href'])
        if title_id is None:
            continue

        # get_text() decodes HTML entities, so "&" in a title compares equal
        # to "&" in the query.
        title = link.get_text()
        if title.lower() != target:
            continue

        cell_text = cell.get_text()
        # TV episodes and projects still in development are never accepted.
        if "(TV Episode)" in cell_text or "(in development)" in cell_text:
            continue

        # Look for the year only in the text that follows the title, so that
        # disambiguators such as "(I)" or "(IV)" and labels such as
        # "(TV Series)" are skipped without special-casing each of them.
        details = cell_text[cell_text.find(title) + len(title):]
        year = re.search(r'\((\d{4})(?!\d)', details)
        if year is None:
            # No usable year: skip this result but keep scanning the others.
            continue

        if year_min <= int(year.group(1)) <= year_max:
            ids.append(title_id.group(1))

    return query, ", ".join(ids)

In [3]:
# Load the list of movies to look up; the file must provide a "title" column.
df = pd.read_csv("jan_movies.csv")

In [4]:
# Titles to search for, as a plain Python list.
queries = df["title"].tolist()

In [10]:
queries[:24]

['En Attendant Bojangles',
 '355',
 'Licorice Pizza',
 'Twist À Bamako',
 'Mes frères et moi',
 'Rosy',
 'Luzzu',
 'Residue',
 'Sword Art Online - Progressive - Aria of a Starless Night',
 'Marché noir',
 "J'étais à la maison, mais...",
 'Neige',
 'Guanzhou, une nouvelle ère',
 'Tous les garçons et les filles',
 'Traverser',
 'Le Messager',
 "Une affaire de coeur : La tragédie d'une employée des P.T.T.",
 'RRR',
 'Make Me a Man',
 'Mai 68 au masculin',
 "HK, la plume et l'espoir",
 "L'Homme n'est pas un oiseau",
 'Innocence sans protection',
 'Europe 51']

In [11]:
# Collects the (query, imdb_id) tuples returned by movie_query.
ids = []

In [12]:
# Look up the first 24 titles, accepting releases from 2020 through 2022.
# The list is built in a single assignment so that re-running this cell
# replaces `ids` instead of appending a second copy of every result.
ids = [movie_query(movie, 2020, 2022) for movie in queries[:24]]

In [13]:
ids

[('En Attendant Bojangles', 'tt11310608'),
 ('355', 'tt8356942'),
 ('Licorice Pizza', 'tt11271038'),
 ('Twist À Bamako', 'tt14055038'),
 ('Mes frères et moi', 'tt13582752'),
 ('Rosy', 'tt15153914'),
 ('Luzzu', ''),
 ('Residue', 'tt11568878'),
 ('Sword Art Online - Progressive - Aria of a Starless Night', 'tt13424422'),
 ('Marché noir', 'tt12758660'),
 ("J'étais à la maison, mais...", ''),
 ('Neige', ''),
 ('Guanzhou, une nouvelle ère', ''),
 ('Tous les garçons et les filles', ''),
 ('Traverser', 'tt15550372'),
 ('Le Messager', ''),
 ("Une affaire de coeur: La tragédie d'une employée des P.T.T.", ''),
 ('RRR', 'tt8178634, tt15371730'),
 ('Make Me a Man', ''),
 ('Mai 68 au masculin', 'tt16676456'),
 ("HK, la plume et l'espoir", 'tt16676444'),
 ("L'Homme n'est pas un oiseau", ''),
 ('Innocence sans protection', ''),
 ('Europe 51', '')]

In [14]:
# One row per looked-up title: the normalised title and its IMDb ID string.
# NOTE(review): only the first 17 entries of `ids` are kept although 24 titles
# are queried — confirm the remaining results are meant to be left out.
tot_df = pd.DataFrame(ids[:17], columns=["title", 'imdb'])

In [15]:
tot_df

Unnamed: 0,title,imdb
0,En Attendant Bojangles,tt11310608
1,355,tt8356942
2,Licorice Pizza,tt11271038
3,Twist À Bamako,tt14055038
4,Mes frères et moi,tt13582752
5,Rosy,tt15153914
6,Luzzu,
7,Residue,tt11568878
8,Sword Art Online - Progressive - Aria of a Sta...,tt13424422
9,Marché noir,tt12758660


In [16]:
# filter_1 selects rows with no IMDb ID; filter_2 is its complement.
filter_1 = tot_df["imdb"].eq("")
filter_2 = ~filter_1

In [17]:
# Titles for which no IMDb ID was found, indexed by title.
# `columns=` replaces the positional axis argument, which is deprecated and
# no longer accepted by recent pandas releases.
no_id_df = tot_df[filter_1].drop(columns="imdb").set_index("title")

In [18]:
# Titles whose lookup returned several IDs (joined as "tt..., tt...").
multiple_id = tot_df.loc[lambda frame: frame["imdb"].str.contains(", tt")].set_index("title")

In [19]:
# Titles with exactly one IMDb ID: non-empty and without a ", tt" separator.
# Both conditions are combined into one mask aligned with tot_df (avoiding the
# reindexing warning from chained boolean indexing), and `~` is used because
# it is the supported negation operator for boolean Series.
scrap_df = tot_df[filter_2 & ~tot_df["imdb"].str.contains(", tt")].set_index("title")

  """Entry point for launching an IPython kernel.


In [20]:
no_id_df

Luzzu
"J'étais à la maison, mais..."
Neige
"Guanzhou, une nouvelle ère"
Tous les garçons et les filles
Le Messager
Une affaire de coeur: La tragédie d'une employée des P.T.T.


In [16]:
multiple_id

Unnamed: 0_level_0,imdb
title,Unnamed: 1_level_1


In [17]:
scrap_df

Unnamed: 0_level_0,imdb
title,Unnamed: 1_level_1
Tous en scène 2,tt6467266
Matrix Resurrections,tt10838180
Mince alors 2!,tt14017174
Madeleine Collins,tt11310670
C'est toi que j'attendais,tt12976392
My Kid,tt8135564
White Building,tt11813228
La Croisade,tt13846352
The Cloud in Her Room,tt9628568
L'Odyssée antarctique,tt16579354


In [24]:
# Previously saved single-ID titles, indexed by title.
good = pd.read_csv("scrap_it.csv", index_col="title")

In [25]:
good

Unnamed: 0_level_0,imdb
title,Unnamed: 1_level_1


In [26]:
# Merge the saved single-ID titles with the ones found in this session.
# Exact duplicate (title, imdb) rows are dropped so that re-running this cell
# or the whole notebook does not keep growing the saved file with repeats.
scrap_df = (
    pd.concat([good, scrap_df])
    .reset_index()
    .drop_duplicates()
    .set_index("title")
)

In [27]:
# Write the merged single-ID titles back to the file they were loaded from.
scrap_df.to_csv("scrap_it.csv")

In [28]:
# Previously saved titles that had no IMDb match, indexed by title.
nope = pd.read_csv("future_scrap.csv", index_col="title")

In [29]:
# Merge the saved unmatched titles with the ones from this session.
# Repeated titles are dropped so that re-running this cell or the whole
# notebook does not keep growing the saved file with repeats.
no_id_df = (
    pd.concat([nope, no_id_df])
    .reset_index()
    .drop_duplicates()
    .set_index("title")
)

In [30]:
# Write the merged unmatched titles back to the file they were loaded from.
no_id_df.to_csv("future_scrap.csv")

In [31]:
# Previously saved titles that matched several IMDb IDs, indexed by title.
bad = pd.read_csv("multiple_id.csv", index_col="title")

In [32]:
# Merge the saved ambiguous titles with the ones from this session.
# Exact duplicate (title, imdb) rows are dropped so that re-running this cell
# or the whole notebook does not keep growing the saved file with repeats.
multiple_id = (
    pd.concat([bad, multiple_id])
    .reset_index()
    .drop_duplicates()
    .set_index("title")
)

In [33]:
# Write the merged ambiguous titles back to the file they were loaded from.
multiple_id.to_csv("multiple_id.csv")