### Search for movie information on IMDb via the OMDb API
##### code based on https://goo.gl/Y1W3C8

In [None]:
import urllib.request, urllib.parse, urllib.error
import json
import csv
import pandas as pd
from itertools import islice
from time import sleep, time
from random import randint
import warnings

def print_json(json_data):
    """Print a fixed selection of fields from a movie record.

    The output is framed by two 50-dash rules. Between them, each of
    ``imdbID``, ``Title``, ``Year``, ``Released``, ``Director`` and
    ``Actors`` is printed as ``key: value``, in that order, but only when
    the key is present in ``json_data``.
    """
    rule = "-" * 50
    available = json_data.keys()

    print(rule)
    for field in ('imdbID', 'Title', 'Year', 'Released', 'Director', 'Actors'):
        if field in available:
            print(f"{field}: {json_data[field]}")
    print(rule)

def search_movie(title):
    """Look up a movie by title through the OMDb API.

    The API key is read from the ``OMDBapi`` entry of ``APIkeys.json``
    (relative to the current working directory) on every call.

    Args:
        title: Movie title, sent as the ``t`` query parameter.

    Returns:
        The decoded JSON response (a dict) when its ``Response`` field is
        ``'True'``. ``None`` when the API reports an error, the HTTP request
        fails, or the response body cannot be decoded as JSON; a message is
        printed in each of those cases.
    """
    with open('APIkeys.json') as f:
        omdbapi = json.load(f)['OMDBapi']

    serviceurl = 'http://www.omdbapi.com/?'
    # Encode the key together with the title so that special characters in
    # either value are escaped consistently.
    url = serviceurl + urllib.parse.urlencode({'t': title, 'apikey': omdbapi})
    print(f'Retrieving the data of "{title}"')

    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as uh:
            data = uh.read()
    except urllib.error.URLError as e:
        print(f"ERROR: {e.reason}")
        return None

    try:
        json_data = json.loads(data)
    except ValueError as e:
        # Covers json.JSONDecodeError and undecodable bytes; one bad response
        # should not abort the caller's whole batch.
        print(f"ERROR: invalid JSON response: {e}")
        return None

    if json_data.get('Response') == 'True':
        return json_data

    print("Error encountered: ", json_data.get('Error'))
    return None

if __name__ == '__main__':
    # Script entry point: for the first 1000 rows of formatted/actedIn.csv,
    # look each title up with search_movie() and append the row, extended
    # with the returned imdb* fields, to imdb/actedIn.csv (tab-separated,
    # no header).
    cols = ['yagoID', 'name', 'predicate', 'title', 'tempredicate', 'date']
    # Read the input before opening the output so that a failed read does
    # not leave a truncated output file behind.
    films_df = pd.read_csv('formatted/actedIn.csv', sep='\t', header=None, names=cols)

    # Preparing the monitoring of the loop
    start_time = time()
    requests = 0
    movies_not_found = 0

    # The with-block guarantees the output file is closed even if an
    # iteration raises; newline='' lets the CSV writer control line endings.
    with open("imdb/actedIn.csv", "w+", newline='') as f:
        # 1000 rows at a time due to the API free-access limitation.
        # E.g., 0-1000, 1000-2000, etc.
        for _, row in islice(films_df.iterrows(), 0, 1000):
            if row['tempredicate'] != 'validOnDate':
                continue

            data = search_movie(row['title'])
            if data is None:
                # Failed lookups are tallied but not written, not counted in
                # `requests`, and not followed by a pause.
                movies_not_found += 1
                continue

            row['imdbID'] = data.get('imdbID')
            row['imdbTitle'] = data.get('Title')
            row['imdbReleased'] = data.get('Released')
            row['imdbYear'] = data.get('Year')
            row['imdbActors'] = data.get('Actors')

            # Write the enriched row as a single tab-separated line.
            df_row = pd.DataFrame(row).T
            df_row.to_csv(f, index=False, sep='\t', header=None)

            # Pause the loop
            sleep(randint(8, 15))

            # Monitor the requests
            requests += 1
            elapsed_time = time() - start_time
            print(f'Request:{requests}; Frequency: {requests / elapsed_time} requests/s')
            print('\n')

            # Safety net: break the loop if the number of requests is greater
            # than expected. Uses warnings.warn because only the `warnings`
            # module is imported; a bare `warn` would raise NameError.
            if requests > 1000:
                warnings.warn('Number of requests was greater than expected.')
                break

    print(f'Number of movies not found: {movies_not_found}')