In [23]:
import requests
import pandas as pd
from tmdb_apikey import api_key
from bs4 import BeautifulSoup

In [24]:
# Gather the titles and TMDB IDs of the top-rated movies by walking the
# paginated `top_rated` endpoint. The IDs are used in later cells to request
# the full details of each movie.
movies = []
movies_id = []
for page in range(1, 14):
    response = requests.get('https://api.themoviedb.org/3/movie/top_rated?', params={'api_key': api_key, 'page': page})
    # Surface HTTP errors (bad key, rate limit, ...) here instead of failing
    # later with a KeyError on 'results'.
    response.raise_for_status()
    r = response.json()
    for film in r['results']:
        # Titles and IDs are appended together, so both lists stay index-aligned.
        movies.append(film['title'])
        movies_id.append(film['id'])
        if len(movies) == 250:
            break
    # The inner `break` only leaves the per-page loop; stop paging as well so
    # the lists can never grow past 250 entries.
    if len(movies) == 250:
        break

In [25]:
# Request the detail record of the second collected movie ID and show the
# decoded JSON, to see which fields the endpoint returns.
response = requests.get(
    'https://api.themoviedb.org/3/movie/' + str(movies_id[1]),
    params={'api_key': api_key},
)
response.json()

{'adult': False,
 'backdrop_path': '/wPU78OPN4BYEgWYdXyg0phMee64.jpg',
 'belongs_to_collection': None,
 'budget': 25000000,
 'genres': [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}],
 'homepage': '',
 'id': 278,
 'imdb_id': 'tt0111161',
 'original_language': 'en',
 'original_title': 'The Shawshank Redemption',
 'overview': 'Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.',
 'popularity': 77.331,
 'poster_path': '/hBcY0fE9pfXzvVaY4GKarweriG2.jpg',
 'production_companies': [{'id': 97,
   'logo_path': '/7znWcbDd4PcJzJUlJxYqAlPPykp.png',
   'name': 'Castle Rock Entertainment',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1

In [26]:
# Collect data for all parameters relevant to our analysis.
# Every list below is index-aligned with `movies_id`: entry k of each list
# describes movie k, so each iteration must append exactly one value per list.
movies_revenue = []
ratings = []
release_dates = []
budget = []
genres = []
source = []
runtime = []
imdb_id = []
users_rated = []
for movie_id in movies_id[:250]:
    response = requests.get('https://api.themoviedb.org/3/movie/' + str(movie_id), params={'api_key': api_key})
    # Surface HTTP errors here rather than as a KeyError on a missing field.
    response.raise_for_status()
    r = response.json()
    # vote_average is multiplied by 10 and rounded to an integer score.
    ratings.append(round(r['vote_average']*10))
    release_dates.append(r['release_date'])
    runtime.append(r['runtime'])
    imdb_id.append(r['imdb_id'])
    users_rated.append(r['vote_count'])
    # Rename 'Science Fiction' to the shorter 'Sci-Fi'; keep other names as-is.
    list_genres = [
        'Sci-Fi' if g['name'] == 'Science Fiction' else g['name']
        for g in r['genres']
    ]
    # Films with several genres keep the list; films with exactly one genre
    # store the bare name. A film with no genre still gets a placeholder so
    # that `genres` stays the same length as the other lists.
    if len(list_genres) > 1:
        genres.append(list_genres)
    elif len(list_genres) == 1:
        genres.append(list_genres[0])
    else:
        genres.append(None)
    # Non-positive budget / revenue values are stored as missing (None).
    budget.append(r['budget'] if r['budget'] > 0 else None)
    movies_revenue.append(r['revenue'] if r['revenue'] > 0 else None)
    source.append('TMDB')

In [27]:
# Collect the director(s) of every movie from the TMDB credits endpoint.
# `directors` is index-aligned with `movies_id`. A film with exactly one
# director stores the bare name; otherwise the (possibly empty) list of names
# is stored.
directors = []
for movie_id in movies_id[:250]:
    response = requests.get('https://api.themoviedb.org/3/movie/' + str(movie_id) + '/credits?', params={'api_key':api_key})
    # Surface HTTP errors here rather than as a KeyError on 'crew'.
    response.raise_for_status()
    r = response.json()
    list_directors = [d['name'] for d in r['crew'] if d['job'] == 'Director']
    if len(list_directors) == 1:
        # Take the element itself. Round-tripping through str(list).strip(...)
        # leaves stray double quotes on names containing an apostrophe,
        # because the list repr switches quoting style for such names.
        directors.append(list_directors[0])
    else:
        directors.append(list_directors)

In [28]:
# Request headers carrying a browser-style User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/50.0.2661.102 Safari/537.36'
    ),
}

In [29]:
# Wherever possible, replace missing revenue values with the worldwide gross
# shown on the movie's IMDb title page. Entries that cannot be filled stay None.
for i in range(250):
    if movies_revenue[i] is not None:
        continue
    response = requests.get('https://www.imdb.com/title/' + str(imdb_id[i]), headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Match on the data-testid only: the CSS class names on this element are
    # build-generated hashes, so requiring them makes the lookup break silently.
    s = soup.find('li', attrs={'data-testid': 'title-boxoffice-cumulativeworldwidegross'})
    if s is None:
        # The page has no worldwide-gross entry; leave the value missing.
        continue
    ms = s.get_text().replace('Gross worldwide', '')
    try:
        # Drop the thousands separators and the leading currency symbol.
        movies_revenue[i] = int(ms.replace(',', '')[1:])
    except ValueError:
        # Text was not a plain number; leave the value missing. Only this
        # parse error is ignored, so interrupts and real bugs still surface.
        continue
movies_revenue


[245066411,
 28341469,
 2028077,
 102600000,
 321365567,
 100000000,
 None,
 274925095,
 1000000,
 357986087,
 257591776,
 1004558444,
 286801374,
 214179088,
 25253887,
 677387716,
 1385351,
 1118888979,
 None,
 46835000,
 None,
 11990401,
 346300,
 230098753,
 None,
 5472914,
 516962,
 32052925,
 100853753,
 None,
 197251,
 48045728,
 108981275,
 30641770,
 375540831,
 236049757,
 None,
 30156463,
 871368364,
 538400000,
 701729206,
 15222,
 108099,
 120072577,
 13092000,
 926287400,
 4903192,
 None,
 5000000,
 461000000,
 327311859,
 20347513,
 825532764,
 20000000,
 37034514,
 23875127,
 11000000,
 None,
 55240,
 272742922,
 4250000,
 159414369,
 45284974,
 235860116,
 1315376,
 46808,
 463618,
 381109762,
 36368,
 20339624,
 1266570,
 87212911,
 10195760,
 92600000,
 92300000,
 None,
 1488732821,
 None,
 5380118,
 150000000,
 33187,
 168817,
 5252,
 None,
 112536,
 503063688,
 930363,
 43862,
 21057208,
 426588510,
 None,
 19289062,
 2799439100,
 3494070,
 15194593,
 108967,
 2046

In [38]:
# Keep the first four characters of each release date string (the year part
# of the 'YYYY-MM-DD' values shown in the outputs).
year_releases = [release_dates[i][:4] for i in range(250)]


In [39]:
# Display the extracted release years to check them by eye.
year_releases

['1972',
 '1994',
 '2022',
 '1974',
 '1993',
 '1995',
 '2022',
 '2001',
 '1957',
 '2016',
 '2019',
 '2008',
 '1999',
 '1994',
 '1966',
 '1994',
 '2016',
 '2003',
 '2020',
 '1990',
 '2019',
 '1988',
 '1954',
 '1997',
 '2020',
 '1984',
 '1988',
 '1960',
 '1999',
 '2020',
 '2021',
 '2019',
 '1975',
 '2002',
 '2018',
 '2004',
 '2011',
 '2016',
 '2001',
 '1980',
 '2014',
 '1962',
 '2020',
 '2002',
 '2014',
 '2002',
 '2000',
 '2022',
 '1950',
 '2022',
 '1995',
 '2018',
 '2010',
 '1997',
 '1954',
 '1998',
 '1940',
 '2013',
 '1952',
 '1991',
 '1931',
 '1997',
 '1994',
 '1989',
 '2019',
 '1963',
 '1936',
 '1985',
 '1940',
 '2020',
 '2020',
 '2021',
 '2019',
 '2019',
 '2021',
 '2018',
 '2022',
 '2020',
 '1968',
 '1979',
 '1960',
 '1985',
 '1957',
 '1964',
 '1997',
 '2020',
 '2020',
 '2020',
 '1998',
 '2011',
 '2020',
 '2019',
 '2019',
 '2014',
 '2003',
 '2022',
 '2018',
 '2019',
 '2019',
 '2019',
 '1946',
 '1966',
 '1994',
 '2016',
 '1991',
 '1974',
 '2019',
 '2018',
 '1994',
 '2012',
 '2020',
 

In [40]:
# Column name -> list of values for the final table. Most lists are cut to the
# first 250 entries; 'users voted' and 'release year' are passed unsliced.
data = {
    'movie': movies[:250],
    'director': directors[:250],
    'genre': genres[:250],
    'rating': ratings[:250],
    'box office': movies_revenue[:250],
    'release date': release_dates[:250],
    'length': runtime[:250],
    'data source': source[:250],
    'users voted': users_rated,
    'release year': year_releases,
}

In [41]:
# Build the raw table from the collected columns, save it as CSV, and show it.
df = pd.DataFrame(data=data)
df.to_csv(path_or_buf='tmdb_raw_data.csv')
df

Unnamed: 0,movie,director,genre,rating,box office,release date,length,data source,users voted,release year
0,The Godfather,Francis Ford Coppola,"[Drama, Crime]",87,245066411.0,1972-03-14,175,TMDB,17604,1972
1,The Shawshank Redemption,Frank Darabont,"[Drama, Crime]",87,28341469.0,1994-09-23,142,TMDB,23443,1994
2,Cuando Sea Joven,Raúl Martínez,"[Comedy, Fantasy]",87,2028077.0,2022-09-14,115,TMDB,211,2022
3,The Godfather Part II,Francis Ford Coppola,"[Drama, Crime]",86,102600000.0,1974-12-20,202,TMDB,10662,1974
4,Schindler's List,Steven Spielberg,"[Drama, History, War]",86,321365567.0,1993-12-15,195,TMDB,13865,1993
...,...,...,...,...,...,...,...,...,...,...
245,Rocco and His Brothers,Luchino Visconti,"[Crime, Drama]",81,11328.0,1960-10-06,178,TMDB,469,1960
246,Jojo Rabbit,Taika Waititi,"[Comedy, War, Drama]",81,82468705.0,2019-10-18,108,TMDB,8261,2019
247,"Love, Simon",Greg Berlanti,"[Comedy, Drama, Romance]",81,66316289.0,2018-02-16,110,TMDB,5612,2018
248,Hidden Figures,Theodore Melfi,"[Drama, History]",81,235956898.0,2016-12-10,127,TMDB,8494,2016


In [42]:
# Separate films by director: a film whose 'director' cell holds a list gets
# one row per director; single-director films are unchanged.
# NOTE(review): the file name starts with 'tmb_' while the other exports use
# 'tmdb_' - confirm whether that is intentional before renaming.
df_2 = df.explode(column='director')
df_2.to_csv(path_or_buf='tmb_dir_sep.csv')
df_2

Unnamed: 0,movie,director,genre,rating,box office,release date,length,data source,users voted,release year
0,The Godfather,Francis Ford Coppola,"[Drama, Crime]",87,245066411.0,1972-03-14,175,TMDB,17604,1972
1,The Shawshank Redemption,Frank Darabont,"[Drama, Crime]",87,28341469.0,1994-09-23,142,TMDB,23443,1994
2,Cuando Sea Joven,Raúl Martínez,"[Comedy, Fantasy]",87,2028077.0,2022-09-14,115,TMDB,211,2022
3,The Godfather Part II,Francis Ford Coppola,"[Drama, Crime]",86,102600000.0,1974-12-20,202,TMDB,10662,1974
4,Schindler's List,Steven Spielberg,"[Drama, History, War]",86,321365567.0,1993-12-15,195,TMDB,13865,1993
...,...,...,...,...,...,...,...,...,...,...
245,Rocco and His Brothers,Luchino Visconti,"[Crime, Drama]",81,11328.0,1960-10-06,178,TMDB,469,1960
246,Jojo Rabbit,Taika Waititi,"[Comedy, War, Drama]",81,82468705.0,2019-10-18,108,TMDB,8261,2019
247,"Love, Simon",Greg Berlanti,"[Comedy, Drama, Romance]",81,66316289.0,2018-02-16,110,TMDB,5612,2018
248,Hidden Figures,Theodore Melfi,"[Drama, History]",81,235956898.0,2016-12-10,127,TMDB,8494,2016


In [43]:
# Separate films by genre: a film whose 'genre' cell holds a list gets one row
# per genre (the index value repeats for those rows). Save and show the result.
df_3 = df.explode(column='genre')
df_3.to_csv(path_or_buf='tmdb_genre_sep.csv')
df_3

Unnamed: 0,movie,director,genre,rating,box office,release date,length,data source,users voted,release year
0,The Godfather,Francis Ford Coppola,Drama,87,245066411.0,1972-03-14,175,TMDB,17604,1972
0,The Godfather,Francis Ford Coppola,Crime,87,245066411.0,1972-03-14,175,TMDB,17604,1972
1,The Shawshank Redemption,Frank Darabont,Drama,87,28341469.0,1994-09-23,142,TMDB,23443,1994
1,The Shawshank Redemption,Frank Darabont,Crime,87,28341469.0,1994-09-23,142,TMDB,23443,1994
2,Cuando Sea Joven,Raúl Martínez,Comedy,87,2028077.0,2022-09-14,115,TMDB,211,2022
...,...,...,...,...,...,...,...,...,...,...
247,"Love, Simon",Greg Berlanti,Romance,81,66316289.0,2018-02-16,110,TMDB,5612,2018
248,Hidden Figures,Theodore Melfi,Drama,81,235956898.0,2016-12-10,127,TMDB,8494,2016
248,Hidden Figures,Theodore Melfi,History,81,235956898.0,2016-12-10,127,TMDB,8494,2016
249,The Elephant Man,David Lynch,Drama,81,26010864.0,1980-10-09,124,TMDB,2966,1980


In [44]:
# Separate by both director and genre: start from the director-separated
# table and additionally give every genre its own row.
df_4 = df_2.explode('genre')
df_4.to_csv('tmdb_gendir_sep.csv')
# Keep the frame as the last expression so the cell actually renders it (a
# bare name in the middle of a cell is a no-op), matching the other export cells.
df_4

In [35]:
# Order the director-separated table by director name, then write the
# genre-separated version of that ordering to CSV.
# NOTE(review): this writes to the same path as the df_4 export, so whichever
# cell runs last determines the file's contents - confirm which one is wanted.
df_i = df_2.sort_values(by='director')
df_i.explode(column='genre').to_csv(path_or_buf='tmdb_gendir_sep.csv')

In [36]:
# Display the director-sorted table.
df_i

Unnamed: 0,movie,director,genre,rating,box office,release date,length,data source,users voted
5,Dilwale Dulhania Le Jayenge,Aditya Chopra,"[Comedy, Drama, Romance]",86,100000000.0,1995-10-19,190,TMDB,4080
210,Red Beard,Akira Kurosawa,Drama,81,46808.0,1965-04-03,185,TMDB,272
65,High and Low,Akira Kurosawa,"[Crime, Drama, Mystery, Thriller]",83,46808.0,1963-03-01,142,TMDB,680
186,Yojimbo,Akira Kurosawa,"[Drama, Thriller]",81,46808.0,1961-04-25,110,TMDB,1177
22,Seven Samurai,Akira Kurosawa,"[Action, Drama]",85,346300.0,1954-04-26,207,TMDB,2981
...,...,...,...,...,...,...,...,...,...
93,Mommy,Xavier Dolan,Drama,83,3494070.0,2014-09-19,138,TMDB,2449
114,Tokyo Story,Yasujirō Ozu,Drama,82,37880.0,1953-11-03,137,TMDB,856
124,Zack Snyder's Justice League,Zack Snyder,"[Action, Adventure, Fantasy, Sci-Fi]",82,,2021-03-18,242,TMDB,8669
223,New Gods: Nezha Reborn,Zhao Ji,"[Animation, Action, Fantasy]",81,56088478.0,2021-02-06,116,TMDB,339
