# Final project: Predict movies box office

1712811 - Lê Trung Tiến \
1712742 - Nguyễn Tấn Tài

In [21]:
import aiohttp
import asyncio
import json
import os
from aiohttp import ClientSession
import pandas as pd

## Collect data

In [22]:
# GET GENERAL DATA OF MOVIES AND WRITE TO FILE

# Keyword to find movies
KEYWORD = 'super'
# Default and constant movies per page = 10
NUMBER_OF_PAGES_TO_GET = 90

# Dataframe to save movie general details
data_short = pd.DataFrame(columns=['Title', 'Year', 'imdbID', 'Type', 'Poster'])

# We use async techique here
async def get_movie_short_async(page, session):
    url = f'http://www.omdbapi.com/?apikey=2889d713&s={KEYWORD}&page={page}&type=movie'
    try:
        response = await session.request(method='GET', url=url)
        response.raise_for_status()
#         print(f"Response status ({url}): {response.status}")
    except HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        print(f"An error ocurred: {err}")
    response_json = await response.json()
    global data_short
    data_short = data_short.append(response_json['Search'], ignore_index=True)
    pass

# Call the above function concurrently to save time
async with ClientSession() as session:
    await asyncio.gather(*[get_movie_short_async(page, session) for page in range(1, NUMBER_OF_PAGES_TO_GET + 1)])
    
data_short.head()
data_short.to_csv('data_short.csv', index=False)

Response status (http://www.omdbapi.com/?apikey=2889d713&s=super&page=3&type=movie): 200
Response status (http://www.omdbapi.com/?apikey=2889d713&s=super&page=10&type=movie): 200
Response status (http://www.omdbapi.com/?apikey=2889d713&s=super&page=14&type=movie): 200
Response status (http://www.omdbapi.com/?apikey=2889d713&s=super&page=9&type=movie): 200
Response status (http://www.omdbapi.com/?apikey=2889d713&s=super&page=82&type=movie): 200
Response status (http://www.omdbapi.com/?apikey=2889d713&s=super&page=29&type=movie): 200
Response status (http://www.omdbapi.com/?apikey=2889d713&s=super&page=57&type=movie): 200
Response status (http://www.omdbapi.com/?apikey=2889d713&s=super&page=1&type=movie): 200
Response status (http://www.omdbapi.com/?apikey=2889d713&s=super&page=5&type=movie): 200
Response status (http://www.omdbapi.com/?apikey=2889d713&s=super&page=11&type=movie): 200
Response status (http://www.omdbapi.com/?apikey=2889d713&s=super&page=20&type=movie): 200
Response statu

In [23]:
# GET DETAILS OF EACH MOVIES

# Dataframe to save movie details
data_detail = pd.DataFrame(columns=["Title", "Year", "Rated", "Released", "Runtime", "Genre", "Director", "Writer", "Actors", "Plot", "Language", "Country", "Awards", "Poster", "Ratings", "Metascore", "imdbRating", "imdbVotes", "imdbID", "Type", "DVD", "BoxOffice", "Production", "Website", "Response"])

async def get_movie_detail_async(imdb_id, session):
    url = f'http://www.omdbapi.com/?apikey=2889d713&i={imdb_id}&type=movie'
    try:
        response = await session.request(method='GET', url=url)
        response.raise_for_status()
#         print(f"Response status ({url}): {response.status}")
    except HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        print(f"An error ocurred: {err}")
    response_json = await response.json()
    global data_detail
    data_detail = data_detail.append(response_json, ignore_index=True)
    pass

async with ClientSession() as session:
    await asyncio.gather(*[get_movie_detail_async(imdb_id, session) for imdb_id in data_short['imdbID']])
    
data_detail.head()
data_detail.to_csv('data_detail.csv', index=False)

## Question to ask
How much could the box office of a newly released film be?

## Discover data

### Read data from file

In [24]:
# Read data from file
data_df = pd.read_csv('data_detail.csv')
data_df.head()
# Number of rows and columns
data_df.shape

# Number of duplicated rows
data_df.duplicated().sum()

# Column meanings
# TODO

# Data types
data_df.dtypes

Title          object
Year            int64
Rated          object
Released       object
Runtime        object
Genre          object
Director       object
Writer         object
Actors         object
Plot           object
Language       object
Country        object
Awards         object
Poster         object
Ratings        object
Metascore     float64
imdbRating    float64
imdbVotes      object
imdbID         object
Type           object
DVD            object
BoxOffice      object
Production     object
Website       float64
Response         bool
dtype: object

### Useful columns to use

In [25]:
# As we see, the columns will be
useful_cols = ['Year', 'Rated', 'Runtime', 'Genre', 'Language', 'Country', 'Production']

# Why we dont use other columns:
# TODO


### Preprocess data

In [27]:
# As we see, the useful columns are all have invalid data type (Object). Now we convert it to number / category type

# Let see the categories of each columns:
for useful_col in useful_cols:
    print(useful_col)
    print(data_df[useful_col].unique())
    print()

Year
[2012 2017 1925 2001 2011 2007 2018 2019 2015 2010 2009 1996 1986 2013
 2004 1989 1998 1973 1964 1971 2005 1992 2016 2014 1994 1972 2020 2006
 1966 2008 1993 1985 1984 1937 1974 1978 2002 1991 1969 1979 2003 1995
 1999 1981 1980 2000 1983 1988 1949 1976 1987 1982 1970 1990 2022 1967
 1997 1975 1952 1977 1943 1960 2021 1934 1935 1947 1965]

Rated
[nan 'Not Rated' 'R' 'PG-13' 'TV-Y7' 'TV-Y7-FV' 'TV-MA' 'PG' 'TV-14'
 'TV-PG' 'G' 'Approved' 'NOT RATED' 'UNRATED' 'Unrated' '14+' 'TV-G'
 'M/12' 'PASSED' 'X' 'APPROVED']

Runtime
[nan '20 min' '92 min' '83 min' '120 min' '78 min' '4 min' '108 min'
 '49 min' '94 min' '93 min' '52 min' '77 min' '75 min' '106 min' '30 min'
 '87 min' '7 min' '1 h 59 min' '22 min' '84 min' '136 min' '86 min'
 '96 min' '133 min' '12 min' '91 min' '200 min' '89 min' '95 min' '64 min'
 '99 min' '5 min' '104 min' '60 min' '112 min' '100 min' '55 min'
 '110 min' '103 min' '70 min' '240 min' '1 h 58 min' '2 min' '85 min'
 '90 min' '98 min' '154 min' '1,325 min' '15 

In [30]:
# First of all, we can see that there is a column whose type could be category but we have to preprocess it a little.
# That column is: Genre
all_gernes_in_one_list = []
for gerne in data_df['Genre']:
    all_gernes_in_one_list.extend(gerne.split(','))
    
print(all_gernes_in_one_list)

# OK, we can see that the columns with category data type are


AttributeError: 'float' object has no attribute 'split'

In [None]:
# OK, hopefully the columns we choose dont have too much categories
# Now we calculate the percentage of every categories in each column

### Number of rows and columns