# Augmenting The Wikipedia Plot Summary Dataset

We start off the data collection and cleaning pipeline with two datasets from Wikipedia. To wrangle this data, I decided to use Python, more specifically, the CSV and Pandas modules.

In [None]:
import pandas as pd
import csv
import os
import datetime
import requests
from time import sleep

Since a lot of the data was missing from the Wikipedia dataset, I decided to augment this dataset with more data from the OMDB API.

In [None]:
# Endpoint that ping_api sends its GET requests to.
API_URL = "http://www.omdbapi.com"
# Default key passed as the 'apikey' query parameter.
# NOTE(review): this credential is hard-coded in the notebook -- consider loading it from
# configuration or the environment before sharing the file.
API_KEY = "d8a2854c"

This simple utility function sends a request to the OMDB API and returns the decoded JSON response.

In [None]:
def ping_api(movie_name, year=None, api_key=API_KEY, timeout=30):
    """Look up a single movie on the OMDB API by title.

    Args:
        movie_name: Title to search for (sent as the ``t`` query parameter).
        year: Optional release year used to narrow the match (sent as ``y``).
            Left out of the request when falsy.
        api_key: OMDB API key; defaults to the module-level ``API_KEY``.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely (the previous behaviour).

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
        ValueError: If the response body is not valid JSON.
    """
    request_params = {'apikey': api_key, 'type': 'movie', 't': movie_name, 'plot': 'full'}

    if year:
        request_params['y'] = year

    # requests applies no timeout unless one is given, so a stalled connection
    # would otherwise block the whole pipeline forever.
    # The HTTP status is deliberately not checked here: the caller inspects the
    # "Response"/"Error" fields of the decoded body to tell failures apart.
    r = requests.get(API_URL, params=request_params, timeout=timeout)

    return r.json()

I read in the dataframes and concatenate them into one big dataframe, so that we can iterate over it using the CSV module.

In [None]:
# Load both Wikipedia plot-summary CSVs (first column as index) and stack them
# into a single frame with a fresh 0..n-1 index.
source_paths = ("datasets/wiki_dataset_1.csv", "datasets/wiki_dataset_2.csv")
frames = [pd.read_csv(path, index_col=0) for path in source_paths]
df = pd.concat(frames, ignore_index=True)

# Dump the combined frame to disk so it can be streamed row by row with the csv module.
df.to_csv("intermediate.csv")

# Cell values that count as "missing" and are eligible to be filled in from OMDB.
imputation_targets = ("Unknown", "N/A", "unknown", "")

## Main Pipeline

1. Open the concatenated dataframe, as well as the final CSV we will need to write to.
2. Add the columns for the supplementary data.
3. Query the API with the title of the movie found in each row, and if possible, the year it was produced.
4. If the movie wasn't found on the database, continue with the next row.
5. If we've reached the request limit on this API key, then sleep for 24 hours and continue the cycle.
6. At this point, we have a valid JSON response from the API, so impute the fields with whatever it returns, and add the additional data to the appropriate columns.
7. If there are any issues with any particular row, then continue with the next row. 

In [None]:
# Imputes missing data from the wikipedia plot summary dataset with OMDB-scraped data, and supplements it with
# extra data.
#
# Every row of intermediate.csv is looked up on OMDB by title (and by year when it is known). Cells of the
# Director / Release Year / Genre / Cast / Plot columns whose value is one of `imputation_targets` are filled
# in from the response, the supplementary rating columns are added, and the row is appended to final.csv.
# Rows that OMDB cannot resolve, and rows whose response lacks an expected key, are reported and left out.
# Any other exception (including KeyboardInterrupt) propagates; the `with` block still closes both files.


def _blank_if_na(value, placeholders=("N/A",)):
    # Store OMDB's "no data" placeholder strings as empty cells.
    return "" if value in placeholders else value


supplementary_data = ["Runtime", "imdbRating", "imdbVotes", "Rated", "Rotten Tomatoes", "Metacritic"]

# final.csv is opened in append mode, so only emit the header when the file is new or empty; otherwise a
# re-run would bury a second header line in the middle of the data.
needs_header = not os.path.isfile("final.csv") or os.path.getsize("final.csv") == 0

# newline="" is what the csv module expects of its file objects, and UTF-8 matches what pandas wrote to
# intermediate.csv and what it will assume when reading final.csv back.
with open("intermediate.csv", newline="", encoding="utf-8") as read_file, \
        open("final.csv", 'a', newline="", encoding="utf-8") as write_file:

    reader = csv.DictReader(read_file)

    writer = csv.DictWriter(write_file, fieldnames=df.columns.tolist() + supplementary_data)
    if needs_header:
        writer.writeheader()

    for row in reader:

        row = dict(row)
        # Reset per row so the error report below never shows the previous row's response.
        movie_json = {}

        try:
            lookup_year = row["Release Year"] if row["Release Year"] not in imputation_targets else None

            # Ping the API. When the request limit is hit, wait a day and repeat the query for the SAME row,
            # so the row that ran into the limit is not silently dropped.
            while True:
                movie_json = ping_api(row["Title"], year=lookup_year, api_key=API_KEY)

                if movie_json["Response"] == "False" and movie_json["Error"] == "Request limit reached!":
                    print("Timed out: JSON response is: {}".format(movie_json))
                    sleep(3600 * 24)
                    continue

                break

            # If the movie doesn't exist in the OMDB database, then skip this row, since we can't add any
            # extra data to it.
            if movie_json["Response"] == "False":

                if movie_json["Error"] == "Movie not found!":
                    print("Not found: JSON response is: {}".format(movie_json))

                print("Empty API response for {}. ".format(row["Title"]))

                continue

            print("{}:\t\tImputing data for {}.".format(str(datetime.datetime.now()), row["Title"]))

            # If any of these fields are imputable, impute them.
            if row["Director"] in imputation_targets:
                row["Director"] = _blank_if_na(movie_json["Director"])

            if row["Release Year"] in imputation_targets:
                row["Release Year"] = movie_json["Year"]

            if row["Genre"] in imputation_targets:
                row["Genre"] = movie_json["Genre"]

            if row["Cast"] in imputation_targets:
                row["Cast"] = _blank_if_na(movie_json["Actors"])

            if row["Plot"] in imputation_targets:
                row["Plot"] = movie_json["Plot"]

            # Supplement with additional data
            # "Runtime" is a declared output column; .get() keeps a response without it from dropping the row.
            row["Runtime"] = _blank_if_na(movie_json.get("Runtime", "N/A"))
            row["imdbRating"] = _blank_if_na(movie_json["imdbRating"])
            row["imdbVotes"] = _blank_if_na(movie_json["imdbVotes"])
            row["Rated"] = _blank_if_na(movie_json["Rated"],
                                        ("N/A", "NOT RATED", "PASSED", "UNRATED", "APPROVED"))

            # Go through the ratings list, and copy over only the sources we have a column for. Adding any
            # other source as a key would make DictWriter reject the row (unknown field name).
            for rating in movie_json.get("Ratings", []):
                if rating["Source"] in supplementary_data:
                    row[rating["Source"]] = rating["Value"]

            # The "" key is the unnamed index column pandas wrote to intermediate.csv; it has no output column.
            writer.writerow({key: val for key, val in row.items() if key != ""})

        except KeyError:
            # A column or response field we rely on is missing: report it and move on to the next row.
            print("Movie JSON: {}".format(movie_json))
            print("Row: {}".format(row) + "\n")
            print("{}:\t\tRan into a key error for row {}.".format(str(datetime.datetime.now()), row.get("Title")))

After imputing the missing data and augmenting the dataset with the Rotten Tomatoes and Metacritic data, I read the resulting CSV file back in and split it into two halves. I split up the data because my computer can't open very large files, so I look at each half separately.

In [None]:
# Reload the augmented dataset, using the first CSV column as the index.
df = pd.read_csv("final.csv", index_col=0)

# Cut the rows at the midpoint and save each half to its own CSV.
midpoint = len(df) // 2
first_half, second_half = df[:midpoint], df[midpoint:]
first_half.to_csv("souped_up_movie_dataset_1.csv")
second_half.to_csv("souped_up_movie_dataset_2.csv")

# The intermediate file only existed to feed the csv reader; remove it if it is still around.
intermediate_path = "intermediate.csv"
if os.path.isfile(intermediate_path):
    os.remove(intermediate_path)