### Load Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import tensorflow as tf
import os
import time
import scikitplot as skplt
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

In [1]:
from pandas.errors import SettingWithCopyWarning
import warnings
# Silence pandas' chained-assignment warning for the whole session
# (presumably triggered by the in-place edits on filtered frames later on).
warnings.simplefilter(action="ignore",category=(SettingWithCopyWarning))
# Ignore warnings whose message starts with these deprecated-option names.
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")
# Ignore every FutureWarning as well.
warnings.simplefilter(action='ignore', category=FutureWarning)

### Load necessary files

In [None]:
# All inputs are addressed relative to os.path.dirname(__name__); when this runs
# as a notebook/script `__name__` is '__main__', so that prefix is '' and the
# paths resolve against the current working directory.
data_root = os.path.dirname(__name__)

# Scraped film list (comma separated, keyed by `tconst`).
filename = os.path.join(data_root, "RawDataFiles/films_to_scrape_sample.csv")
script_df = pd.read_csv(filename, sep=',', low_memory=False)
print("script_df: ", script_df['tconst'].nunique(dropna=False))

# Plot summaries (tab separated, keyed by `film_id`).
filename = os.path.join(data_root, "RawDataFiles/film_plots.tsv")
plots_df = pd.read_csv(filename, sep='\t', low_memory=False)
print("plot_df: ", plots_df['film_id'].nunique(dropna=False))

# Filming locations.
filename = os.path.join(data_root, "RawDataFiles/film_locations.tsv")
locations_df = pd.read_csv(filename, sep='\t', low_memory=False)
print("locations_df: ", locations_df['film_id'].nunique(dropna=False))

# Base details (detail_item / detail_result pairs).
filename = os.path.join(data_root, "RawDataFiles/films_base_details.tsv")
base_df = pd.read_csv(filename, sep='\t', low_memory=False)
print("base_df: ", base_df['film_id'].nunique(dropna=False))

# Box office details (detail_item / detail_result pairs).
filename = os.path.join(data_root, "RawDataFiles/films_box_office.tsv")
box_df = pd.read_csv(filename, sep='\t', low_memory=False)
print("box_df: ", box_df['film_id'].nunique(dropna=False))

# IMDb title.basics dump (keyed by `tconst`).
filename = os.path.join(data_root, "imdb_scraper/Database/title.basics.tsv")
df4 = pd.read_csv(filename, sep='\t', low_memory=False)
print("title.basics: ", df4['tconst'].nunique(dropna=False))


### Drop Extra Films

In [None]:
# Display name -> scraped detail table; the cleaning loops below iterate over this mapping.
db_list = {'plots summary':plots_df,'locations':locations_df,'base details':base_df, 'box office':box_df}

In [None]:
# film_id values that are dropped from every table in db_list (see remove_extra)
# and excluded from the film list used to build the feature file.
unwanted_movies = ['tt15477076', 'tt0245936', 'tt15039908', 'tt0118656', 'tt0463670',
                  'tt1606306', 'tt1298820', 'tt21441286', 'tt0487456', 'tt2536522', 'tt15698592', 'tt5910170','tt0081299']

def remove_extra(df, unwanted_movies):
    """Drop, in place, every row of *df* whose ``film_id`` is in *unwanted_movies*.

    The frame is mutated and nothing is returned.
    """
    is_unwanted = df['film_id'].isin(unwanted_movies)
    df.drop(df.index[is_unwanted], inplace=True)

# Strip the unwanted films from each scraped table (the frames are edited in place).
for table in db_list.values():
    remove_extra(table, unwanted_movies)

### Check and remove NULL values

In [None]:
def print_num_of_films(df):
    """Return the number of distinct film ids in *df*.

    The ``tconst`` column is used when present, otherwise ``film_id``.
    Despite the name, nothing is printed.
    """
    id_column = 'tconst' if 'tconst' in df else 'film_id'
    return len(df[id_column].unique())

def calculate_null_data(df, item):
    """Replace nulls in column *item* with the string 'None' and count affected films.

    The column is overwritten on *df* itself; the return value is the number
    of distinct films that have at least one 'None' entry in that column.
    """
    missing = df[item].isnull()
    df[item] = np.where(missing, 'None', df[item])
    return print_num_of_films(df[df[item] == 'None'])

null_dist = {'plots summary': 0, 'locations': 0, 'base details': 0, 'box office': 0}
# Column to inspect in each table of db_list, listed in the same order as its keys.
feature_item = ['summary', 'locations', 'detail_result', 'detail_result']

print("Total Films: ", print_num_of_films(script_df))
print("\tNull Dataset sizes: ")
for (key, value), column in zip(db_list.items(), feature_item):
    null_dist[key] = calculate_null_data(value, column)
    print(f'{key} : {null_dist[key]}')
    

In [None]:
# Keep only the rows whose value is not the 'None' placeholder written by calculate_null_data.
plots_df = plots_df[~plots_df['summary'].eq('None')]
locations_df = locations_df[~locations_df['locations'].eq('None')]
base_df = base_df[~base_df['detail_result'].eq('None')]
box_df = box_df[~box_df['detail_result'].eq('None')]

### Read the title.basics data file and create a base feature file

In [None]:
original_film_list = script_df.tconst.unique().tolist()
# Films to keep: everything scraped, minus the shared exclusion list and one
# additional id that is excluded only here.
film_list = set(original_film_list) - set(unwanted_movies) - {'tt6422744'}

# Rename on the result of the filter instead of in place, so the filtered
# slice of df4 is never mutated (the in-place form is a chained-assignment
# pattern that pandas warns about).
imdb_data = df4[df4['tconst'].isin(film_list)].rename(columns={'tconst': 'film_id'})
print("title.basics: ",len(imdb_data.film_id.unique()))

In [None]:
# Preview the first rows of the base film table.
imdb_data.head()

### Expand Base Details and merge in base feature file

#### Include country/countries of Origin

In [None]:
# Select the country-of-origin rows (singular or plural label) of the films being kept.
origin_df = base_df[((base_df["detail_item"] == "Country of origin") | 
                    (base_df["detail_item"] == "Countries of origin")) & (base_df['film_id'].isin(film_list))].reset_index()
origin_df.head()

# Per film and label: count the rows and join their values into one ', '-separated string.
filter_data = origin_df.groupby(['film_id', 'detail_item']).agg({"detail_result": ['count',  ', '.join]}).reset_index()
# Flatten the two-level columns: keep the first level for the group keys
# (empty second level) and the aggregate name ('count' / 'join') otherwise.
filter_data.columns = [col[0] if col[-1]=='' else col[-1] for col in filter_data.columns.values]
filter_data.rename(columns={'join': 'countryOfOrigin', 'count': 'countryOriginCount'}, inplace = True)
filter_data = filter_data.drop(['detail_item'], axis=1)

# Right-merge so that every film in imdb_data is kept, even without a country row.
new_data = pd.merge(filter_data, imdb_data, how="right", on=["film_id"])
# Films without a country row get a count of 0.
new_data['countryOriginCount'] = np.where(new_data['countryOriginCount'].isnull(), 0.0, new_data['countryOriginCount'])
new_data.head()

#### Include release Date and release Location

In [None]:
from datetime import datetime
import re
# Translate table that strips any parenthesis left in the extracted location.
replace_table = {
    ord('('): '',
    ord(')'): '',
}

# filter data for only release date item
base_details_df = base_df[(base_df["detail_item"] == "Release date") & (base_df['film_id'].isin(film_list))].reset_index()

base_details_df.rename(columns={'detail_result': 'releaseDate'}, inplace=True)
# The release location is the text between "(" and ")". Without a closing
# parenthesis str.find returns -1 and an unguarded slice would yield the date
# minus its last character, so fall back to the '\\N' placeholder (the same
# one the scene extraction on locations_df uses).
base_details_df['releaseLocation'] = base_details_df['releaseDate'].apply(
    lambda x: x[x.find("(")+1 : x.find(")")] if x.find(")") != -1 else '\\N')
base_details_df['releaseLocation'] = base_details_df['releaseLocation'].apply(lambda x: x.translate(replace_table))
# Keep only the part before the first "(" as the date text.
base_details_df['releaseDate'] = base_details_df['releaseDate'].apply(lambda x: re.sub(r'\(.*','',x))

# NOTE(review): errors='ignore' returns the input unchanged when any value does
# not match the format, leaving the actual parsing to the astype() call; the
# option is deprecated in recent pandas releases — confirm before upgrading.
base_details_df["releaseDate"] = pd.to_datetime(base_details_df["releaseDate"], format='%B %d, %Y ', errors='ignore').astype('datetime64[ns]')

# create another column for Year
base_details_df['releaseYear'] = base_details_df["releaseDate"].dt.year

# NOTE(review): the column is already datetime64 here, so this second pass
# looks like a no-op — confirm and remove if so.
base_details_df["releaseDate"] = pd.to_datetime(base_details_df["releaseDate"], format='%Y ', errors='ignore').astype('datetime64[ns]')
base_details_df = base_details_df.drop(['detail_item','index'], axis=1)

# Right-merge so that films without a release-date row are kept.
new_data = pd.merge(base_details_df, new_data, how="right", on=["film_id"])
new_data.head()

#### Include language data

In [None]:
def update_language_count(x):
    """Return the row's language count without 'None' placeholders.

    *x* is a row exposing ``languages`` (a comma-separated string) and
    ``languageCount`` (the number of joined entries). Entries are stripped
    before comparison because the values are joined with ', ', and every
    placeholder is subtracted, not just the first one. The row itself is
    left untouched.
    """
    placeholders = sum(1 for entry in x.languages.split(",") if entry.strip() == 'None')
    return x.languageCount - placeholders
        
# Select the language rows (singular or plural label) of the films being kept.
lang_df = base_df[((base_df["detail_item"] == "Language") | 
                    (base_df["detail_item"] == "Languages")) & (base_df['film_id'].isin(film_list))].reset_index()

# Null values become the 'None' placeholder so that ', '.join only sees strings.
lang_df['detail_result'] = np.where(lang_df['detail_result'].isnull(), 'None', lang_df['detail_result'])
# Per film and label: count the rows and join their values into one ', '-separated string.
lang_data = lang_df.groupby(['film_id', 'detail_item']).agg({"detail_result": ['count',  ', '.join]}).reset_index()
# Flatten the two-level columns produced by agg ('count' / 'join' for the aggregates).
lang_data.columns = [col[0] if col[-1]=='' else col[-1] for col in lang_data.columns.values]
lang_data.rename(columns={'join': 'languages', 'count': 'languageCount'}, inplace = True)
# Adjust the count for 'None' placeholders, row by row.
lang_data['languageCount'] = lang_data.apply(lambda x: update_language_count(x), axis=1)
lang_data = lang_data.drop(['detail_item'], axis=1)

# Right-merge so that films without a language row are kept; they get a count of 0.
new_data = pd.merge(lang_data, new_data, how="right", on=["film_id"])
new_data['languageCount'] = np.where(new_data['languageCount'].isnull(), 0.0, new_data['languageCount'])
new_data.head()

#### Include production companies

In [None]:
# Select the production-company rows (singular or plural label) of the films being kept.
prod_com_df = base_df[((base_df["detail_item"] == "Production company") | 
                    (base_df["detail_item"] == "Production companies")) & (base_df['film_id'].isin(film_list))].reset_index()

# Per film and label: count the rows and join their values into one ', '-separated string.
prod_data = prod_com_df.groupby(['film_id', 'detail_item']).agg({"detail_result": ['count',  ', '.join]}).reset_index()
# Flatten the two-level columns produced by agg ('count' / 'join' for the aggregates).
prod_data.columns = [col[0] if col[-1]=='' else col[-1] for col in prod_data.columns.values]
prod_data.rename(columns={'join': 'productionCompanies', 'count': 'productionCompanyCount'}, inplace = True)
prod_data = prod_data.drop(['detail_item'], axis=1)

# Right-merge so that films without a production-company row are kept; they get a count of 0.
new_data = pd.merge(prod_data, new_data, how="right", on=["film_id"])
new_data['productionCompanyCount'] = np.where(new_data['productionCompanyCount'].isnull(), 0.0, new_data['productionCompanyCount'])
new_data.head()

#### Include official sites

In [None]:
# Select the official-site rows (singular or plural label) of the films being kept.
site_df = base_df[((base_df["detail_item"] == "Official site") | 
                    (base_df["detail_item"] == "Official sites")) & (base_df['film_id'].isin(film_list))].reset_index()

# Per film and label: count the rows and join their values into one ', '-separated string.
site_data = site_df.groupby(['film_id', 'detail_item']).agg({"detail_result": ['count',  ', '.join]}).reset_index()
# Flatten the two-level columns produced by agg ('count' / 'join' for the aggregates).
site_data.columns = [col[0] if col[-1]=='' else col[-1] for col in site_data.columns.values]
site_data.rename(columns={'join': 'officialSites', 'count': 'officialSitesCount'}, inplace = True)
site_data = site_data.drop(['detail_item'], axis=1)

# Right-merge so that films without an official-site row are kept; they get a count of 0.
new_data = pd.merge(site_data, new_data, how="right", on=["film_id"])
new_data['officialSitesCount'] = np.where(new_data['officialSitesCount'].isnull(), 0.0, new_data['officialSitesCount'])
new_data.head()

### Expand Box Office and merge in base feature file

#### Include box office - Gross worldwide details

In [None]:
# Select the "Gross worldwide" rows of the films being kept.
gww_df = box_df[(box_df["detail_item"] == "Gross worldwide") & (box_df['film_id'].isin(film_list))].reset_index()

gww_df.rename(columns={'detail_result': 'GrossWorldwide'}, inplace = True)
# 'index' is the old row index added by reset_index(); it is not needed after the merge.
gww_df = gww_df.drop(['detail_item','index'], axis=1)

# Right-merge so that films without this box-office row are kept; the raw text
# is left as is and missing values become the integer 0.
new_data = pd.merge(gww_df, new_data, how="right", on=["film_id"])
new_data['GrossWorldwide'] = np.where(new_data['GrossWorldwide'].isnull(), 0, new_data['GrossWorldwide'])
new_data.head()

#### Include box office - Gross US & Canada details

In [None]:
# Select the "Gross US & Canada" rows of the films being kept.
grossUS_df = box_df[(box_df["detail_item"] == "Gross US & Canada") & (box_df['film_id'].isin(film_list))].reset_index()

grossUS_df.rename(columns={'detail_result': 'GrossUSCanada'}, inplace = True)
# 'index' is the old row index added by reset_index(); it is not needed after the merge.
grossUS_df = grossUS_df.drop(['detail_item','index'], axis=1)

# Right-merge so that films without this box-office row are kept; the raw text
# is left as is and missing values become the integer 0.
new_data = pd.merge(grossUS_df, new_data, how="right", on=["film_id"])
new_data['GrossUSCanada'] = np.where(new_data['GrossUSCanada'].isnull(), 0, new_data['GrossUSCanada'])
new_data.head()

#### Include box office - Budget details

In [None]:
# Select the "Budget" rows of the films being kept.
budget_df = box_df[(box_df["detail_item"] == "Budget") & (box_df['film_id'].isin(film_list))].reset_index()

budget_df.rename(columns={'detail_result': 'budget'}, inplace = True)
# 'index' is the old row index added by reset_index(); it is not needed after the merge.
budget_df = budget_df.drop(['detail_item','index'], axis=1)

# Right-merge so that films without a budget row are kept; the raw text is
# left as is and missing values become the integer 0.
new_data = pd.merge(budget_df, new_data, how="right", on=["film_id"])
new_data['budget'] = np.where(new_data['budget'].isnull(), 0, new_data['budget'])
new_data.head()

#### Include box office - Opening weekend US & Canada details

In [None]:
# Select the "Opening weekend US & Canada" rows of the films being kept.
openweek_df = box_df[(box_df["detail_item"] == "Opening weekend US & Canada") & (box_df['film_id'].isin(film_list))].reset_index()

# Even positions go to weekend_earning, odd positions to df_weekend.
# NOTE(review): this presumes each film contributes exactly two consecutive
# rows (amount first, then the weekend date) — verify against the scraped
# data; a film with a single row would shift every following pair.
weekend_earning = openweek_df.iloc[::2]
df_weekend = openweek_df.iloc[1::2]

weekend_earning.rename(columns={'detail_result': 'weekendEarning'}, inplace = True)
weekend_earning = weekend_earning.drop(['detail_item','index'], axis=1)

df_weekend.rename(columns={'detail_result': 'openingWeekend'}, inplace = True)
df_weekend = df_weekend.drop(['detail_item','index'], axis=1)

# Right-merge both parts so that films without opening-weekend rows are kept;
# a missing amount becomes the integer 0, a missing openingWeekend stays null.
new_data = pd.merge(weekend_earning, new_data, how="right", on=["film_id"])
new_data['weekendEarning'] = np.where(new_data['weekendEarning'].isnull(), 0, new_data['weekendEarning'])
new_data = pd.merge(df_weekend, new_data, how="right", on=["film_id"])

new_data.head()

#### Convert budget values to numeric

In [None]:
# Report, per raw money column, how many rows are missing and how many are not.
# Missing values were filled with the integer 0 by the merge cells, so a plain
# comparison with the string '0' never matches them; compare the string form
# of each value instead (0 -> '0', 0.0 -> '0.0').
print(len(new_data))
for column in ('budget', 'GrossUSCanada', 'GrossWorldwide', 'weekendEarning'):
    is_missing = new_data[column].astype(str).isin(['0', '0.0'])
    print(f"null {column}: ", int(is_missing.sum()))
    print(int((~is_missing).sum()))

In [None]:
from re import sub

# A thousands-grouped amount ("15,000,000") or a plain digit run ("1000000"),
# each with an optional decimal part. The grouped form is tried first so that
# "1,234" is not cut off at the comma.
_MONEY_PATTERN = re.compile(r'\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?')


def extract_money_value(text_data):
    """Return the first monetary amount found in *text_data* as a float.

    Currency symbols, thousands separators and trailing notes such as
    "(estimated)" are ignored. The literal '0' (the placeholder for a missing
    value) and text without any number both yield 0.0; in the latter case the
    offending text is printed so it can be inspected.

    Args:
        text_data: The raw amount text, already converted to ``str``.

    Returns:
        The amount as a float, or 0.0 when there is nothing to extract.
    """
    if text_data == '0':
        return float(0)
    match = _MONEY_PATTERN.search(text_data)
    if match is None:
        print(text_data)
        return float(0)
    return float(match.group(0).replace(',', ''))

In [None]:
# Parse the raw amount text into floats. The budget keeps its raw column and
# gets a separate `budget_est`; the other three columns are overwritten.
for source_column, target_column in (
    ('budget', 'budget_est'),
    ('GrossUSCanada', 'GrossUSCanada'),
    ('GrossWorldwide', 'GrossWorldwide'),
    ('weekendEarning', 'weekendEarning'),
):
    new_data[target_column] = new_data[source_column].progress_apply(
        lambda raw: extract_money_value(str(raw))
    )

In [None]:
# Same report as before the conversion, now on the numeric columns (0 marks a missing amount).
print(len(new_data))
for label, column in (
    ('budget', 'budget_est'),
    ('GrossUSCanada', 'GrossUSCanada'),
    ('GrossWorldwide', 'GrossWorldwide'),
    ('weekendEarning', 'weekendEarning'),
):
    print(f"null {label}: ", len(new_data[new_data[column] == 0]))
    print(len(new_data[new_data[column] != 0]))

#### Create Budget Label

In [None]:
# (exclusive upper bound, label) pairs in ascending order; amounts at or above
# the last bound fall into the open-ended '>200B' bucket.
_BUDGET_BINS = (
    (1_000_000, '< 1M'),
    (2_000_000, '1M - 2M'),
    (5_000_000, '2M - 5M'),
    (7_000_000, '5M - 7M'),
    (10_000_000, '7M - 10M'),
    (50_000_000, '10M - 50M'),
    (100_000_000, '50M - 100M'),
    (500_000_000, '100M - 500M'),
    (1_000_000_000, '500M - 1B'),
    (10_000_000_000, '1B - 10B'),
    (100_000_000_000, '10B - 100B'),
    (200_000_000_000, '100B - 200B'),
)


def budget_label(x):
    """Return the budget bucket label for the amount *x*.

    Buckets are half-open, ``lower <= x < upper``, starting at 1. Anything
    that is not at least 1 — zero (the missing-value marker), fractions,
    negative numbers and NaN — gets ``None`` instead of raising.

    Args:
        x: The numeric budget amount.

    Returns:
        The bucket label, or ``None`` when *x* is below 1 or not a number.
    """
    # `not x >= 1` (rather than `x < 1`) also rejects NaN.
    if not x >= 1:
        return None
    for upper_bound, label in _BUDGET_BINS:
        if x < upper_bound:
            return label
    return '>200B'

In [None]:
# Bucket every parsed budget amount.
new_data['budget_label'] = new_data['budget_est'].progress_apply(budget_label)

#### Split multiple CountryOfOrigin and explode 

In [None]:
# Missing countries become the 'None' placeholder so that every value can be split.
country_missing = new_data['countryOfOrigin'].isnull()
new_data['countryOfOrigin'] = np.where(country_missing, 'None', new_data['countryOfOrigin'])
# One trimmed list entry per country, then one row per entry.
new_data['coo_list'] = new_data['countryOfOrigin'].apply(
    lambda joined: [part.strip() for part in joined.split(",")]
)
new_data = new_data.explode('coo_list')

#### Split genre and explode

In [None]:
# Missing genres become the 'None' placeholder so that every value can be split.
genre_missing = new_data['genres'].isnull()
new_data['genres'] = np.where(genre_missing, 'None', new_data['genres'])
# One trimmed list entry per genre, then one row per entry.
new_data['genre_list'] = new_data['genres'].apply(
    lambda joined: [part.strip() for part in joined.split(",")]
)
new_data = new_data.explode('genre_list')

#### Create continent for CountryOfOrigin

In [None]:
# Fold both former German states into 'Germany'.
is_former_germany = new_data['coo_list'].isin(['West Germany', 'East Germany'])
new_data['coo_list'] = np.where(is_former_germany, 'Germany', new_data['coo_list'])

In [None]:
import pycountry_convert as pc

# Country names that are assigned to a continent by hand and never reach the
# pycountry_convert lookup.
country_list_europe = ['Czechoslovakia','Soviet Union','Yugoslavia','Netherlands Antilles','Federal Republic of Yugoslavia','Serbia and Montenegro',
                      'Kosovo','Vatican']
country_list_asia = ['Occupied Palestinian Territory','North Vietnam','Burma','Cocos Islands']
country_list_africa = ['Reunion','The Democratic Republic of Congo']
def extract_continent_from_country(x):
    """Return the continent name for the country name *x*.

    The hand-maintained lists are checked first, 'None' and 'Antarctica' are
    passed through unchanged, and everything else is resolved with
    pycountry_convert. A name that cannot be resolved is printed and mapped
    to the 'None' placeholder instead of failing on an unbound variable.
    """
    if x in country_list_europe:
        return 'Europe'
    if x in country_list_asia:
        return 'Asia'
    if x in country_list_africa:
        return 'Africa'
    if x in ('None', 'Antarctica'):
        return x
    try:
        country_code = pc.country_name_to_country_alpha2(x, cn_name_format="default")
        return pc.convert_continent_code_to_continent_name(pc.country_alpha2_to_continent_code(country_code))
    except (KeyError, TypeError, AttributeError):
        # Unknown country name or code: show it for manual follow-up.
        print(x)
        return 'None'

# Resolve the continent of every exploded country of origin.
new_data['coo_continent'] = new_data['coo_list'].progress_apply(extract_continent_from_country)

#### Binning data into decades

In [None]:
# Fill missing release years with startYear from title.basics.

new_data['releaseYear'] = np.where(new_data['releaseYear'].isna(), new_data['startYear'], new_data['releaseYear'])
# The column holds plain year numbers (possibly as text), not timestamps:
# pd.to_datetime would read a number such as 1999.0 as an offset from the
# epoch, so convert with to_numeric; unparseable values such as '\N' become NaN.
new_data["releaseYear"] = pd.to_numeric(new_data["releaseYear"], errors='coerce')
# Show the rows that still have no usable year.
new_data[new_data['releaseYear'].isna()]

In [None]:
# Ten-year bin edges starting one year before the earliest release year.
# np.arange excludes its stop value, so stop at max + 10: this guarantees the
# last edge is at or beyond the latest year and no film is left without a bin.
bins = np.arange(new_data.releaseYear.min()-1, new_data.releaseYear.max()+10, 10).tolist()
new_data['decades'] = pd.cut(x=new_data['releaseYear'], bins=bins)

#### Create film era bins

In [None]:
# Era labels, one per interval between the consecutive year edges passed to pd.cut below.
eras = ["Silent Era", "Transition to Sound", "Golden Age of Hollywood", "New Hollywood Era", "Blockbuster Era", "Streaming Era"]
# Release years outside the listed edges (1894 to 2024) get no era (NaN).
new_data['film_era'] = pd.cut(new_data['releaseYear'], [1894, 1920, 1930, 1960, 1980, 2000, 2024], labels=eras)

### Expand and refine Location file

#### Separate scene from locations

In [None]:
def _scene_from_location(raw):
    """Return the text between "(" and ")" in *raw*, or '\\N' when there is no ")"."""
    close_at = raw.find(")")
    if close_at == -1:
        return '\\N'
    return raw[raw.find("(") + 1 : close_at]


# The scene note is the parenthesised suffix; the location is what precedes the first "(".
locations_df['scene'] = locations_df['locations'].apply(_scene_from_location)
locations_df['locations'] = locations_df['locations'].apply(lambda raw: re.sub(r'\(.*', '', raw))
locations_df.head()

#### Create area, state, country, continent for locations

In [None]:

def get_continent_name(continent_code: str) -> str:
    """Translate a two-letter continent code into its English name.

    Raises:
        KeyError: If *continent_code* is not one of the seven known codes.
    """
    names_by_code = dict([
        ("NA", "North America"),
        ("SA", "South America"),
        ("AS", "Asia"),
        ("AF", "Africa"),
        ("OC", "Oceania"),
        ("EU", "Europe"),
        ("AQ", "Antarctica"),
    ])
    return names_by_code[continent_code]


In [None]:
from geopy.geocoders import Nominatim
from geopy.geocoders import Photon
from urllib.request import Request
from geopy.extra.rate_limiter import RateLimiter
import random

def get_random_user_agent():
    """Return one of two hard-coded User-Agent strings, chosen at random."""
    first_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    second_agent = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; MAARJS; rv:11.0) like Gecko'
    return random.choice([first_agent, second_agent])
    
# Upper bound on geocoding retries, so that a persistent failure (for example
# no network) ends the lookup instead of recursing without limit.
_MAX_GEOCODE_RETRIES = 5


def parse_country(x, recursion=0):
    """Geocode the location text *x* with Photon.

    A trailing 'USA' / 'UK' component is expanded to the full country name
    first. If the full text yields no result and it has more than two
    ', '-separated components, the last two components are tried on their own.
    A failing request is retried after a one-second pause, at most
    ``_MAX_GEOCODE_RETRIES`` times.

    Args:
        x: Location text with ', '-separated components, country last.
        recursion: Number of retries already made (used internally).

    Returns:
        ``(address, raw)`` of the geocoder result, or ``(None, None)`` when
        nothing was found or every retry failed.
    """
    parts = x.split(", ")
    if parts[-1] == 'USA':
        parts[-1] = 'United States'
    elif parts[-1] == 'UK':
        parts[-1] = 'United Kingdom'
    x = ', '.join(parts)
    geolocator = Photon(user_agent="measurements")
    try:
        location = geolocator.geocode(x, timeout=None)
        if (not location) and len(parts) > 2:
            x = ', '.join(parts[-2:])
            location = geolocator.geocode(x, timeout=None)
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts a long run.
        if recursion >= _MAX_GEOCODE_RETRIES:
            return None, None
        time.sleep(1)  # wait a bit before retrying
        return parse_country(x, recursion + 1)
    if location:
        return location.address, location.raw
    return None, None

# Countries whose continent is assigned by hand, without calling the geocoder.
_FIXED_CONTINENTS = {'Fiji': 'Oceania', 'Tokelau': 'Oceania', 'East Timor': 'Asia'}


def get_country(x):
    """Resolve the location text *x* into address parts and a continent.

    Args:
        x: Location text with ', '-separated components, country last.

    Returns:
        A 7-tuple ``(address, area, state, country, latitude, longitude,
        continent)``. Parts that cannot be determined are the string 'None';
        the continent may also be ``None`` when both continent lookups fail.
    """
    parts = x.split(", ")
    fixed_continent = _FIXED_CONTINENTS.get(parts[-1])
    if fixed_continent is not None:
        # No geocoding here: the component before the country fills the state slot.
        state = parts[-2] if len(parts) > 1 else 'None'
        return 'None', 'None', state, parts[-1], 'None', 'None', fixed_continent

    address, raw = parse_country(x)
    if not address:
        return 'None', 'None', 'None', 'None', 'None', 'None', 'None'

    prop = raw['properties']
    geometry = raw['geometry']
    # Prefer the city, fall back to the county.
    city = prop.get('city', prop.get('county', 'None'))
    state = prop.get('state', 'None')
    country = prop.get('country', 'None')
    if 'coordinates' in geometry:
        # GeoJSON positions are ordered [longitude, latitude].
        # NOTE(review): a locations file generated before this fix may carry
        # the two values swapped — regenerate it or swap the columns.
        long, lat = geometry['coordinates'][0], geometry['coordinates'][1]
    else:
        lat = 'None'
        long = 'None'
    country_code = prop.get('countrycode')

    if country != 'None':
        continent = get_Continent(country)
        # Fall back to the ISO country code when the name lookup finds nothing.
        if (not continent) and (country_code):
            continent = get_continent_from_code(country_code)
    else:
        continent = 'None'
    return address, city, state, country, lat, long, continent

In [None]:
import pycountry_convert
import pycountry
from functools import partial
import re

def get_continent_from_code(x):
    """Return the continent name for the two-letter country code *x*.

    'VA' and 'PN' are mapped by hand. Every other code is converted to a
    continent code with pycountry_convert and then to a name with
    get_continent_name, so the result has the same form as the value
    returned by get_Continent.

    Returns:
        The continent name, or ``None`` when *x* is ``None`` or the code
        cannot be resolved.
    """
    if x is None:
        return None
    if x == 'VA':
        return 'Europe'
    if x == 'PN':
        return 'Oceania'
    try:
        continent_code = pycountry_convert.country_alpha2_to_continent_code(x)
        return get_continent_name(continent_code)
    except KeyError:
        # Unknown country code, or a continent code without a name entry.
        return None

    
def get_Continent(x):
    """Return the continent name for the country name *x*.

    The name is geocoded with Nominatim (English results); the last
    ', '-separated component of the result is looked up in pycountry and its
    alpha-2 code is mapped to a continent. A failing request is retried
    after a one-second pause, up to five attempts in total.

    Returns:
        The continent name, or ``None`` when *x* is the '\\N' placeholder,
        geocoding keeps failing, or the country / continent is unknown.
    """
    if x == '\\N':
        return None

    geolocator = Nominatim(user_agent='https')
    max_attempts = 5
    for _ in range(max_attempts):
        try:
            # The network call is what can fail, so it is what gets retried.
            location = geolocator.geocode(x, language="en", timeout=None)
            break
        except Exception:
            time.sleep(1)  # wait a bit before retrying
    else:
        return None

    # str(None) is 'None', which matches no country below and yields None.
    parts = str(location).split(",")
    xname = re.sub(r'^ ', '', parts[-1]) if len(parts) > 1 else parts[0]
    country = pycountry.countries.get(name=xname)
    if country is None:
        return None
    try:
        continent_code = pycountry_convert.country_alpha2_to_continent_code(country.alpha_2)
        return get_continent_name(continent_code)
    except KeyError:
        # No continent for this country code: let the caller fall back.
        return None

In [None]:
# Uncomment the line below to regenerate the expanded locations; it takes a long time to run.
## A fully converted file is already available and is loaded from ProcessedDataFiles further down.
#locations_df[['address','area','state','country','lattitude','longitude','continent']] = locations_df.progress_apply(lambda x: get_country(x.locations), axis=1).to_list()

#### Write to file - expanded location information

In [None]:
# Remove the comment before executing - 
#locations_df.to_csv(os.path.join(os.path.dirname(__name__), "ProcessedDataFiles/film_loc_with_continent.tsv"), sep='\t', index=False)

# Load the previously generated, geocoded locations file.
filename = os.path.join(os.path.dirname(__name__), "ProcessedDataFiles/film_loc_with_continent.tsv")
expand_locations_df = pd.read_csv(filename, sep='\t', low_memory=False)
expand_locations_df.head(2)

### Merge locations into the base feature file

In [None]:
# One row per distinct (film, filming country, continent). drop_duplicates
# returns a new frame, so the column slice of expand_locations_df is never
# modified in place.
locations_data = expand_locations_df[['film_id','country','continent']].drop_duplicates()
# Left-merge: every feature row is kept and repeated once per filming location.
new_data = pd.merge(new_data, locations_data, how="left", on=["film_id"])
new_data.head()

### Write to base feature file

In [None]:
# Write the assembled feature table as a tab-separated file, without the index column.
new_data.to_csv(os.path.join(os.path.dirname(__name__), "ProcessedDataFiles/film_exploration_sample.tsv"), sep='\t', index=False)