## Importing the data

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import ast

# Columns needed for the analysis; every other CSV column is ignored at load time.
selected_columns = ['id', 'original_title', 'original_language', 'genres', 'production_companies',
                    'production_countries', 'runtime', 'vote_average']

# File rows excluded from parsing (presumably malformed records — TODO confirm against the raw CSV).
rows_to_skip = [19730, 19731, 29503, 29504, 35587, 35588]

data = pd.read_csv('movies_metadata.csv', skiprows=rows_to_skip, usecols=selected_columns)

## Preprocessing the data

In [12]:
# Keep a single row per movie id (the first occurrence wins) and renumber the index from 0.
data = data.drop_duplicates(subset=['id'], keep='first', ignore_index=True)

# Preview the first ten rows.
data.iloc[:10]

Unnamed: 0,genres,id,original_language,original_title,production_companies,production_countries,runtime,vote_average
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,Toy Story,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",81.0,7.7
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,Jumanji,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",104.0,6.9
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,Grumpier Old Men,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",101.0,6.5
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,Waiting to Exhale,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",127.0,6.1
4,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Father of the Bride Part II,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",106.0,5.7
5,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",949,en,Heat,"[{'name': 'Regency Enterprises', 'id': 508}, {...","[{'iso_3166_1': 'US', 'name': 'United States o...",170.0,7.7
6,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",11860,en,Sabrina,"[{'name': 'Paramount Pictures', 'id': 4}, {'na...","[{'iso_3166_1': 'DE', 'name': 'Germany'}, {'is...",127.0,6.2
7,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",45325,en,Tom and Huck,"[{'name': 'Walt Disney Pictures', 'id': 2}]","[{'iso_3166_1': 'US', 'name': 'United States o...",97.0,5.4
8,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",9091,en,Sudden Death,"[{'name': 'Universal Pictures', 'id': 33}, {'n...","[{'iso_3166_1': 'US', 'name': 'United States o...",106.0,5.5
9,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",710,en,GoldenEye,"[{'name': 'United Artists', 'id': 60}, {'name'...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",130.0,6.6


In [13]:
def _extract_values(raw, key):
    """Return the `key` field of every dict in a stringified list of dicts.

    Parameters
    ----------
    raw : str, list or other
        Cell value. A string is parsed with `ast.literal_eval` and is expected
        to hold a list of dicts. A list is assumed to be already extracted
        (e.g. this cell was re-run) and is returned unchanged. Any other value
        (such as NaN/None for a missing field) is treated as "no entries".
    key : str
        Dictionary key to pull out of each entry.

    Returns
    -------
    list
        The extracted values, in their original order.

    Raises
    ------
    ValueError, SyntaxError
        If `raw` is a string that is not a valid Python literal.
    """
    if isinstance(raw, list):
        # Already converted; parsing again would fail, so keep the cell re-runnable.
        return raw
    if not isinstance(raw, str):
        # Missing value: literal_eval only accepts strings, so fall back to an empty list.
        return []
    return [entry[key] for entry in ast.literal_eval(raw)]


# Column name -> key to keep from each of that column's dict entries.
for column, key in [('genres', 'name'),
                    ('production_companies', 'name'),
                    ('production_countries', 'iso_3166_1')]:
    data[column] = data[column].apply(_extract_values, args=(key,))

data.head(10)

Unnamed: 0,genres,id,original_language,original_title,production_companies,production_countries,runtime,vote_average
0,"[Animation, Comedy, Family]",862,en,Toy Story,[Pixar Animation Studios],[US],81.0,7.7
1,"[Adventure, Fantasy, Family]",8844,en,Jumanji,"[TriStar Pictures, Teitler Film, Interscope Co...",[US],104.0,6.9
2,"[Romance, Comedy]",15602,en,Grumpier Old Men,"[Warner Bros., Lancaster Gate]",[US],101.0,6.5
3,"[Comedy, Drama, Romance]",31357,en,Waiting to Exhale,[Twentieth Century Fox Film Corporation],[US],127.0,6.1
4,[Comedy],11862,en,Father of the Bride Part II,"[Sandollar Productions, Touchstone Pictures]",[US],106.0,5.7
5,"[Action, Crime, Drama, Thriller]",949,en,Heat,"[Regency Enterprises, Forward Pass, Warner Bros.]",[US],170.0,7.7
6,"[Comedy, Romance]",11860,en,Sabrina,"[Paramount Pictures, Scott Rudin Productions, ...","[DE, US]",127.0,6.2
7,"[Action, Adventure, Drama, Family]",45325,en,Tom and Huck,[Walt Disney Pictures],[US],97.0,5.4
8,"[Action, Adventure, Thriller]",9091,en,Sudden Death,"[Universal Pictures, Imperial Entertainment, S...",[US],106.0,5.5
9,"[Adventure, Action, Thriller]",710,en,GoldenEye,"[United Artists, Eon Productions]","[GB, US]",130.0,6.6


## Filtering out rare feature values to reduce the overall number of features
First, count how often each genre, production company and production country occurs, and how many distinct values of each there are.

In [14]:
def _count_values(list_column):
    """Tally every occurrence of each value in a column whose cells are lists.

    Parameters
    ----------
    list_column : iterable of lists
        For example `data['genres']`, where each cell is a list of names.

    Returns
    -------
    dict
        Maps each distinct value to its number of occurrences. Keys appear in
        the order in which the values are first encountered.
    """
    counts = {}
    # Iterate the column directly: iterrows() would build a full Series for
    # every row even though only one column is needed here.
    for values in list_column:
        for value in values:
            counts[value] = counts.get(value, 0) + 1
    return counts


genres_dict = _count_values(data['genres'])
companies_dict = _count_values(data['production_companies'])
countries_dict = _count_values(data['production_countries'])

In [15]:
# Number of distinct genres, production companies and production countries, in that order.
for counts in (genres_dict, companies_dict, countries_dict):
    print(len(counts))

20
23537
161


In [16]:
# Show only the first entries (in insertion order): the full mapping has tens of
# thousands of companies and would flood the notebook output.
PREVIEW_SIZE = 35
dict(list(companies_dict.items())[:PREVIEW_SIZE])

{'Pixar Animation Studios': 52,
 'TriStar Pictures': 197,
 'Teitler Film': 2,
 'Interscope Communications': 36,
 'Warner Bros.': 1250,
 'Lancaster Gate': 2,
 'Twentieth Century Fox Film Corporation': 836,
 'Sandollar Productions': 11,
 'Touchstone Pictures': 225,
 'Regency Enterprises': 106,
 'Forward Pass': 9,
 'Paramount Pictures': 1001,
 'Scott Rudin Productions': 48,
 'Mirage Enterprises': 24,
 'Constellation Entertainment': 3,
 'Worldwide': 2,
 'Mont Blanc Entertainment GmbH': 1,
 'Walt Disney Pictures': 263,
 'Universal Pictures': 830,
 'Imperial Entertainment': 7,
 'Signature Entertainment': 8,
 'United Artists': 279,
 'Eon Productions': 23,
 'Columbia Pictures': 431,
 'Castle Rock Entertainment': 78,
 'Enigma Pictures': 4,
 'Amblin Entertainment': 76,
 'Amblimation': 3,
 'Hollywood Pictures': 84,
 'Cinergi Pictures Entertainment': 16,
 'Le Studio Canal+': 12,
 'Laurence Mark Productions': 8,
 'Metro-Goldwyn-Mayer (MGM)': 1074,
 'Carolco Pictures': 41,
 'Légende Entreprises': 2,

# Keeping only the 100 most frequent companies

In [17]:
# Rank the company names by their occurrence count, most frequent first. The sort is
# stable, so companies with equal counts keep their original dict order.
ranked_companies = sorted(companies_dict, key=companies_dict.get, reverse=True)

# NOTE: from here on `companies_dict` is a plain list holding the 100 top-ranked names.
companies_dict = ranked_companies[:100]
companies_dict

['Warner Bros.',
 'Metro-Goldwyn-Mayer (MGM)',
 'Paramount Pictures',
 'Twentieth Century Fox Film Corporation',
 'Universal Pictures',
 'Columbia Pictures Corporation',
 'Canal+',
 'Columbia Pictures',
 'RKO Radio Pictures',
 'United Artists',
 'New Line Cinema',
 'Walt Disney Pictures',
 'Touchstone Pictures',
 'TriStar Pictures',
 'Mosfilm',
 'Miramax Films',
 'France 2 Cinéma',
 'Centre National de la Cinématographie (CNC)',
 'Toho Company',
 'BBC Films',
 'Gaumont',
 'StudioCanal',
 'Relativity Media',
 'British Broadcasting Corporation (BBC)',
 'TF1 Films Production',
 'Walt Disney Productions',
 'Orion Pictures',
 'Regency Enterprises',
 'Village Roadshow Pictures',
 'Lionsgate',
 'Universal International Pictures (UI)',
 'Dimension Films',
 'Zweites Deutsches Fernsehen (ZDF)',
 'DreamWorks SKG',
 'Wild Bunch',
 'Fox Searchlight Pictures',
 'Hammer Film Productions',
 'American International Pictures (AIP)',
 'Rai Cinema',
 'Working Title Films',
 'Svensk Filmindustri (SF)',
 'H

In [18]:
# Build the lookup once, outside the per-row function: set membership is constant-time
# on average, whereas testing against the list rescans it for every company.
kept_companies = set(companies_dict)

# Drop every production company that is not among the selected ones; a row whose
# companies are all dropped ends up with an empty list.
data['production_companies'] = data['production_companies'].apply(
    lambda companies_list: [company for company in companies_list if company in kept_companies])

In [19]:
# Inspect the first 1000 rows after filtering the production companies.
data.iloc[:1000]

Unnamed: 0,genres,id,original_language,original_title,production_companies,production_countries,runtime,vote_average
0,"[Animation, Comedy, Family]",862,en,Toy Story,[Pixar Animation Studios],[US],81.0,7.7
1,"[Adventure, Fantasy, Family]",8844,en,Jumanji,[TriStar Pictures],[US],104.0,6.9
2,"[Romance, Comedy]",15602,en,Grumpier Old Men,[Warner Bros.],[US],101.0,6.5
3,"[Comedy, Drama, Romance]",31357,en,Waiting to Exhale,[Twentieth Century Fox Film Corporation],[US],127.0,6.1
4,[Comedy],11862,en,Father of the Bride Part II,[Touchstone Pictures],[US],106.0,5.7
...,...,...,...,...,...,...,...,...
995,"[Animation, Family, Music]",15947,en,The Three Caballeros,[Walt Disney Pictures],[US],71.0,6.2
996,"[Animation, Family]",9078,en,The Sword in the Stone,[Walt Disney Productions],[US],79.0,6.9
997,"[Animation, Drama, Family]",29682,en,So Dear to My Heart,[Walt Disney Productions],[US],79.0,6.3
998,[Adventure],8367,en,Robin Hood: Prince of Thieves,"[Warner Bros., Morgan Creek Productions]",[US],143.0,6.6
