In [1]:
import pandas as pd
import re
import seaborn as sb
import matplotlib.pyplot as plt
import datetime
import json

In [2]:
df_movies = pd.read_csv("the-movies-dataset/movies_metadata.csv")
#df_movies.head(3)
# filter movies on Status == 'Released' and drop status column
df_movies = df_movies[df_movies["status"] == 'Released']
df_movies.drop(columns=["status"])
len(df_movies)
df_movies = df_movies[0:100]
len(df_movies)

100

In [3]:
# one hot encoding of genres (and other list columns)
# one hot encoding of genres: (takes ~3-4 minutes)
# 1. get genres
genre_set = {"Action","Drama","Comedy","Thriller","Animation"}

for g in genre_set:
    df_movies[g] = 0

# 2. add genres as columns to dataframe, default value = 0
df_movie_genres = pd.DataFrame(columns=genre_set)
for index, row in df_movies.iterrows():
    temp = pd.Series(index=genre_set)
    tmp_genres = []  
    genres = json.loads(re.sub("'", '"',row.genres))
    for genre in genres:
        if genre.get("name") in list(genre_set):
            tmp_genres.append(genre.get("name"))
    for genre in tmp_genres: # turn string to json and loop over genres
        #print(str(index) + ": " + str(genre))
        temp[genre] = (1/len(tmp_genres))
    df_movie_genres = df_movie_genres.append(temp, ignore_index=True)
#print("One hot encoding finished")

# 3. add one hot encoded genres to df_movies        
df_movies[list(genre_set)] = df_movie_genres

# 4. turn NaN values to zeros
df_movies = df_movies.fillna(0)

df_movies.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,Drama,Comedy,Thriller,Action,Animation
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,0,Toy Story,False,7.7,5415,0.0,0.5,0.0,0.0,0.5


In [4]:
df_movies = df_movies.drop(columns=['genres','imdb_id','original_title','overview','popularity','poster_path','release_date','revenue','status','tagline','title','video','vote_count','vote_average'])
df_movies.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,original_language,production_companies,production_countries,runtime,spoken_languages,Drama,Comedy,Thriller,Action,Animation
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,en,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",0.0,0.5,0.0,0.0,0.5
1,False,0,65000000,0,8844,en,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.0,0.0,0.0,0.0,0.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,0,15602,en,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",0.0,1.0,0.0,0.0,0.0


In [5]:
# production companies, production countries, spoken_languages --> hot encoding?

In [6]:
# durchschnitt von predictions berechnen
df_ratings = pd.read_csv("the-movies-dataset/ratings.csv")
df_ratings = df_ratings[0:100000]
df_ratings_grouped = df_ratings.groupby('movieId')[['rating']].mean()
df_joined = pd.merge(left=df_movies,right=df_ratings_grouped, left_on='id', right_on='movieId')
df_joined.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,original_language,production_companies,production_countries,runtime,spoken_languages,Drama,Comedy,Thriller,Action,Animation,rating
0,False,0,60000000,0,949,en,"[{'name': 'Regency Enterprises', 'id': 508}, {...","[{'iso_3166_1': 'US', 'name': 'United States o...",170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.333333,0.0,0.333333,0.333333,0.0,3.916667
1,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,http://www.mgm.com/view/movie/757/Goldeneye/,710,en,"[{'name': 'United Artists', 'id': 60}, {'name'...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.0,0.0,0.5,0.5,0.0,3.0
2,False,0,98000000,0,1408,en,"[{'name': 'Le Studio Canal+', 'id': 183}, {'na...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.0,0.0,0.0,1.0,0.0,3.763636


In [7]:
# cast und crew integrieren
df_credits = pd.read_csv("the-movies-dataset/credits.csv")
df_credits.head(5)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [8]:
pattern = '\"Director\", \"name\": \"([a-zA-Z ]*)\", \"'
directors = set()
df_credits['director'] = ""
for index, row in df_credits.iterrows():
    text = row.crew
    text = text.replace('\\','',-1)
    text = text.replace("'",'"',-1)
    a = re.search(pattern, text)
    if a != None:
        directors.add(a.group(1))
        df_credits.set_value(index, 'director', a.group(1))
print(len(directors))
#print(directors)
df_credits.head(10)

  # This is added back by InteractiveShellApp.init_path()


14945


Unnamed: 0,cast,crew,id,director
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,John Lasseter
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,Joe Johnston
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,Howard Deutch
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,Forest Whitaker
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,Charles Shyer
5,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de...",949,Michael Mann
6,"[{'cast_id': 1, 'character': 'Linus Larrabee',...","[{'credit_id': '52fe44959251416c75039da9', 'de...",11860,Sydney Pollack
7,"[{'cast_id': 2, 'character': 'Tom Sawyer', 'cr...","[{'credit_id': '52fe46bdc3a36847f810f797', 'de...",45325,Peter Hewitt
8,"[{'cast_id': 1, 'character': 'Darren Francis T...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'de...",9091,Peter Hyams
9,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de...",710,Martin Campbell


In [9]:
import json
pattern = '\"name\": \"(.*)\", '
df_credits['actors'] = ""
for index, row in df_credits.iterrows():
    actors = []
    text = row.cast
    text = text.replace('\\','',-1)
    text = text.replace("'",'"',-1)
    liste = text.split("cast_id")
    for i in liste:
        tmp_name = re.search(pattern, i)
        if tmp_name != None:
            actors.append(tmp_name.group(1))
    df_credits.set_value(index, 'actors', ','.join(map(str, actors)))
df_credits.head(10)

  


Unnamed: 0,cast,crew,id,director,actors
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,John Lasseter,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal..."
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra..."
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,Howard Deutch,"Walter Matthau,Jack Lemmon,Ann-Margret,Sophia ..."
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,Forest Whitaker,"Whitney Houston,Angela Bassett,Loretta Devine,..."
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,Charles Shyer,"Steve Martin,Diane Keaton,Martin Short,Kimberl..."
5,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de...",949,Michael Mann,"Al Pacino,Robert De Niro,Val Kilmer,Jon Voight..."
6,"[{'cast_id': 1, 'character': 'Linus Larrabee',...","[{'credit_id': '52fe44959251416c75039da9', 'de...",11860,Sydney Pollack,"Harrison Ford,Julia Ormond,Greg Kinnear,Angie ..."
7,"[{'cast_id': 2, 'character': 'Tom Sawyer', 'cr...","[{'credit_id': '52fe46bdc3a36847f810f797', 'de...",45325,Peter Hewitt,"Jonathan Taylor Thomas,Brad Renfro,Rachael Lei..."
8,"[{'cast_id': 1, 'character': 'Darren Francis T...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'de...",9091,Peter Hyams,"Jean-Claude Van Damme,Powers Boothe,Dorian Har..."
9,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de...",710,Martin Campbell,"Pierce Brosnan,Sean Bean,Izabella Scorupco,Fam..."


In [10]:
df_credits = df_credits.drop(columns=['cast','crew'])
df_joined = pd.merge(left=df_joined,right=df_credits, left_on='id', right_on='id')
df_joined.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,original_language,production_companies,production_countries,runtime,spoken_languages,Drama,Comedy,Thriller,Action,Animation,rating,director,actors
0,False,0,60000000,0,949,en,"[{'name': 'Regency Enterprises', 'id': 508}, {...","[{'iso_3166_1': 'US', 'name': 'United States o...",170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.333333,0.0,0.333333,0.333333,0.0,3.916667,Michael Mann,"Al Pacino,Robert De Niro,Val Kilmer,Jon Voight..."
1,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,http://www.mgm.com/view/movie/757/Goldeneye/,710,en,"[{'name': 'United Artists', 'id': 60}, {'name'...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.0,0.0,0.5,0.5,0.0,3.0,Martin Campbell,"Pierce Brosnan,Sean Bean,Izabella Scorupco,Fam..."
2,False,0,98000000,0,1408,en,"[{'name': 'Le Studio Canal+', 'id': 183}, {'na...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.0,0.0,0.0,1.0,0.0,3.763636,Renny Harlin,"Geena Davis,Matthew Modine,Frank Langella,Maur..."
3,False,0,52000000,0,524,en,"[{'name': 'Universal Pictures', 'id': 33}, {'n...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",178.0,"[{'iso_639_1': 'en', 'name': 'English'}]",1.0,0.0,0.0,0.0,0.0,3.121951,Martin Scorsese,"Robert De Niro,Sharon Stone,Joe Pesci,James Wo..."
4,False,0,4000000,0,5,en,"[{'name': 'Miramax Films', 'id': 14}, {'name':...","[{'iso_3166_1': 'US', 'name': 'United States o...",98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",0.0,1.0,0.0,0.0,0.0,3.076271,Allison Anders,"Tim Roth,Antonio Banderas,Jennifer Beals,Madon..."


In [11]:
# belongs to, homepage --> binär
df_joined['part_of_collection'] = ""
df_joined['+18'] = ""
df_joined['hasHomepage'] = ""
for index, row in df_joined.iterrows():
    if row.belongs_to_collection == "0":
        df_joined.set_value(index, 'part_of_collection', 0)
    else:
        df_joined.set_value(index, 'part_of_collection', 1)
    
    if row.adult == "True":
        df_joined.set_value(index, '+18', 1)
    else:
        df_joined.set_value(index, '+18', 0)
    
    if row.homepage == "[]":
        df_joined.set_value(index, 'hasHomepage', 0)
    else:
        df_joined.set_value(index, 'hasHomepage', 1)
df_joined = df_joined.drop(columns=['belongs_to_collection','adult','homepage'])
df_joined.head(5)

  if __name__ == '__main__':
  


Unnamed: 0,budget,id,original_language,production_companies,production_countries,runtime,spoken_languages,Drama,Comedy,Thriller,Action,Animation,rating,director,actors,part_of_collection,+18,hasHomepage
0,60000000,949,en,"[{'name': 'Regency Enterprises', 'id': 508}, {...","[{'iso_3166_1': 'US', 'name': 'United States o...",170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.333333,0.0,0.333333,0.333333,0.0,3.916667,Michael Mann,"Al Pacino,Robert De Niro,Val Kilmer,Jon Voight...",1,0,1
1,58000000,710,en,"[{'name': 'United Artists', 'id': 60}, {'name'...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.0,0.0,0.5,0.5,0.0,3.0,Martin Campbell,"Pierce Brosnan,Sean Bean,Izabella Scorupco,Fam...",1,0,1
2,98000000,1408,en,"[{'name': 'Le Studio Canal+', 'id': 183}, {'na...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.0,0.0,0.0,1.0,0.0,3.763636,Renny Harlin,"Geena Davis,Matthew Modine,Frank Langella,Maur...",1,0,1
3,52000000,524,en,"[{'name': 'Universal Pictures', 'id': 33}, {'n...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",178.0,"[{'iso_639_1': 'en', 'name': 'English'}]",1.0,0.0,0.0,0.0,0.0,3.121951,Martin Scorsese,"Robert De Niro,Sharon Stone,Joe Pesci,James Wo...",1,0,1
4,4000000,5,en,"[{'name': 'Miramax Films', 'id': 14}, {'name':...","[{'iso_3166_1': 'US', 'name': 'United States o...",98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",0.0,1.0,0.0,0.0,0.0,3.076271,Allison Anders,"Tim Roth,Antonio Banderas,Jennifer Beals,Madon...",1,0,1


In [19]:
import json
pattern1 = "{'name': '([a-zA-Z ]*)'"
pattern2 = "'name': '([a-zA-Z ]*)'"

df_joined['productionCompanies'] = ""
df_joined['productionCountries'] = ""
df_joined['spokenLanguages'] = ""
for index, row in df_joined.iterrows():
    companies = []
    countries = []
    lang = []
    
    text1 = row.production_companies
    liste1 = text1.split("}")
    for i in liste1:
        tmp_name = re.search(pattern1, i)
        if tmp_name != None:
            companies.append(tmp_name.group(1))
    df_joined.set_value(index, 'productionCompanies', ', '.join(map(str, companies)))
    
    text2 = row.production_countries
    liste2 = text2.split("}")
    for i in liste2:
        tmp_name = re.search(pattern2, i)
        if tmp_name != None:
            countries.append(tmp_name.group(1))
    df_joined.set_value(index, 'productionCountries', ', '.join(map(str, countries)))
    
    text3 = row.spoken_languages
    liste3 = text3.split("}")
    for i in liste3:
        tmp_name = re.search(pattern2, i)
        if tmp_name != None:
            lang.append(tmp_name.group(1))
    df_joined.set_value(index, 'spokenLanguages', ', '.join(map(str, lang)))
    
    
df_joined.head(5)



Unnamed: 0,budget,id,original_language,production_companies,production_countries,runtime,spoken_languages,Drama,Comedy,Thriller,...,Animation,rating,director,actors,part_of_collection,+18,hasHomepage,productionCompanies,productionCountries,spokenLanguages
0,60000000,949,en,"[{'name': 'Regency Enterprises', 'id': 508}, {...","[{'iso_3166_1': 'US', 'name': 'United States o...",170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.333333,0.0,0.333333,...,0.0,3.916667,Michael Mann,"Al Pacino,Robert De Niro,Val Kilmer,Jon Voight...",1,0,1,"Regency Enterprises, Forward Pass",United States of America,English
1,58000000,710,en,"[{'name': 'United Artists', 'id': 60}, {'name'...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.0,0.0,0.5,...,0.0,3.0,Martin Campbell,"Pierce Brosnan,Sean Bean,Izabella Scorupco,Fam...",1,0,1,"United Artists, Eon Productions","United Kingdom, United States of America",English
2,98000000,1408,en,"[{'name': 'Le Studio Canal+', 'id': 183}, {'na...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",0.0,0.0,0.0,...,0.0,3.763636,Renny Harlin,"Geena Davis,Matthew Modine,Frank Langella,Maur...",1,0,1,"Laurence Mark Productions, Carolco Pictures","France, Germany, Italy, United States of America","English, Latin"
3,52000000,524,en,"[{'name': 'Universal Pictures', 'id': 33}, {'n...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",178.0,"[{'iso_639_1': 'en', 'name': 'English'}]",1.0,0.0,0.0,...,0.0,3.121951,Martin Scorsese,"Robert De Niro,Sharon Stone,Joe Pesci,James Wo...",1,0,1,"Universal Pictures, Syalis DA","France, United States of America",English
4,4000000,5,en,"[{'name': 'Miramax Films', 'id': 14}, {'name':...","[{'iso_3166_1': 'US', 'name': 'United States o...",98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",0.0,1.0,0.0,...,0.0,3.076271,Allison Anders,"Tim Roth,Antonio Banderas,Jennifer Beals,Madon...",1,0,1,"Miramax Films, A Band Apart",United States of America,English


In [20]:
df_joined = df_joined.drop(columns=['production_companies','production_countries','spoken_languages'])
df_joined.head(5)

Unnamed: 0,budget,id,original_language,runtime,Drama,Comedy,Thriller,Action,Animation,rating,director,actors,part_of_collection,+18,hasHomepage,productionCompanies,productionCountries,spokenLanguages
0,60000000,949,en,170.0,0.333333,0.0,0.333333,0.333333,0.0,3.916667,Michael Mann,"Al Pacino,Robert De Niro,Val Kilmer,Jon Voight...",1,0,1,"Regency Enterprises, Forward Pass",United States of America,English
1,58000000,710,en,130.0,0.0,0.0,0.5,0.5,0.0,3.0,Martin Campbell,"Pierce Brosnan,Sean Bean,Izabella Scorupco,Fam...",1,0,1,"United Artists, Eon Productions","United Kingdom, United States of America",English
2,98000000,1408,en,119.0,0.0,0.0,0.0,1.0,0.0,3.763636,Renny Harlin,"Geena Davis,Matthew Modine,Frank Langella,Maur...",1,0,1,"Laurence Mark Productions, Carolco Pictures","France, Germany, Italy, United States of America","English, Latin"
3,52000000,524,en,178.0,1.0,0.0,0.0,0.0,0.0,3.121951,Martin Scorsese,"Robert De Niro,Sharon Stone,Joe Pesci,James Wo...",1,0,1,"Universal Pictures, Syalis DA","France, United States of America",English
4,4000000,5,en,98.0,0.0,1.0,0.0,0.0,0.0,3.076271,Allison Anders,"Tim Roth,Antonio Banderas,Jennifer Beals,Madon...",1,0,1,"Miramax Films, A Band Apart",United States of America,English
