In [10]:
import pandas as pd
import re
import seaborn as sb
import matplotlib.pyplot as plt
import datetime
import json
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [11]:
df_movies = pd.read_csv("the-movies-dataset/movies_metadata.csv")
print("Length at import: " + str(len(df_movies)))
#df_movies.head(3)
# filter movies on Status == 'Released' and drop status column
df_movies = df_movies[df_movies["status"] == 'Released']
df_movies.drop(columns=["status"])
print("Length only released: " + str(len(df_movies)))
#df_movies = df_movies[0:1000]
print("Length: " + str(len(df_movies)))
df_movies_raw = df_movies.copy()

Length at import: 45463
Length only released: 45017
Length: 45017


In [36]:
genres_stat = {}
for index, row in df_movies.iterrows():
    genres = json.loads(re.sub("'", '"',row.genres))
    for genre in genres: # turn string to json and loop over genres
        genres_stat.update({genre.get("name") : 0})

#Get the movie count per genre
movies = {}
for index, row in df_movies.iterrows():
    tmp_genres = []  
    genres = json.loads(re.sub("'", '"',row.genres))
    for genre in genres:
        tmp_genres.append(genre.get("name"))
        new_value = genres_stat.get(genre.get("name")) + 1
        genres_stat.update({genre.get("name") : new_value})
    movies.update({row.id:{'title': row.title, 'genres': tmp_genres}})
print(len(movies))

for key, value in sorted(genres_stat.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

AttributeError: 'Series' object has no attribute 'genres'

In [12]:
# one hot encoding of genres (and other list columns)
# one hot encoding of genres: (takes ~3-4 minutes)
# 1. get genres
#genre_set = {"Action","Drama","Comedy","Thriller","Animation"}

genre_set = {""}
for index, row in df_movies.iterrows():
    genres = json.loads(re.sub("'", '"',row.genres))
    for genre in genres: # turn string to json and loop over genres
        genre_set.add(genre.get("name"))       
genre_set.remove("")

for g in genre_set:
    df_movies[g] = 0

# 2. add genres as columns to dataframe, default value = 0
df_movie_genres = pd.DataFrame(columns=genre_set)
for index, row in df_movies.iterrows():
    temp = pd.Series(index=genre_set)
    tmp_genres = []  
    genres = json.loads(re.sub("'", '"',row.genres))
    for genre in genres:
        if genre.get("name") in list(genre_set):
            tmp_genres.append(genre.get("name"))
    for genre in tmp_genres: # turn string to json and loop over genres
        #temp[genre] = (1/len(tmp_genres))
        temp[genre] = 1
    df_movie_genres = df_movie_genres.append(temp, ignore_index=True)
#print("One hot encoding finished")

# 3. add one hot encoded genres to df_movies        
df_movies[list(genre_set)] = df_movie_genres

# 4. turn NaN values to zeros
df_movies = df_movies.fillna(0)

df_movies.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,TV Movie,Mystery,Animation,Foreign,Adventure,Romance,Family,Horror,Crime,Action
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [13]:
#df_movies = df_movies.drop(columns=['genres','imdb_id','original_title','overview','popularity','poster_path','release_date','revenue','status','tagline','title','video','vote_count','vote_average'])
df_movies = df_movies.drop(columns=['genres','original_title','overview','popularity','poster_path','release_date','revenue','status','tagline','title','video','vote_average'])
df_movies.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,production_companies,production_countries,runtime,...,TV Movie,Mystery,Animation,Foreign,Adventure,Romance,Family,Horror,Crime,Action
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",81.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,False,0,65000000,0,8844,tt0113497,en,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",104.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,0,15602,tt0113228,en,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",101.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [30]:
# durchschnitt von predictions berechnen
df_links = pd.read_csv("the-movies-dataset/links.csv")
print("Length at import (Links): " + str(len(df_links)))

df_ids = pd.DataFrame(columns=['ratingID'])
df_ids.ratingID = df_ids.ratingID.astype('int64')
pattern = "^tt(\d*)"
for index, row in df_movies.iterrows():
    temp = pd.Series(['ratingID'])
    newValue = str(row.imdb_id).replace("t","",-1)
    temp['ratingID'] = int(newValue)
    df_ids = df_ids.append(temp, ignore_index=True)
df_movies['ratingID'] = df_ids['ratingID']
print("Länge: ", len(df_ids['ratingID']))

df_tmp = pd.merge(left=df_movies,right=df_links, left_on='ratingID', right_on='imdbId', how='inner')
#print(df_tmp.head(5))
print("Length after Merge (Movies + Links): " + str(len(df_tmp)))

#######
df_ratings = pd.read_csv("the-movies-dataset/ratings.csv")
#df_ratings = df_ratings[0:100000]
print("Length at import (Ratings): " + str(len(df_ratings)))
df_ratings_grouped = df_ratings.groupby('movieId')[['rating']].mean()
#print(df_ratings_grouped.head(5))
df_joined = pd.merge(left=df_tmp,right=df_ratings_grouped, left_on='movieId', right_on='movieId', how='inner')
print("Length after merge (Movies + Links + Ratings): " + str(len(df_joined)))

Length at import (Links): 45843
Länge:  45017
Length after Merge (Movies + Links): 44508
Length at import (Ratings): 26024289
Length after merge (Movies + Links + Ratings): 43803
   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                  0  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                     0   8844  tt0113497                en   
2                                     0  15602  tt0113228                en   

                                production_companies  \
0     [{'name': 'Pixar Animation Studios', 'id': 3}]   
1  [{'name': 'TriStar Pictures', 'id': 559}, {'na...   
2  [{'name': 'Warner Bros.', 'id': 6194}, {

In [35]:
## Export: ID + Genres
df_export = df_joined.copy()
features_to_remove = ['id','imdb_id','movieId','tmdbId','ratingID','production_countries','production_companies','adult','belongs_to_collection','budget','homepage','runtime','original_language','spoken_languages','vote_count']
for i in features_to_remove:
    if i in df_export.columns:
        df_export = df_export.drop(columns=i)
df_export.to_csv("clusterPreprocessing.csv", index=False)

In [6]:
# cast und crew integrieren
df_credits = pd.read_csv("the-movies-dataset/credits.csv")
df_credits.head(5)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [34]:
pattern = '\"Director\", \"name\": \"([a-zA-Z ]*)\", \"'
directors = set()
df_credits['director'] = ""
for index, row in df_credits.iterrows():
    text = row.crew
    text = text.replace('\\','',-1)
    text = text.replace("'",'"',-1)
    a = re.search(pattern, text)
    if a != None:
        directors.add(a.group(1))
        df_credits.set_value(index, 'director', a.group(1))
print(len(directors))
#print(directors)
df_credits.head(10)

14945


Unnamed: 0,cast,crew,id,director
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,John Lasseter
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,Joe Johnston
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,Howard Deutch
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,Forest Whitaker
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,Charles Shyer
5,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de...",949,Michael Mann
6,"[{'cast_id': 1, 'character': 'Linus Larrabee',...","[{'credit_id': '52fe44959251416c75039da9', 'de...",11860,Sydney Pollack
7,"[{'cast_id': 2, 'character': 'Tom Sawyer', 'cr...","[{'credit_id': '52fe46bdc3a36847f810f797', 'de...",45325,Peter Hewitt
8,"[{'cast_id': 1, 'character': 'Darren Francis T...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'de...",9091,Peter Hyams
9,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de...",710,Martin Campbell


In [35]:
import json
pattern = '\"name\": \"(.*)\", '
df_credits['actors'] = ""
for index, row in df_credits.iterrows():
    actors = []
    text = row.cast
    text = text.replace('\\','',-1)
    text = text.replace("'",'"',-1)
    liste = text.split("cast_id")
    for i in liste:
        tmp_name = re.search(pattern, i)
        if tmp_name != None:
            actors.append(tmp_name.group(1))
    df_credits.set_value(index, 'actors', ','.join(map(str, actors)))
df_credits.head(10)

Unnamed: 0,cast,crew,id,director,actors
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,John Lasseter,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal..."
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra..."
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,Howard Deutch,"Walter Matthau,Jack Lemmon,Ann-Margret,Sophia ..."
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,Forest Whitaker,"Whitney Houston,Angela Bassett,Loretta Devine,..."
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,Charles Shyer,"Steve Martin,Diane Keaton,Martin Short,Kimberl..."
5,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de...",949,Michael Mann,"Al Pacino,Robert De Niro,Val Kilmer,Jon Voight..."
6,"[{'cast_id': 1, 'character': 'Linus Larrabee',...","[{'credit_id': '52fe44959251416c75039da9', 'de...",11860,Sydney Pollack,"Harrison Ford,Julia Ormond,Greg Kinnear,Angie ..."
7,"[{'cast_id': 2, 'character': 'Tom Sawyer', 'cr...","[{'credit_id': '52fe46bdc3a36847f810f797', 'de...",45325,Peter Hewitt,"Jonathan Taylor Thomas,Brad Renfro,Rachael Lei..."
8,"[{'cast_id': 1, 'character': 'Darren Francis T...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'de...",9091,Peter Hyams,"Jean-Claude Van Damme,Powers Boothe,Dorian Har..."
9,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de...",710,Martin Campbell,"Pierce Brosnan,Sean Bean,Izabella Scorupco,Fam..."


In [36]:
df_credits = df_credits.drop(columns=['cast','crew'])
df_joined = pd.merge(left=df_joined,right=df_credits, left_on='id', right_on='id', how='inner')
print("Length after merge (Movies + Ratings + Credits): " + str(len(df_joined)))
df_joined.head(5)

Length after merge (Movies + Ratings + Credits): 7517


Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,original_language,production_companies,production_countries,runtime,spoken_languages,...,Thriller,TV Movie,Crime,Science Fiction,Fantasy,War,Adventure,rating,director,actors
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,en,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.59893,John Lasseter,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal..."
1,False,0,65000000,0,8844,en,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,3.760163,Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra..."
2,False,0,60000000,0,949,en,"[{'name': 'Regency Enterprises', 'id': 508}, {...","[{'iso_3166_1': 'US', 'name': 'United States o...",170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.905544,Michael Mann,"Al Pacino,Robert De Niro,Val Kilmer,Jon Voight..."
3,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,http://www.mgm.com/view/movie/757/Goldeneye/,710,en,"[{'name': 'United Artists', 'id': 60}, {'name'...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.740334,Martin Campbell,"Pierce Brosnan,Sean Bean,Izabella Scorupco,Fam..."
4,False,0,98000000,0,1408,en,"[{'name': 'Le Studio Canal+', 'id': 183}, {'na...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.710181,Renny Harlin,"Geena Davis,Matthew Modine,Frank Langella,Maur..."


In [37]:
# belongs to, homepage --> binär
df_joined['part_of_collection'] = ""
df_joined['+18'] = ""
df_joined['hasHomepage'] = ""
for index, row in df_joined.iterrows():
    if row.belongs_to_collection == 0:
        df_joined.set_value(index, 'part_of_collection', 0)
    else:
        df_joined.set_value(index, 'part_of_collection', 1)
    
    if row.adult == "True":
        df_joined.set_value(index, '18+', 1)
    else:
        df_joined.set_value(index, '18+', 0)   
    
    pattern = 'www.'
    tmp = re.search(pattern, str(row.homepage))
    if tmp != None:
        df_joined.set_value(index, 'hasHomepage', 1)
    else:
        df_joined.set_value(index, 'hasHomepage', 0)
df_joined = df_joined.drop(columns=['belongs_to_collection','adult','homepage'])
df_joined.head(5)

Unnamed: 0,budget,id,original_language,production_companies,production_countries,runtime,spoken_languages,vote_count,History,Western,...,Fantasy,War,Adventure,rating,director,actors,part_of_collection,+18,hasHomepage,18+
0,30000000,862,en,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",5415,0.0,0.0,...,0.0,0.0,0.0,3.59893,John Lasseter,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",1,,0,0.0
1,65000000,8844,en,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",2413,0.0,0.0,...,1.0,0.0,1.0,3.760163,Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...",0,,0,0.0
2,60000000,949,en,"[{'name': 'Regency Enterprises', 'id': 508}, {...","[{'iso_3166_1': 'US', 'name': 'United States o...",170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",1886,0.0,0.0,...,0.0,0.0,0.0,3.905544,Michael Mann,"Al Pacino,Robert De Niro,Val Kilmer,Jon Voight...",0,,0,0.0
3,58000000,710,en,"[{'name': 'United Artists', 'id': 60}, {'name'...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",1194,0.0,0.0,...,0.0,0.0,1.0,2.740334,Martin Campbell,"Pierce Brosnan,Sean Bean,Izabella Scorupco,Fam...",1,,1,0.0
4,98000000,1408,en,"[{'name': 'Le Studio Canal+', 'id': 183}, {'na...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",137,0.0,0.0,...,0.0,0.0,1.0,3.710181,Renny Harlin,"Geena Davis,Matthew Modine,Frank Langella,Maur...",0,,0,0.0


In [38]:
import json
pattern1 = "{'name': '([a-zA-Z ]*)'"
pattern2 = "'name': '([a-zA-Z ]*)'"

df_joined['productionCompanies'] = ""
df_joined['productionCountries'] = ""
df_joined['spokenLanguages'] = ""
for index, row in df_joined.iterrows():
    companies = []
    countries = []
    lang = []
    
    text1 = row.production_companies
    liste1 = text1.split("}")
    for i in liste1:
        tmp_name = re.search(pattern1, i)
        if tmp_name != None:
            companies.append(tmp_name.group(1))
    df_joined.set_value(index, 'productionCompanies', ','.join(map(str, companies)))
    
    text2 = row.production_countries
    liste2 = text2.split("}")
    for i in liste2:
        tmp_name = re.search(pattern2, i)
        if tmp_name != None:
            countries.append(tmp_name.group(1))
    df_joined.set_value(index, 'productionCountries', ','.join(map(str, countries)))
    
    text3 = row.spoken_languages
    liste3 = text3.split("}")
    for i in liste3:
        tmp_name = re.search(pattern2, i)
        if tmp_name != None:
            lang.append(tmp_name.group(1))
    df_joined.set_value(index, 'spokenLanguages', ','.join(map(str, lang)))
    
    
df_joined.head(5)

Unnamed: 0,budget,id,original_language,production_companies,production_countries,runtime,spoken_languages,vote_count,History,Western,...,rating,director,actors,part_of_collection,+18,hasHomepage,18+,productionCompanies,productionCountries,spokenLanguages
0,30000000,862,en,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",5415,0.0,0.0,...,3.59893,John Lasseter,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",1,,0,0.0,Pixar Animation Studios,United States of America,English
1,65000000,8844,en,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",2413,0.0,0.0,...,3.760163,Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...",0,,0,0.0,"TriStar Pictures,Teitler Film,Interscope Commu...",United States of America,English
2,60000000,949,en,"[{'name': 'Regency Enterprises', 'id': 508}, {...","[{'iso_3166_1': 'US', 'name': 'United States o...",170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",1886,0.0,0.0,...,3.905544,Michael Mann,"Al Pacino,Robert De Niro,Val Kilmer,Jon Voight...",0,,0,0.0,"Regency Enterprises,Forward Pass",United States of America,English
3,58000000,710,en,"[{'name': 'United Artists', 'id': 60}, {'name'...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",1194,0.0,0.0,...,2.740334,Martin Campbell,"Pierce Brosnan,Sean Bean,Izabella Scorupco,Fam...",1,,1,0.0,"United Artists,Eon Productions","United Kingdom,United States of America",English
4,98000000,1408,en,"[{'name': 'Le Studio Canal+', 'id': 183}, {'na...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",137,0.0,0.0,...,3.710181,Renny Harlin,"Geena Davis,Matthew Modine,Frank Langella,Maur...",0,,0,0.0,"Laurence Mark Productions,Carolco Pictures","France,Germany,Italy,United States of America","English,Latin"


In [39]:
df_joined = df_joined.drop(columns=['production_companies','production_countries','spoken_languages'])
df_joined.head(5)

Unnamed: 0,budget,id,original_language,runtime,vote_count,History,Western,Music,Family,Comedy,...,rating,director,actors,part_of_collection,+18,hasHomepage,18+,productionCompanies,productionCountries,spokenLanguages
0,30000000,862,en,81.0,5415,0.0,0.0,0.0,1.0,1.0,...,3.59893,John Lasseter,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",1,,0,0.0,Pixar Animation Studios,United States of America,English
1,65000000,8844,en,104.0,2413,0.0,0.0,0.0,1.0,0.0,...,3.760163,Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...",0,,0,0.0,"TriStar Pictures,Teitler Film,Interscope Commu...",United States of America,English
2,60000000,949,en,170.0,1886,0.0,0.0,0.0,0.0,0.0,...,3.905544,Michael Mann,"Al Pacino,Robert De Niro,Val Kilmer,Jon Voight...",0,,0,0.0,"Regency Enterprises,Forward Pass",United States of America,English
3,58000000,710,en,130.0,1194,0.0,0.0,0.0,0.0,0.0,...,2.740334,Martin Campbell,"Pierce Brosnan,Sean Bean,Izabella Scorupco,Fam...",1,,1,0.0,"United Artists,Eon Productions","United Kingdom,United States of America",English
4,98000000,1408,en,119.0,137,0.0,0.0,0.0,0.0,0.0,...,3.710181,Renny Harlin,"Geena Davis,Matthew Modine,Frank Langella,Maur...",0,,0,0.0,"Laurence Mark Productions,Carolco Pictures","France,Germany,Italy,United States of America","English,Latin"


### production companies, production countries, spoken_languages,original language --> hot encoding?
### what about continuous attributes? --> budget

In [40]:
# how many different original languages? *
print(str(len(df_joined.original_language.unique())) + " original languages")

# how many different spoken languages?
print(str(len(df_joined.spokenLanguages.unique())) + " spoken languages")

# how many different production companies? *
print(str(len(df_joined.productionCompanies.unique())) + " production companies")

# how many different production countries?
print(str(len(df_joined.productionCountries.unique())) + " production countries")

# max budget?
print("Max Budget: " + str(df_joined.budget.max()))

# min budget?
print("Min Budget: " + str(df_joined.budget.min()))

56 original languages
177 spoken languages
3906 production companies
691 production countries
Max Budget: 380000000
Min Budget: 0


### Hot Encoding for original languages (orig)

In [41]:
# lang_set = {""}
# for index, row in df_joined.iterrows():
#     lang_set.add("orig_" + str(row.original_language))      
# lang_set.remove("")

# for g in lang_set:
#     df_joined[g] = 0

# df_movie_lang = pd.DataFrame(columns=lang_set)
# for index, row in df_joined.iterrows():
#     temp = pd.Series(index=lang_set)
#     tmp_lang = "orig_" + str(row.original_language)
#     temp[tmp_lang] = 1
#     df_movie_lang = df_movie_lang.append(temp, ignore_index=True)

# # 3. add one hot encoded genres to df_movies
# df_joined[list(lang_set)] = df_movie_lang

# # 4. turn NaN values to zeros
# df_joined = df_joined.fillna(0)

# df_joined = df_joined.drop(columns=['original_language'])

# df_joined.head(3)

Unnamed: 0,budget,id,runtime,vote_count,History,Western,Music,Family,Comedy,Drama,...,orig_bg,orig_ml,orig_es,orig_bn,orig_ab,orig_wo,orig_ca,orig_ru,orig_id,orig_is
0,30000000,862,81.0,5415,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,65000000,8844,104.0,2413,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,60000000,949,170.0,1886,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Hot Encoding for Production Countries (pcountry)

In [15]:
# df_movies_raw, am Ende --> df_joined
pc_set = {""}
for index, row in df_joined.iterrows():
    pcs = str(row.productionCountries).split(",")
    for pc in pcs: # turn string to json and loop over genres
        pc_set.add("pcountry_" + str(pc))       
pc_set.remove("")

for g in pc_set:
    df_joined[g] = 0

# 2. add genres as columns to dataframe, default value = 0
df_movie_pc = pd.DataFrame(columns=pc_set)
for index, row in df_joined.iterrows():
    temp = pd.Series(index=pc_set)
    tmp_pcs = []
    pcs = str(row.productionCountries).split(",")
    for pc in pcs:
        tmp_pcs.append("pcountry_" + str(pc))
    for pc in tmp_pcs:
        #temp[pc] = (1/len(tmp_pcs))
        temp[pc] = 1
    df_movie_pc = df_movie_pc.append(temp, ignore_index=True)

# 3. add one hot encoded genres to df_movies        
df_joined[list(pc_set)] = df_movie_pc

# 4. turn NaN values to zeros
df_joined = df_joined.fillna(0)

df_joined = df_joined.drop(columns=['productionCountries'])

df_joined.head(3)

Unnamed: 0,budget,id,runtime,vote_count,History,Western,Music,Family,Comedy,Drama,...,pcountry_Nepal,pcountry_Sweden,pcountry_Puerto Rico,pcountry_Armenia,pcountry_Serbia,pcountry_Moldova,pcountry_Brazil,pcountry_Denmark,pcountry_Iraq,pcountry_Uzbekistan
0,30000000,862,81.0,5415,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,65000000,8844,104.0,2413,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,60000000,949,170.0,1886,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Hot Encoding for Production Companies (pcomp)

In [None]:
pland_set = {""}
for index, row in df_joined.iterrows():
    pcs = str(row.productionCompanies).split(",")
    for pc in pcs: # turn string to json and loop over genres
        pland_set.add("pcomp_" + str(pc))       
pland_set.remove("")

for g in pland_set:
    df_joined[g] = 0

# 2. add genres as columns to dataframe, default value = 0
df_movie_pc = pd.DataFrame(columns=pland_set)
for index, row in df_joined.iterrows():
    temp = pd.Series(index=pland_set)
    tmp_pcs = []
    pcs = str(row.productionCompanies).split(",")
    for pc in pcs:
        tmp_pcs.append("pcomp_" + str(pc))
    for pc in tmp_pcs:
        #temp[pc] = (1/len(tmp_pcs))
        temp[pc] = 1
    df_movie_pc = df_movie_pc.append(temp, ignore_index=True)

# 3. add one hot encoded genres to df_movies        
df_joined[list(pland_set)] = df_movie_pc

# 4. turn NaN values to zeros
df_joined = df_joined.fillna(0)

df_joined = df_joined.drop(columns=['productionCompanies'])

df_joined.head(3)

### Hot Encoding for spoken Languages (slang)

In [None]:
slang_set = {""}
for index, row in df_joined.iterrows():
    pcs = str(row.spokenLanguages).split(",")
    for pc in pcs: # turn string to json and loop over genres
        slang_set.add("slang_" + str(pc))       
slang_set.remove("")

for g in slang_set:
    df_joined[g] = 0

# 2. add genres as columns to dataframe, default value = 0
df_movie_pc = pd.DataFrame(columns=slang_set)
for index, row in df_joined.iterrows():
    temp = pd.Series(index=slang_set)
    tmp_pcs = []
    pcs = str(row.spokenLanguages).split(",")
    for pc in pcs:
        tmp_pcs.append("slang_" + str(pc))
    for pc in tmp_pcs:
        #temp[pc] = (1/len(tmp_pcs))
        temp[pc] = 1
    df_movie_pc = df_movie_pc.append(temp, ignore_index=True)

# 3. add one hot encoded genres to df_movies        
df_joined[list(slang_set)] = df_movie_pc

# 4. turn NaN values to zeros
df_joined = df_joined.fillna(0)

df_joined = df_joined.drop(columns=['spokenLanguages'])

df_joined.head(3)

### director, actors?
##### auch Anzahl der Ratings kann eine Rolle spielen

In [None]:
# production companies, production countries, spoken_languages --> hot encoding?
# first get list of all actoors
df_credits1 = df_credits
dict_actors = {}
occurences_by_actor= []
for index, row in df_credits1.iterrows():
    liste_actors = row.actors.split(",")
    for i in liste_actors:
        if i in dict_actors:
            dict_actors[i] = dict_actors[i] + 1
        else:
            dict_actors[i] = 1

# now that we got a list let's check how many actors there are and in how many movies they played
print("Overall NUmber of Actors:")
print(len(dict_actors))
print("Number of different occurances:")
print(list(set(dict_actors.values())))
            
# what about continuous attributes? --> budget
# original language: en = 1, else=0?

#del dict_actors['']
#del dict_actors[' Jr.']
df_actors_plays = pd.DataFrame.from_dict(list(dict_actors.items()))

df_actors_plays.columns

#print(df_actors_plays.sort_values(by=1, ascending=False))
print(df_actors_plays.count())

df_actors_plays1 = df_actors_plays[df_actors_plays[1] > 70]
print(df_actors_plays1.count())



#print(df_actors_plays1.sort_values(by=1, ascending=False))

#df_actors_plays1.sort_values(by=1).plot(kind='line',x=0, y=1, figsize=(10, 50))

In [42]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
director_target = label_encoder.fit_transform(df_joined['director'])
df_joined = df_joined.drop(columns=['director'])
df_joined['director'] = director_target

In [43]:
print("Length before export: " + str(len(df_joined)))
# save to file
df_joined.to_csv("regressionPreprocessing.csv", index=False)

Length before export: 7517
