In [1]:
import pandas as pd
import re
import seaborn as sb
import matplotlib.pyplot as plt
import datetime
import json
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
df_movies = pd.read_csv("the-movies-dataset/movies_metadata.csv")
print("Length at import: " + str(len(df_movies)))
#df_movies.head(3)
# filter movies on Status == 'Released' and drop status column
df_movies = df_movies[df_movies["status"] == 'Released']
df_movies.drop(columns=["status"])
print("Length only released: " + str(len(df_movies)))
#df_movies = df_movies[0:1000]
print("Length: " + str(len(df_movies)))
df_movies_raw = df_movies.copy()

Length at import: 45463
Length only released: 45017
Length: 45017


In [3]:
genres_stat = {}
for index, row in df_movies.iterrows():
    genres = json.loads(re.sub("'", '"',row.genres))
    for genre in genres: # turn string to json and loop over genres
        genres_stat.update({genre.get("name") : 0})

#Get the movie count per genre
movies = {}
for index, row in df_movies.iterrows():
    tmp_genres = []  
    genres = json.loads(re.sub("'", '"',row.genres))
    for genre in genres:
        tmp_genres.append(genre.get("name"))
        new_value = genres_stat.get(genre.get("name")) + 1
        genres_stat.update({genre.get("name") : new_value})
    movies.update({row.id:{'title': row.title, 'genres': tmp_genres}})
print(len(movies))

for key, value in sorted(genres_stat.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

44988
TV Movie: 760
Western: 1038
War: 1318
History: 1391
Foreign: 1588
Music: 1588
Animation: 1917
Fantasy: 2294
Mystery: 2456
Family: 2743
Science Fiction: 3020
Adventure: 3474
Documentary: 3874
Crime: 4286
Horror: 4637
Action: 6548
Romance: 6668
Thriller: 7572
Comedy: 13079
Drama: 20079


In [4]:
#genre_set = {"Action","Drama","Comedy","Thriller","Animation"}

genre_set = {""}
for index, row in df_movies.iterrows():
    genres = json.loads(re.sub("'", '"',row.genres))
    for genre in genres: # turn string to json and loop over genres
        genre_set.add(genre.get("name"))       
genre_set.remove("")

for g in genre_set:
    df_movies[g] = 0

# 2. add genres as columns to dataframe, default value = 0
df_movie_genres = pd.DataFrame(columns=genre_set)
for index, row in df_movies.iterrows():
    temp = pd.Series(index=genre_set)
    tmp_genres = []  
    genres = json.loads(re.sub("'", '"',row.genres))
    for genre in genres:
        if genre.get("name") in list(genre_set):
            tmp_genres.append(genre.get("name"))
    for genre in tmp_genres: # turn string to json and loop over genres
        #temp[genre] = (1/len(tmp_genres))
        temp[genre] = 1
    df_movie_genres = df_movie_genres.append(temp, ignore_index=True)
#print("One hot encoding finished")

# 3. add one hot encoded genres to df_movies        
df_movies[list(genre_set)] = df_movie_genres

# 4. turn NaN values to zeros
df_movies = df_movies.fillna(0)

In [6]:
#df_movies = df_movies.drop(columns=['genres','imdb_id','original_title','overview','popularity','poster_path','release_date','revenue','status','tagline','title','video','vote_count','vote_average'])
df_movies = df_movies.drop(columns=['genres','original_title','overview','popularity','poster_path','release_date','revenue','status','tagline','title','video','vote_average'])
df_movies.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,production_companies,production_countries,runtime,...,Horror,Western,War,Music,Science Fiction,Drama,Comedy,Animation,Mystery,Crime
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",81.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,False,0,65000000,0,8844,tt0113497,en,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",104.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,0,15602,tt0113228,en,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",101.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [7]:
# durchschnitt von predictions berechnen
df_links = pd.read_csv("the-movies-dataset/links.csv")
print("Length at import (Links): " + str(len(df_links)))

df_ids = pd.DataFrame(columns=['ratingID'])
df_ids.ratingID = df_ids.ratingID.astype('int64')
pattern = "^tt(\d*)"
for index, row in df_movies.iterrows():
    temp = pd.Series(['ratingID'])
    newValue = str(row.imdb_id).replace("t","",-1)
    temp['ratingID'] = int(newValue)
    df_ids = df_ids.append(temp, ignore_index=True)
df_movies['ratingID'] = df_ids['ratingID']
print("Länge: ", len(df_ids['ratingID']))

df_tmp = pd.merge(left=df_movies,right=df_links, left_on='ratingID', right_on='imdbId', how='inner')
#print(df_tmp.head(5))
print("Length after Merge (Movies + Links): " + str(len(df_tmp)))

#######
df_ratings = pd.read_csv("the-movies-dataset/ratings.csv")
#df_ratings = df_ratings[0:100000]
print("Length at import (Ratings): " + str(len(df_ratings)))
df_ratings_grouped = df_ratings.groupby('movieId')[['rating']].mean()
#print(df_ratings_grouped.head(5))
df_joined = pd.merge(left=df_tmp,right=df_ratings_grouped, left_on='movieId', right_on='movieId', how='inner')
print("Length after merge (Movies + Links + Ratings): " + str(len(df_joined)))

Length at import (Links): 45843
Länge:  45017
Length after Merge (Movies + Links): 44508
Length at import (Ratings): 26024289
Length after merge (Movies + Links + Ratings): 43803


In [8]:
## Export: ID + Genres
df_export = df_joined.copy()
features_to_remove = ['id','imdb_id','movieId','tmdbId','ratingID','production_countries','production_companies','adult','belongs_to_collection','budget','homepage','runtime','original_language','spoken_languages','vote_count']
for i in features_to_remove:
    if i in df_export.columns:
        df_export = df_export.drop(columns=i)
df_export.to_csv("clusterPreprocessing.csv", index=False)

In [9]:
# cast und crew integrieren
df_credits = pd.read_csv("the-movies-dataset/credits.csv")
df_credits.head(5)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [10]:
pattern = '\"Director\", \"name\": \"([a-zA-Z ]*)\", \"'
directors = set()
df_credits['director'] = ""
for index, row in df_credits.iterrows():
    text = row.crew
    text = text.replace('\\','',-1)
    text = text.replace("'",'"',-1)
    a = re.search(pattern, text)
    if a != None:
        directors.add(a.group(1))
        df_credits.set_value(index, 'director', a.group(1))
print(len(directors))
#print(directors)
df_credits.head(10)

14945


Unnamed: 0,cast,crew,id,director
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,John Lasseter
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,Joe Johnston
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,Howard Deutch
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,Forest Whitaker
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,Charles Shyer
5,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de...",949,Michael Mann
6,"[{'cast_id': 1, 'character': 'Linus Larrabee',...","[{'credit_id': '52fe44959251416c75039da9', 'de...",11860,Sydney Pollack
7,"[{'cast_id': 2, 'character': 'Tom Sawyer', 'cr...","[{'credit_id': '52fe46bdc3a36847f810f797', 'de...",45325,Peter Hewitt
8,"[{'cast_id': 1, 'character': 'Darren Francis T...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'de...",9091,Peter Hyams
9,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de...",710,Martin Campbell


In [11]:
import json
pattern = '\"name\": \"(.*)\", '
df_credits['actors'] = ""
for index, row in df_credits.iterrows():
    actors = []
    text = row.cast
    text = text.replace('\\','',-1)
    text = text.replace("'",'"',-1)
    liste = text.split("cast_id")
    for i in liste:
        tmp_name = re.search(pattern, i)
        if tmp_name != None:
            actors.append(tmp_name.group(1))
    df_credits.set_value(index, 'actors', ','.join(map(str, actors)))
df_credits.head(10)

Unnamed: 0,cast,crew,id,director,actors
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,John Lasseter,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal..."
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra..."
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,Howard Deutch,"Walter Matthau,Jack Lemmon,Ann-Margret,Sophia ..."
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,Forest Whitaker,"Whitney Houston,Angela Bassett,Loretta Devine,..."
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,Charles Shyer,"Steve Martin,Diane Keaton,Martin Short,Kimberl..."
5,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de...",949,Michael Mann,"Al Pacino,Robert De Niro,Val Kilmer,Jon Voight..."
6,"[{'cast_id': 1, 'character': 'Linus Larrabee',...","[{'credit_id': '52fe44959251416c75039da9', 'de...",11860,Sydney Pollack,"Harrison Ford,Julia Ormond,Greg Kinnear,Angie ..."
7,"[{'cast_id': 2, 'character': 'Tom Sawyer', 'cr...","[{'credit_id': '52fe46bdc3a36847f810f797', 'de...",45325,Peter Hewitt,"Jonathan Taylor Thomas,Brad Renfro,Rachael Lei..."
8,"[{'cast_id': 1, 'character': 'Darren Francis T...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'de...",9091,Peter Hyams,"Jean-Claude Van Damme,Powers Boothe,Dorian Har..."
9,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de...",710,Martin Campbell,"Pierce Brosnan,Sean Bean,Izabella Scorupco,Fam..."


In [12]:
df_credits = df_credits.drop(columns=['cast','crew'])
df_joined = pd.merge(left=df_joined,right=df_credits, left_on='id', right_on='id', how='inner')
print("Length after merge (Movies + Ratings + Credits): " + str(len(df_joined)))
df_joined.head(5)

Length after merge (Movies + Ratings + Credits): 43872


Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,production_companies,production_countries,runtime,...,Animation,Mystery,Crime,ratingID,movieId,imdbId,tmdbId,rating,director,actors
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",81.0,...,1.0,0.0,0.0,114709.0,1,114709,862.0,3.888157,John Lasseter,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal..."
1,False,0,65000000,0,8844,tt0113497,en,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",104.0,...,0.0,0.0,0.0,113497.0,2,113497,8844.0,3.236953,Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,0,15602,tt0113228,en,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",101.0,...,0.0,0.0,0.0,113228.0,3,113228,15602.0,3.17555,Howard Deutch,"Walter Matthau,Jack Lemmon,Ann-Margret,Sophia ..."
3,False,0,16000000,0,31357,tt0114885,en,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",127.0,...,0.0,0.0,0.0,114885.0,4,114885,31357.0,2.875713,Forest Whitaker,"Whitney Houston,Angela Bassett,Loretta Devine,..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,0,11862,tt0113041,en,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",106.0,...,0.0,0.0,0.0,113041.0,5,113041,11862.0,3.079565,Charles Shyer,"Steve Martin,Diane Keaton,Martin Short,Kimberl..."


In [13]:
# belongs to, homepage --> binär
df_joined['part_of_collection'] = ""
df_joined['+18'] = ""
df_joined['hasHomepage'] = ""
for index, row in df_joined.iterrows():
    if row.belongs_to_collection == 0:
        df_joined.set_value(index, 'part_of_collection', 0)
    else:
        df_joined.set_value(index, 'part_of_collection', 1)
    
    if row.adult == "True":
        df_joined.set_value(index, '18+', 1)
    else:
        df_joined.set_value(index, '18+', 0)   
    
    pattern = 'www.'
    tmp = re.search(pattern, str(row.homepage))
    if tmp != None:
        df_joined.set_value(index, 'hasHomepage', 1)
    else:
        df_joined.set_value(index, 'hasHomepage', 0)
df_joined = df_joined.drop(columns=['belongs_to_collection','adult','homepage'])
df_joined.head(5)

Unnamed: 0,budget,id,imdb_id,original_language,production_companies,production_countries,runtime,spoken_languages,vote_count,Action,...,movieId,imdbId,tmdbId,rating,director,actors,part_of_collection,+18,hasHomepage,18+
0,30000000,862,tt0114709,en,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",5415,0.0,...,1,114709,862.0,3.888157,John Lasseter,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",1,,0,0.0
1,65000000,8844,tt0113497,en,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",2413,0.0,...,2,113497,8844.0,3.236953,Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...",0,,0,0.0
2,0,15602,tt0113228,en,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",92,0.0,...,3,113228,15602.0,3.17555,Howard Deutch,"Walter Matthau,Jack Lemmon,Ann-Margret,Sophia ...",1,,0,0.0
3,16000000,31357,tt0114885,en,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",34,0.0,...,4,114885,31357.0,2.875713,Forest Whitaker,"Whitney Houston,Angela Bassett,Loretta Devine,...",0,,0,0.0
4,0,11862,tt0113041,en,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",173,0.0,...,5,113041,11862.0,3.079565,Charles Shyer,"Steve Martin,Diane Keaton,Martin Short,Kimberl...",1,,0,0.0


In [14]:
import json
pattern1 = "{'name': '([a-zA-Z ]*)'"
pattern2 = "'name': '([a-zA-Z ]*)'"

df_joined['productionCompanies'] = ""
df_joined['productionCountries'] = ""
df_joined['spokenLanguages'] = ""
for index, row in df_joined.iterrows():
    companies = []
    countries = []
    lang = []
    
    text1 = row.production_companies
    liste1 = text1.split("}")
    for i in liste1:
        tmp_name = re.search(pattern1, i)
        if tmp_name != None:
            companies.append(tmp_name.group(1))
    df_joined.set_value(index, 'productionCompanies', ','.join(map(str, companies)))
    
    text2 = row.production_countries
    liste2 = text2.split("}")
    for i in liste2:
        tmp_name = re.search(pattern2, i)
        if tmp_name != None:
            countries.append(tmp_name.group(1))
    df_joined.set_value(index, 'productionCountries', ','.join(map(str, countries)))
    
    text3 = row.spoken_languages
    liste3 = text3.split("}")
    for i in liste3:
        tmp_name = re.search(pattern2, i)
        if tmp_name != None:
            lang.append(tmp_name.group(1))
    df_joined.set_value(index, 'spokenLanguages', ','.join(map(str, lang)))
    
    
df_joined.head(5)

Unnamed: 0,budget,id,imdb_id,original_language,production_companies,production_countries,runtime,spoken_languages,vote_count,Action,...,rating,director,actors,part_of_collection,+18,hasHomepage,18+,productionCompanies,productionCountries,spokenLanguages
0,30000000,862,tt0114709,en,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",5415,0.0,...,3.888157,John Lasseter,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",1,,0,0.0,Pixar Animation Studios,United States of America,English
1,65000000,8844,tt0113497,en,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",2413,0.0,...,3.236953,Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...",0,,0,0.0,"TriStar Pictures,Teitler Film,Interscope Commu...",United States of America,English
2,0,15602,tt0113228,en,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",92,0.0,...,3.17555,Howard Deutch,"Walter Matthau,Jack Lemmon,Ann-Margret,Sophia ...",1,,0,0.0,Lancaster Gate,United States of America,English
3,16000000,31357,tt0114885,en,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",34,0.0,...,2.875713,Forest Whitaker,"Whitney Houston,Angela Bassett,Loretta Devine,...",0,,0,0.0,Twentieth Century Fox Film Corporation,United States of America,English
4,0,11862,tt0113041,en,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",173,0.0,...,3.079565,Charles Shyer,"Steve Martin,Diane Keaton,Martin Short,Kimberl...",1,,0,0.0,"Sandollar Productions,Touchstone Pictures",United States of America,English


In [15]:
df_joined = df_joined.drop(columns=['production_companies','production_countries','spoken_languages'])
df_joined.head(5)

Unnamed: 0,budget,id,imdb_id,original_language,runtime,vote_count,Action,Family,Documentary,Adventure,...,rating,director,actors,part_of_collection,+18,hasHomepage,18+,productionCompanies,productionCountries,spokenLanguages
0,30000000,862,tt0114709,en,81.0,5415,0.0,1.0,0.0,0.0,...,3.888157,John Lasseter,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",1,,0,0.0,Pixar Animation Studios,United States of America,English
1,65000000,8844,tt0113497,en,104.0,2413,0.0,1.0,0.0,1.0,...,3.236953,Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...",0,,0,0.0,"TriStar Pictures,Teitler Film,Interscope Commu...",United States of America,English
2,0,15602,tt0113228,en,101.0,92,0.0,0.0,0.0,0.0,...,3.17555,Howard Deutch,"Walter Matthau,Jack Lemmon,Ann-Margret,Sophia ...",1,,0,0.0,Lancaster Gate,United States of America,English
3,16000000,31357,tt0114885,en,127.0,34,0.0,0.0,0.0,0.0,...,2.875713,Forest Whitaker,"Whitney Houston,Angela Bassett,Loretta Devine,...",0,,0,0.0,Twentieth Century Fox Film Corporation,United States of America,English
4,0,11862,tt0113041,en,106.0,173,0.0,0.0,0.0,0.0,...,3.079565,Charles Shyer,"Steve Martin,Diane Keaton,Martin Short,Kimberl...",1,,0,0.0,"Sandollar Productions,Touchstone Pictures",United States of America,English


### production companies, production countries, spoken_languages,original language --> hot encoding?
### what about continuous attributes? --> budget

In [16]:
# how many different original languages? *
print(str(len(df_joined.original_language.unique())) + " original languages")

# how many different spoken languages?
print(str(len(df_joined.spokenLanguages.unique())) + " spoken languages")

# how many different production companies? *
print(str(len(df_joined.productionCompanies.unique())) + " production companies")

# how many different production countries?
print(str(len(df_joined.productionCountries.unique())) + " production countries")

# max budget?
print("Max Budget: " + str(df_joined.budget.max()))

# min budget?
print("Min Budget: " + str(df_joined.budget.min()))

90 original languages
410 spoken languages
18396 production companies
2338 production countries
Max Budget: 380000000
Min Budget: 0


### Hot Encoding for original languages (orig)

In [17]:
def originalLanguagesEncoding():
    lang_set = {""}
    for index, row in df_joined.iterrows():
        lang_set.add("orig_" + str(row.original_language))      
    lang_set.remove("")

    for g in lang_set:
        df_joined[g] = 0

    df_movie_lang = pd.DataFrame(columns=lang_set)
    for index, row in df_joined.iterrows():
        temp = pd.Series(index=lang_set)
        tmp_lang = "orig_" + str(row.original_language)
        temp[tmp_lang] = 1
        df_movie_lang = df_movie_lang.append(temp, ignore_index=True)

    # 3. add one hot encoded genres to df_movies
    df_joined[list(lang_set)] = df_movie_lang

    # 4. turn NaN values to zeros
    df_joined = df_joined.fillna(0)

    df_joined = df_joined.drop(columns=['original_language'])

    df_joined.head(3)

### Hot Encoding for Production Countries (pcountry)

In [18]:
def productionCountriesEncoding():
    pc_set = {""}
    for index, row in df_joined.iterrows():
        pcs = str(row.productionCountries).split(",")
        for pc in pcs: # turn string to json and loop over genres
            pc_set.add("pcountry_" + str(pc))       
    pc_set.remove("")

    for g in pc_set:
        df_joined[g] = 0

    # 2. add genres as columns to dataframe, default value = 0
    df_movie_pc = pd.DataFrame(columns=pc_set)
    for index, row in df_joined.iterrows():
        temp = pd.Series(index=pc_set)
        tmp_pcs = []
        pcs = str(row.productionCountries).split(",")
        for pc in pcs:
            tmp_pcs.append("pcountry_" + str(pc))
        for pc in tmp_pcs:
            #temp[pc] = (1/len(tmp_pcs))
            temp[pc] = 1
        df_movie_pc = df_movie_pc.append(temp, ignore_index=True)

    # 3. add one hot encoded genres to df_movies        
    df_joined[list(pc_set)] = df_movie_pc

    # 4. turn NaN values to zeros
    df_joined = df_joined.fillna(0)

    df_joined = df_joined.drop(columns=['productionCountries'])

### Hot Encoding for Production Companies (pcomp)

In [19]:
def productionCompaniesEncoding():
    pland_set = {""}
    for index, row in df_joined.iterrows():
        pcs = str(row.productionCompanies).split(",")
        for pc in pcs: # turn string to json and loop over genres
            pland_set.add("pcomp_" + str(pc))       
    pland_set.remove("")

    for g in pland_set:
        df_joined[g] = 0

    # 2. add genres as columns to dataframe, default value = 0
    df_movie_pc = pd.DataFrame(columns=pland_set)
    for index, row in df_joined.iterrows():
        temp = pd.Series(index=pland_set)
        tmp_pcs = []
        pcs = str(row.productionCompanies).split(",")
        for pc in pcs:
            tmp_pcs.append("pcomp_" + str(pc))
        for pc in tmp_pcs:
            #temp[pc] = (1/len(tmp_pcs))
            temp[pc] = 1
        df_movie_pc = df_movie_pc.append(temp, ignore_index=True)

    # 3. add one hot encoded genres to df_movies        
    df_joined[list(pland_set)] = df_movie_pc

    # 4. turn NaN values to zeros
    df_joined = df_joined.fillna(0)

    df_joined = df_joined.drop(columns=['productionCompanies'])

### Hot Encoding for spoken Languages (slang)

In [20]:
def spokenLanguagesEncoding():
    slang_set = {""}
    for index, row in df_joined.iterrows():
        pcs = str(row.spokenLanguages).split(",")
        for pc in pcs: # turn string to json and loop over genres
            slang_set.add("slang_" + str(pc))       
    slang_set.remove("")

    for g in slang_set:
        df_joined[g] = 0

    # 2. add genres as columns to dataframe, default value = 0
    df_movie_pc = pd.DataFrame(columns=slang_set)
    for index, row in df_joined.iterrows():
        temp = pd.Series(index=slang_set)
        tmp_pcs = []
        pcs = str(row.spokenLanguages).split(",")
        for pc in pcs:
            tmp_pcs.append("slang_" + str(pc))
        for pc in tmp_pcs:
            #temp[pc] = (1/len(tmp_pcs))
            temp[pc] = 1
        df_movie_pc = df_movie_pc.append(temp, ignore_index=True)

    # 3. add one hot encoded genres to df_movies        
    df_joined[list(slang_set)] = df_movie_pc

    # 4. turn NaN values to zeros
    df_joined = df_joined.fillna(0)

    df_joined = df_joined.drop(columns=['spokenLanguages'])

### director, actors?
##### auch Anzahl der Ratings kann eine Rolle spielen

In [21]:
# production companies, production countries, spoken_languages --> hot encoding?
# first get list of all actoors
df_credits1 = df_credits
dict_actors = {}
occurences_by_actor= []
for index, row in df_credits1.iterrows():
    liste_actors = row.actors.split(",")
    for i in liste_actors:
        if i in dict_actors:
            dict_actors[i] = dict_actors[i] + 1
        else:
            dict_actors[i] = 1

# now that we got a list let's check how many actors there are and in how many movies they played
print("Overall Number of Actors:")
print(len(dict_actors))
print("Number of different occurances:")
print(list(set(dict_actors.values())))
            
# what about continuous attributes? --> budget
# original language: en = 1, else=0?

#del dict_actors['']
#del dict_actors[' Jr.']
df_actors_plays = pd.DataFrame.from_dict(list(dict_actors.items()))

df_actors_plays.columns

#print(df_actors_plays.sort_values(by=1, ascending=False))
print(df_actors_plays.count())

df_actors_plays1 = df_actors_plays[df_actors_plays[1] > 70]
print(df_actors_plays1.count())

#print(df_actors_plays1.sort_values(by=1, ascending=False))

#df_actors_plays1.sort_values(by=1).plot(kind='line',x=0, y=1, figsize=(10, 50))

Overall Number of Actors:
202735
Number of different occurances:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 101, 102, 104, 107, 108, 109, 110, 123, 125, 148, 241, 305, 2420]
0    202735
1    202735
dtype: int64
0    96
1    96
dtype: int64


In [25]:
## Nur Top Schauspieler aus Dict extrahieren
#del dict_actors['']
#del dict_actors[' Jr.']
actors = []

for k, v in dict_actors.items():
    if v > 70:
        actors.append(k)

print(len(actors))

94


In [31]:
def actorsEncoding(df):
    for actor in actors:
        df[actor] = 0

    # 2. add genres as columns to dataframe, default value = 0
    df_movie_actors = pd.DataFrame(columns=actors)
    for index, row in df.iterrows():
        temp = pd.Series(index=actors)
        tmp_actors = []
        actors_list = str(row.actors).split(",")
        for act in actors_list:
            tmp_actors.append("actor_" + str(act))
        for act in tmp_actors:
            temp[act] = 1
        df_movie_actors = df_movie_actors.append(temp, ignore_index=True)

    # 3. add one hot encoded genres to df_movies        
    df[list(actors)] = df_movie_actors

    # 4. turn NaN values to zeros
    df = df.fillna(0)

    df = df.drop(columns=['productionCountries'])
    return df

In [None]:
df_joined.to_csv("regressionPreprocessing_beforeEncoding.csv", index=False)

In [32]:
df_joined = pd.read_csv("regressionPreprocessing_beforeEncoding.csv")
#productionCountriesEncoding()
#productionCompaniesEncoding()
#spokenLanguagesEncoding()
df_joined = actorsEncoding(df_joined)
df_joined.head(3)

KeyboardInterrupt: 

In [None]:
df_joined.to_csv("regressionPreprocessing_beforeEncoding2.csv", index=False)

In [None]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
director_target = label_encoder.fit_transform(df_joined['director'])
df_joined = df_joined.drop(columns=['director'])
df_joined['director'] = director_target

In [36]:
features_to_remove = ['id','imdb_id','movieId','tmdbId','ratingID','production_countries','production_companies','original_language','spoken_languages','vote_count']
for i in features_to_remove:
    if i in df_joined.columns:
        df_joined = df_joined.drop(columns=i)

In [None]:
print("Length before export: " + str(len(df_joined)))
# save to file
df_joined.to_csv("regressionPreprocessing.csv", index=False)

### Preprocessing für Classification (Ratings wird in Labels konvertiert)

In [None]:
df_classification = df_joined.copy()
df_labels = pd.DataFrame(columns=['Rating_Label'])

for index, row in df_classification.iterrows():
    temp = pd.Series(['Rating_Label'])
    wert = row.rating
    if wert >= 4.5:
        temp['Rating_Label'] = 'Very Good'
    elif wert >= 4.0:
        temp['Rating_Label'] = 'Good'
    elif wert >= 3.0:
        temp['Rating_Label'] = 'Ok'
    elif wert >= 2.0: 
        temp['Rating_Label'] = 'Bad'
    else:
        temp['Rating_Label'] = 'Very Bad'
    df_labels = df_labels.append(temp, ignore_index=True)
df_classification['Rating_Label'] = df_labels['Rating_Label']
df_classification = df_classification.drop(columns=['rating'])
df_classification.head(5)

In [None]:
print("Length before export: " + str(len(df_classification)))
# save to file
df_classification.to_csv("classificationPreprocessing.csv", index=False)