In [1]:
#necessary libraries and modules to import

import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [2]:
#converts tsv files

tsv_file = './zippedData/rt.movie_info.tsv.gz'
convert = pd.read_table(tsv_file, delimiter='\t')
convert.to_csv('./zippedData/rt.movie_info.csv.gz')

tsv_file_r = './zippedData/rt.reviews.tsv.gz'
convert_r = pd.read_table(tsv_file_r, delimiter='\t', encoding = 'unicode_escape')
convert_r.to_csv('./zippedData/rt.reviews.csv.gz')

In [3]:
#importing batch data using os

csv_files = glob("./zippedData/*.csv.gz")
csv_files

['./zippedData\\bom.movie_gross.csv.gz',
 './zippedData\\imdb.name.basics.csv.gz',
 './zippedData\\imdb.title.akas.csv.gz',
 './zippedData\\imdb.title.basics.csv.gz',
 './zippedData\\imdb.title.crew.csv.gz',
 './zippedData\\imdb.title.principals.csv.gz',
 './zippedData\\rt.movie_info.csv.gz',
 './zippedData\\rt.reviews.csv.gz',
 './zippedData\\tmdb.movies.csv.gz',
 './zippedData\\tn.movie_budgets.csv.gz']

In [4]:
# cleaning the filenames

csv_files_dict = {} #create a dictionary of datasets
for filename in csv_files: #create a for loop to batch clean files
    filename_cleaned = os.path.basename(filename).replace(".csv", "").replace(".", "_") #remove .csv file extensions
    filename_df = pd.read_csv(filename, index_col=0, encoding='utf-8') 
    csv_files_dict[filename_cleaned] = filename_df #load .csv file as dataframe using col 1 as index and encode in utf-8 and save as the cleaned filename

In [5]:
print(csv_files_dict.keys())

dict_keys(['bom_movie_gross_gz', 'imdb_name_basics_gz', 'imdb_title_akas_gz', 'imdb_title_basics_gz', 'imdb_title_crew_gz', 'imdb_title_principals_gz', 'rt_movie_info_gz', 'rt_reviews_gz', 'tmdb_movies_gz', 'tn_movie_budgets_gz'])


In [6]:
movie_budgets_df = csv_files_dict['tn_movie_budgets_gz']
rotten_movie_genres_df = csv_files_dict['rt_movie_info_gz']
rotten_movie_reviews_df = csv_files_dict['rt_reviews_gz']
tmdb_movies_df = csv_files_dict['tmdb_movies_gz']
tmdb_genre_ids_df = pd.read_json('tmdb_genre_ids.json')

rotten_movie_genres_df.head()

rotten_movie_reviews_df.head()

In [7]:
tmdb_movies_df.iloc[0:50,:]

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186
5,"[12, 14, 10751]",32657,en,Percy Jackson & the Olympians: The Lightning T...,26.691,2010-02-11,Percy Jackson & the Olympians: The Lightning T...,6.1,4229
6,"[28, 12, 14, 878]",19995,en,Avatar,26.526,2009-12-18,Avatar,7.4,18676
7,"[16, 10751, 35]",10193,en,Toy Story 3,24.445,2010-06-17,Toy Story 3,7.7,8340
8,"[16, 10751, 35]",20352,en,Despicable Me,23.673,2010-07-09,Despicable Me,7.2,10057
9,"[16, 28, 35, 10751, 878]",38055,en,Megamind,22.855,2010-11-04,Megamind,6.8,3635


In [8]:
tmdb_movies_df.shape

(26517, 9)

In [9]:
tmdb_movies_df['genre_ids'] = tmdb_movies_df['genre_ids'].astype(str)

In [10]:
tmdb_movies_df['genre_ids'] = tmdb_movies_df['genre_ids'].str.strip('[]')
tmdb_movies_df['genre_ids'] = tmdb_movies_df['genre_ids'].str.strip('')

In [11]:
tmdb_movies_df

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"12, 14, 10751",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"14, 12, 16, 10751",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"12, 28, 878",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"16, 35, 10751",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"28, 878, 12",27205,en,Inception,27.920,2010-07-16,Inception,8.3,22186
...,...,...,...,...,...,...,...,...,...
26512,"27, 18",488143,en,Laboratory Conditions,0.600,2018-10-13,Laboratory Conditions,0.0,1
26513,"18, 53",485975,en,_EXHIBIT_84xxx_,0.600,2018-05-01,_EXHIBIT_84xxx_,0.0,1
26514,"14, 28, 12",381231,en,The Last One,0.600,2018-10-01,The Last One,0.0,1
26515,"10751, 12, 28",366854,en,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1


In [12]:
#split the genre column to individual genre columns
#use a lambda function to split the genre list at the comma

tmdb_movies_df['genre_ids'] = tmdb_movies_df['genre_ids'].apply(lambda x: x.split(",") if x else x)

tmdb_movies_df.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


ValueError: Can only compare identically-labeled Series objects

#create a set (unordered) of all genres (no duplicate values) using a for loop

all_genres = set()
for genres in tmdb_movies_df['genre_ids']: #for all genres in the move_details_df 'genres' column
    if genres: #if in genres
        all_genres.update(genres) #add to the set called all_genres

all_genres

#add new genre cols from all_genres set and fill with zero values using numpy with a for loop; 

for genre in all_genres: #for genre in all_genres set
    tmdb_movies_df[genre] = np.zeros(shape=tmdb_movies_df.shape[0]) 
        #create new genre column such that it is filled with zeroes 
        #with a shape equal to first dataframe col
                                
tmdb_movies_df.head() #return first 5 rows of dataframe

#set the genre value to 1 if the film is of that genre with a nested for loop

for index, row in tmdb_movies_df.iterrows(): #for index, row position in dataframe, iterating throw rows
    if row['genre_ids']: #if in genres col
        for genre in row['genre_ids']: #for genre in genres col
            tmdb_movies_df.loc[index, genre] = 1 #set the value of each matching index/genre position to 1

tmdb_movies_df.head() #return first 5 rows of dataframe

In [17]:
tmdb_genre_ids_df

Unnamed: 0,genres
0,"{'id': 28, 'name': 'Action'}"
1,"{'id': 12, 'name': 'Adventure'}"
2,"{'id': 16, 'name': 'Animation'}"
3,"{'id': 35, 'name': 'Comedy'}"
4,"{'id': 80, 'name': 'Crime'}"
5,"{'id': 99, 'name': 'Documentary'}"
6,"{'id': 18, 'name': 'Drama'}"
7,"{'id': 10751, 'name': 'Family'}"
8,"{'id': 14, 'name': 'Fantasy'}"
9,"{'id': 36, 'name': 'History'}"


In [18]:
keys = tmdb_genre_ids_df.genres.iloc[0].keys() #Get dictionary keys

#Keep track of columns we make for subsequent preview
genre_cols = []

#Create a new feature for each of these keys
for key in keys:
    new_col = 'genres_{}'.format(key) #Create new column name
    tmdb_genre_ids_df[new_col] = tmdb_genre_ids_df.genres.map(lambda x: x[key]) #Create a new column
    genre_cols.append(new_col)
    
tmdb_genre_ids_df[genre_cols]

Unnamed: 0,genres_id,genres_name
0,28,Action
1,12,Adventure
2,16,Animation
3,35,Comedy
4,80,Crime
5,99,Documentary
6,18,Drama
7,10751,Family
8,14,Fantasy
9,36,History


In [19]:
tmdb_genre_cols = tmdb_genre_ids_df[genre_cols]

In [20]:
tmdb_genre_cols

Unnamed: 0,genres_id,genres_name
0,28,Action
1,12,Adventure
2,16,Animation
3,35,Comedy
4,80,Crime
5,99,Documentary
6,18,Drama
7,10751,Family
8,14,Fantasy
9,36,History


In [21]:
genres_dict = dict(zip(tmdb_genre_cols['genres_id'], tmdb_genre_cols['genres_name']))
genres_dict

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

In [22]:
tmdb_movies_genres_df = tmdb_movies_df.rename(columns={'genres_dict'})
tmdb_movies_genres_df

TypeError: 'set' object is not callable

In [None]:
#tmdb_movies_df.rename(columns={tmdb_genre_cols['genres_id']:tmdb_genre_cols['genres_name']})
 

keys = tmdb_genre_cols['genres_id']
values = tmdb_genre_cols['genres_name']

genre_names_list = tmdb_genre_cols['genres_name'].tolist()
genre_names_set = set(genre_names_list)
genre_names_set

#add new genre cols from all_genres set and fill with zero values using numpy with a for loop; 

for genre in genre_names_set: #for genre in all_genres set
    tmdb_movies_df[genre] = np.zeros(shape=tmdb_movies_df.shape[0]) 
        #create new genre column such that it is filled with zeroes 
        #with a shape equal to first dataframe col
                                
tmdb_movies_df.head() #return first 5 rows of dataframe