In [1]:
from __future__ import print_function
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import gzip
import numpy as np
import seaborn as sns
import itertools
from IPython.display import Markdown, display
from mpl_toolkits.mplot3d import axes3d, Axes3D  # <-- Note the capitalization!
%matplotlib inline

In [2]:
from pathlib import Path
import sys

def load_pexplorer():   
    link_path = "pexplorer"
    q = Path(link_path)
    while not q.exists():
        link_path = "../" + link_path
        q = Path(link_path)
        if len(link_path) > 500: raise Exception("Directory pexplorer not found.") 
    sys.path.append(link_path)
    import pexplorer as px
    return px

px = load_pexplorer()

![](images/Data-Schema-Capstone.png)

In [3]:
def load_file(filename,names, title):
    user_info = pd.read_csv("data/" + filename, sep='\t', encoding= 'latin', names = names)
    display(Markdown("### " + title +" Table \n"))
    display(user_info.head(5))
    display(Markdown("\n-----------"))
    return user_info

In [4]:
user_info_genres = load_file('title_genres.tsv', ['titleId','genres'], 'Genres')
user_info_ratings = load_file('title_ratings.tsv', ["titleId","rating","ratingCount","topRank","bottomRank","topRankTV"], 'Rating')
user_info_display = load_file('title_display.tsv', ["titleId","title","year","adult","runtimeMinutes","imageUri","imageId","type","originalTitle"], 'Display')
user_info_noms = load_file('award_noms.tsv', ["awardId","eventId","event","eventEditionId","award","category","year"], 'Nomination')
user_info_awards = load_file('title_awards.tsv', ["titleId","awardId","winner"], 'Awards')
user_info_releases = load_file('title_releases.tsv', ["titleId","ordering","date","region","premiere","wide","premiereType","festival","attributes"], 'Releases')

### Genres Table 


Unnamed: 0,titleId,genres
0,tt0015724,DramaMysteryRomanceThriller
1,tt0035423,ComedyFantasyRomance
2,tt0059900,DramaFantasy
3,tt0064994,ComedyDramaRomance
4,tt0065188,Drama



-----------

### Rating Table 


Unnamed: 0,titleId,rating,ratingCount,topRank,bottomRank,topRankTV
0,tt0015724,6.2,19,\N,\N,\N
1,tt0035423,6.4,72032,3107,2579,\N
2,tt0059900,6.8,21,\N,\N,\N
3,tt0064994,7.6,1387,\N,\N,\N
4,tt0065188,6.7,19,\N,\N,\N



-----------

### Display Table 


Unnamed: 0,titleId,title,year,adult,runtimeMinutes,imageUri,imageId,type,originalTitle
0,tt0015724,Dama de noche,1993,0,102,https://m.media-amazon.com/images/M/MV5BODY4NDE4Nzg2NV5BMl5BanBnXkFtZTYwMDEyNjk5._V1_.jpg,rm615620352,movie,Dama de noche
1,tt0035423,Kate & Leopold,2001,0,118,https://m.media-amazon.com/images/M/MV5BNmNlN2VlOTctYTdhMS00NzUxLTg0ZGMtYWE2ZTJmMThlMTk2XkEyXkFqcGdeQXVyMzI0NDc4ODY@._V1_.jpg,rm2171875072,movie,Kate & Leopold
2,tt0059900,"Wenn du groÃ bist, lieber Adam",1990,0,78,https://m.media-amazon.com/images/M/MV5BMTYzOTEzNTQ3Ml5BMl5BanBnXkFtZTcwNjM4MzAyMQ@@._V1_.jpg,rm2847710208,movie,"Wenn du groÃ bist, lieber Adam"
3,tt0064994,Larks on a String,1990,0,94,https://m.media-amazon.com/images/M/MV5BMTE5OTkwNDc0MF5BMl5BanBnXkFtZTcwNTg0MDQyMQ@@._V1_.jpg,rm905747712,movie,SkrivÃ¡nci na niti
4,tt0065188,"Vojtech, receny sirotek",1990,0,80,\N,\N,movie,"Vojtech, receny sirotek"



-----------

### Nomination Table 


Unnamed: 0,awardId,eventId,event,eventEditionId,award,category,year
0,an0015278,ev0000206,Daytime Emmy Awards,ee0021710,Daytime Emmy,Outstanding Drama Series,1990
1,an0015279,ev0000206,Daytime Emmy Awards,ee0021710,Daytime Emmy,Outstanding Drama Series,1990
2,an0015280,ev0000206,Daytime Emmy Awards,ee0021710,Daytime Emmy,Outstanding Drama Series,1990
3,an0015281,ev0000206,Daytime Emmy Awards,ee0021710,Daytime Emmy,Outstanding Drama Series,1990
4,an0015283,ev0000206,Daytime Emmy Awards,ee0021710,Daytime Emmy,Outstanding Animated Program,1990



-----------

### Awards Table 


Unnamed: 0,titleId,awardId,winner
0,tt0015724,an0322954,0
1,tt0035423,an0033807,0
2,tt0035423,an0055446,0
3,tt0035423,an0063409,0
4,tt0035423,an0063447,1



-----------

### Releases Table 


Unnamed: 0,titleId,ordering,date,region,premiere,wide,premiereType,festival,attributes
0,tt0015724,1,1993-03-18,MX,0,1,\N,\N,\N
1,tt0015724,2,0000-00-00,US,0,0,\N,Chicago International Film Festival,\N
2,tt0035423,10,2002-02-14,NL,0,1,\N,\N,\N
3,tt0035423,11,2002-02-22,TR,0,1,\N,\N,\N
4,tt0035423,1,2001-12-11,US,1,0,\N,\N,"Los Angeles, California"



-----------

In [5]:
user_info_genres["genres_list"] = user_info_genres.genres.apply(lambda x: x.split("\x02"))

In [6]:
user_info_genres

Unnamed: 0,titleId,genres,genres_list
0,tt0015724,DramaMysteryRomanceThriller,"[Drama, Mystery, Romance, Thriller]"
1,tt0035423,ComedyFantasyRomance,"[Comedy, Fantasy, Romance]"
2,tt0059900,DramaFantasy,"[Drama, Fantasy]"
3,tt0064994,ComedyDramaRomance,"[Comedy, Drama, Romance]"
4,tt0065188,Drama,[Drama]
...,...,...,...
140865,tt9056326,ComedyFamilyMusical,"[Comedy, Family, Musical]"
140866,tt9056328,ComedyFamilyMusical,"[Comedy, Family, Musical]"
140867,tt9056334,ComedyFamilyMusical,"[Comedy, Family, Musical]"
140868,tt9056338,ComedyFamilyMusical,"[Comedy, Family, Musical]"


In [7]:
df = pd.read_parquet("df_nonoms.parquet")

In [8]:
px.check_df(df)

Unnamed: 0_level_0,col. name,type,unique,NAN(%),min,max,mean,std,25%,50%,75%,binary values
Num.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1),index,int64,52138,0.0,0,141540,85894.059553,43235.201934,50675.500000,98856.500000,124024.750000,-
2),titleId,object,52138,0.0,-,-,-,-,-,-,-,-
3),genres,object,2166,0.083567,-,-,-,-,-,-,-,-
4),rating,float64,92,0.083567,1.000000,10.000000,6.243308,1.395214,5.400000,6.400000,7.200000,-
5),ratingCount,float64,6695,0.083567,5.000000,1998757.000000,5153.560327,40946.350238,18.000000,70.000000,389.000000,-
6),title,object,46133,0.083567,-,-,-,-,-,-,-,-
7),year,object,18,0.083567,-,-,-,-,-,-,-,-
8),adult,float64,2,0.083567,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-
9),runtimeMinutes,object,296,0.083567,-,-,-,-,-,-,-,-
10),imageId,object,37293,0.083567,-,-,-,-,-,-,-,-


In [11]:
px.glimpse(df)

Rows: 52,138
Columns: 18


Unnamed: 0,dtype,sample values
titleId,object,"[tt0015724, tt0035423, tt0059900, tt0064994, tt0065188, tt0066498, tt0077432, tt0081145, tt0081721, tt0084015, tt0084548, tt0084551]"
genres,object,"[DramaMysteryRomanceThriller, ComedyFantasyRomance, DramaFantasy, ComedyDramaRomance, Drama, DramaThriller, ActionDrama, ComedyCrimeDramaFamily, DramaFamilyFantasy, CrimeDramaThriller, ComedyDramaFantasyHorrorRomance, BiographyDocumentaryHistory]"
rating,float64,"[6.2, 6.4, 6.8, 7.6, 6.7, 7.9, 5.3, 6.5, 7.3, 8.1, 7.5, 5.6]"
ratingCount,float64,"[19.0, 72032.0, 21.0, 1387.0, 1995.0, 6.0, 204.0, 276.0, 39.0, 2549.0, 9.0, 177.0]"
title,object,"[Dama de noche, Kate & Leopold, Wenn du groÃ bist, lieber Adam, Larks on a String, Vojtech, receny sirotek, The Ear, Dip huet kei bing, Me and the Kid, Vincent and Me, Goodbye Paradise, Interrogation, The Buddhist Spell]"
...,...,...
premiere,float64,"[0.0, 1.0, nan]"
wide,float64,"[1.0, 0.0, nan]"
attributes,object,"[\N, Split, Portuguese Cinematheque, Bombay, Madrid, Amsterdam, Tallinn, New York Museum of Modern Art, New York City, New York, Torontotheatrical release, Internationale Hofer Filmtage, Cannes Film Market]"
awardId,object,"[an0322954, an0033807, an0058812, an0058789, an0253498, an0044440, None, an0015473, an0044411, an0122182, an0280683, an0267235]"


In [146]:
def relat_rat(x):
    try:
        cons = x.rating * np.log(np.sqrt(x.ratingCount / (10 - x.rating)) / (2007 - int(x.year)))
    except:
        cons = 0
    return cons

In [147]:
df["relat_rat"] = df.apply(lambda x: relat_rat(x), axis=1)

In [154]:
df_c = df[["year", "title", "relat_rat", "rating"]].sort_values("relat_rat", ascending=False).reset_index(drop=True)