## Movie Analysis

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the movies file, replacing the CSV's own header row (header=0) with
# the snake_case column names listed below.
columnNames = [
    'star_rating',
    'title',
    'content_rating',
    'genre',
    'duration',
    'actors_list',
]
moviesData = pd.read_csv('movies.csv', names=columnNames, header=0)
# Alternative: read the file with its original header and normalise the
# column names afterwards:
#   movies = pd.read_csv(file)
#   movies.columns = movies.columns.str.replace(' ', '_')

In [7]:
# Summarise only the object-dtype columns. Comparing the `count` row across
# columns reveals empty cells: title has 979 entries while content_rating
# has only 976.
moviesData.describe(include=[object])

Unnamed: 0,title,content_rating,genre,actors_list
count,979,976,979,979
unique,975,12,16,969
top,The Girl with the Dragon Tattoo,R,Drama,"[u'Daniel Radcliffe', u'Emma Watson', u'Rupert..."
freq,2,460,278,6


In [15]:
# Selecting the column with dot notation yields a Series, so sort_values
# returns a sorted Series as well; ascending=False requests descending order.
# Tip: to get the whole DataFrame back instead, pass the column name as the
# argument of the DataFrame's own sort_values.
moviesData.title.sort_values(ascending=False).iloc[:10]

864                 [Rec]
526                  Zulu
615            Zombieland
677                Zodiac
955      Zero Dark Thirty
535                 Zelig
280    Young Frankenstein
96                Yojimbo
235               Yip Man
403            Ying xiong
Name: title, dtype: object

In [16]:
# Passing the column name to sort_values orders the rows of the complete
# DataFrame. The result is a new object: sorting alone does not modify
# moviesData.
moviesData.sort_values(by='title', ascending=False).iloc[:10]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
864,7.5,[Rec],R,Horror,78,"[u'Manuela Velasco', u'Ferran Terraza', u'Jorg..."
526,7.8,Zulu,UNRATED,Drama,138,"[u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac..."
615,7.7,Zombieland,R,Comedy,88,"[u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha..."
677,7.7,Zodiac,R,Crime,157,"[u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M..."
955,7.4,Zero Dark Thirty,R,Drama,157,"[u'Jessica Chastain', u'Joel Edgerton', u'Chri..."
535,7.8,Zelig,PG,Comedy,79,"[u'Woody Allen', u'Mia Farrow', u'Patrick Horg..."
280,8.1,Young Frankenstein,PG,Comedy,106,"[u'Gene Wilder', u'Madeline Kahn', u'Marty Fel..."
96,8.4,Yojimbo,UNRATED,Action,110,"[u'Toshir\xf4 Mifune', u'Eijir\xf4 T\xf4no', u..."
235,8.1,Yip Man,R,Action,106,"[u'Donnie Yen', u'Simon Yam', u'Siu-Wong Fan']"
403,7.9,Ying xiong,PG-13,Action,99,"[u'Jet Li', u'Tony Chiu Wai Leung', u'Maggie C..."


In [17]:
# Sort by genre first, then by duration within each genre.
moviesData.sort_values(by=['genre', 'duration']).iloc[:10]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
533,7.8,Run Lola Run,R,Action,80,"[u'Franka Potente', u'Moritz Bleibtreu', u'Her..."
633,7.7,The Warriors,R,Action,92,"[u'Michael Beck', u'James Remar', u'Dorsey Wri..."
455,7.9,Taken,PG-13,Action,93,"[u'Liam Neeson', u'Maggie Grace', u'Famke Jans..."
685,7.7,First Blood,R,Action,93,"[u'Sylvester Stallone', u'Brian Dennehy', u'Ri..."
744,7.6,Mad Max 2: The Road Warrior,R,Action,94,"[u'Mel Gibson', u'Bruce Spence', u'Michael Pre..."
619,7.7,Forbidden Planet,PASSED,Action,98,"[u'Walter Pidgeon', u'Anne Francis', u'Leslie ..."
276,8.1,A Fistful of Dollars,R,Action,99,"[u'Clint Eastwood', u'Gian Maria Volont\xe9', ..."
403,7.9,Ying xiong,PG-13,Action,99,"[u'Jet Li', u'Tony Chiu Wai Leung', u'Maggie C..."
552,7.8,Kung Fu Hustle,R,Action,99,"[u'Stephen Chow', u'Wah Yuen', u'Qiu Yuen']"
740,7.6,The Raid: Redemption,R,Action,101,"[u'Iko Uwais', u'Ananda George', u'Ray Sahetapy']"


# Filtering the DataFrame

In [26]:
# Manual (long-hand) way of building a filter: check, movie by movie,
# whether the duration is 200 minutes or more and collect the answers in a list.
booleanos = [bool(item >= 200) for item in moviesData.duration]

# Convert the list to a boolean Series that shares moviesData's index.
# Boolean indexing aligns the mask with the frame by index label, so a mask
# with a fresh 0..n-1 index would only work while moviesData keeps the
# default index. dtype=bool keeps the mask boolean even when the list is empty.
duracao = pd.Series(booleanos, index=moviesData.index, dtype=bool)

# Apply the mask to the DataFrame to get the movies with duration >= 200 min,
# returning only two columns (rows and columns selected in a single .loc call).
moviesData.loc[duracao, ['title', 'star_rating']]

Unnamed: 0,title,star_rating
2,The Godfather: Part II,9.1
7,The Lord of the Rings: The Return of the King,8.9
17,Seven Samurai,8.7
78,Once Upon a Time in America,8.4
85,Lawrence of Arabia,8.4
142,Lagaan: Once Upon a Time in India,8.3
157,Gone with the Wind,8.2
204,Ben-Hur,8.1
445,The Ten Commandments,7.9
476,Hamlet,7.8


In [28]:
# Shortcut: comparing the column directly already produces a boolean Series,
# so no explicit loop or conversion is needed.
a = moviesData['duration'].ge(180)
moviesData.loc[a]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."
8,8.9,Schindler's List,R,Biography,195,"[u'Liam Neeson', u'Ralph Fiennes', u'Ben Kings..."
17,8.7,Seven Samurai,UNRATED,Drama,207,"[u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K..."
40,8.5,The Green Mile,R,Crime,189,"[u'Tom Hanks', u'Michael Clarke Duncan', u'Dav..."


In [30]:
# Alternative form: build the boolean mask inline inside the indexer instead
# of storing it in a variable first, and show only the first five matches.
moviesData.loc[moviesData['duration'] >= 180].iloc[:5]


Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."
8,8.9,Schindler's List,R,Biography,195,"[u'Liam Neeson', u'Ralph Fiennes', u'Ben Kings..."
17,8.7,Seven Samurai,UNRATED,Drama,207,"[u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K..."
40,8.5,The Green Mile,R,Crime,189,"[u'Tom Hanks', u'Michael Clarke Duncan', u'Dav..."


In [31]:
# Filter the rows first, then pick the wanted column -> the result is
# already a Series.
# NOTE(review): the variable name says "gender" but it holds the `genre`
# column for durations >= 200; name kept in case other cells reference it.
gender_over200 = moviesData[moviesData['duration'].ge(200)]['genre']

In [None]:
# .loc method: give the row condition and the column label in one call.
# This is the preferred form and it returns a Series.
moviesData.loc[moviesData['duration'].ge(200), 'genre']