In [1]:
import pandas as pd
import numpy as np
import altair as alt

alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [2]:
mpg = pd.read_csv("../data/mpg.csv")
mpg

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
0,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
1,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
2,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
3,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
4,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
...,...,...,...,...,...,...,...,...,...,...,...
229,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize
230,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
231,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize
232,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize


In [3]:
mpg.trans.unique()

array(['auto(l5)', 'manual(m5)', 'manual(m6)', 'auto(av)', 'auto(s6)',
       'auto(l4)', 'auto(l3)', 'auto(l6)', 'auto(s5)', 'auto(s4)'],
      dtype=object)

In [4]:
manual = mpg[mpg.trans.str.contains("manual")]
manual.to_csv("../data/manual.csv", index=False)

In [5]:
automatic = mpg[mpg.trans.str.contains("auto")]
automatic.to_csv("../data/automatic.csv", index=False)

In [6]:
mpg_all = pd.concat([manual, automatic])

In [7]:
melted_mpg = mpg[mpg.duplicated(subset=["manufacturer", "model", "year", "trans"]) == False].melt(id_vars = ["manufacturer", "model", "year", "trans"])
melted_mpg.to_csv("../data/melted_mpg.csv", index=False)

In [8]:
melted_mpg.pivot(index=["manufacturer", "model", "year", "trans"], columns="variable", values="value").reset_index()

variable,manufacturer,model,year,trans,class,cty,cyl,displ,drv,fl,hwy
0,audi,a4,1999,auto(l5),compact,18,4,1.8,f,p,29
1,audi,a4,1999,manual(m5),compact,21,4,1.8,f,p,29
2,audi,a4,2008,auto(av),compact,21,4,2.0,f,p,30
3,audi,a4,2008,manual(m6),compact,20,4,2.0,f,p,31
4,audi,a4 quattro,1999,auto(l5),compact,16,4,1.8,4,p,25
...,...,...,...,...,...,...,...,...,...,...,...
134,volkswagen,new beetle,2008,manual(m5),subcompact,20,5,2.5,f,r,28
135,volkswagen,passat,1999,auto(l5),midsize,18,4,1.8,f,p,29
136,volkswagen,passat,1999,manual(m5),midsize,21,4,1.8,f,p,29
137,volkswagen,passat,2008,auto(s6),midsize,19,4,2.0,f,p,28


In [9]:
mpg_class = mpg[["manufacturer", "model", "year", "class"]]
mpg_class = mpg_class[mpg_class.duplicated() == False]
mpg_class.to_csv("../data/mpg_class.csv", index=False)

In [10]:
mpg_data = mpg[["manufacturer", "model", "displ", "year", "cyl", "trans", "drv", "cty", "hwy", "fl"]]
mpg_data.to_csv("../data/mpg_no_class.csv", index=False)

In [11]:
pd.merge(mpg_data, mpg_class, how="left", on=["manufacturer", "model", "year"])

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
0,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
1,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
2,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
3,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
4,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
...,...,...,...,...,...,...,...,...,...,...,...
229,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize
230,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
231,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize
232,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize


In [12]:
movies = pd.read_csv("../data/Pudding-Film-Dialogue.csv")
movies

Unnamed: 0,imdb_id,title,release_year,character,gender,words,proportion_of_dialogue,age,gross,runtimeMinutes,genres
0,tt0112579,The Bridges of Madison County,1995,Betty,woman,311,0.048639,35.0,142.0,135,Drama
1,tt0112579,The Bridges of Madison County,1995,Carolyn Johnson,woman,873,0.136534,,142.0,135,Drama
2,tt0112579,The Bridges of Madison County,1995,Eleanor,woman,138,0.021583,,142.0,135,Drama
3,tt0112579,The Bridges of Madison County,1995,Francesca Johns,woman,2251,0.352049,46.0,142.0,135,Drama
4,tt0112579,The Bridges of Madison County,1995,Madge,woman,190,0.029715,46.0,142.0,135,Drama
...,...,...,...,...,...,...,...,...,...,...,...
23042,tt0101414,Beauty and the Beast,1991,Lumiere,man,1063,0.104636,56.0,452.0,84,Animation
23043,tt0101414,Beauty and the Beast,1991,Maurice,man,1107,0.108967,71.0,452.0,84,Animation
23044,tt0101414,Beauty and the Beast,1991,Monsieur D'Arqu,man,114,0.011222,58.0,452.0,84,Animation
23045,tt0101414,Beauty and the Beast,1991,Mrs. Potts,woman,564,0.055517,66.0,452.0,84,Animation


In [13]:
movies[movies.title.str.contains("M[ae]n")].groupby("title").count()

Unnamed: 0_level_0,imdb_id,release_year,character,gender,words,proportion_of_dialogue,age,gross,runtimeMinutes,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3 Men and a Baby,11,11,11,11,11,11,6,11,11,11
A Few Good Men,20,20,20,20,20,20,18,20,20,20
A Man Apart,12,12,12,12,12,12,7,12,12,12
A Most Wanted Man,14,14,14,14,14,14,14,14,14,14
A Serious Man,14,14,14,14,14,14,4,14,14,14
A Single Man,7,7,7,7,7,7,6,7,7,7
All the Boys Love Mandy Lane,7,7,7,7,7,7,5,0,7,7
All the President's Men,12,12,12,12,12,12,11,0,12,12
Austin Powers: International Man of Mystery,10,10,10,10,10,10,10,10,10,10
Bloodfist VII: Manhunt,9,9,9,9,9,9,6,0,9,9


In [14]:
pd.to_datetime(movies.release_year, format="%Y")

0       1995-01-01
1       1995-01-01
2       1995-01-01
3       1995-01-01
4       1995-01-01
           ...    
23042   1991-01-01
23043   1991-01-01
23044   1991-01-01
23045   1991-01-01
23046   1991-01-01
Name: release_year, Length: 23047, dtype: datetime64[ns]

In [15]:
movies = movies.assign(release_year = pd.to_datetime(movies.release_year, format="%Y"))
movies

Unnamed: 0,imdb_id,title,release_year,character,gender,words,proportion_of_dialogue,age,gross,runtimeMinutes,genres
0,tt0112579,The Bridges of Madison County,1995-01-01,Betty,woman,311,0.048639,35.0,142.0,135,Drama
1,tt0112579,The Bridges of Madison County,1995-01-01,Carolyn Johnson,woman,873,0.136534,,142.0,135,Drama
2,tt0112579,The Bridges of Madison County,1995-01-01,Eleanor,woman,138,0.021583,,142.0,135,Drama
3,tt0112579,The Bridges of Madison County,1995-01-01,Francesca Johns,woman,2251,0.352049,46.0,142.0,135,Drama
4,tt0112579,The Bridges of Madison County,1995-01-01,Madge,woman,190,0.029715,46.0,142.0,135,Drama
...,...,...,...,...,...,...,...,...,...,...,...
23042,tt0101414,Beauty and the Beast,1991-01-01,Lumiere,man,1063,0.104636,56.0,452.0,84,Animation
23043,tt0101414,Beauty and the Beast,1991-01-01,Maurice,man,1107,0.108967,71.0,452.0,84,Animation
23044,tt0101414,Beauty and the Beast,1991-01-01,Monsieur D'Arqu,man,114,0.011222,58.0,452.0,84,Animation
23045,tt0101414,Beauty and the Beast,1991-01-01,Mrs. Potts,woman,564,0.055517,66.0,452.0,84,Animation


In [18]:
movies

Unnamed: 0,imdb_id,title,release_year,character,gender,words,proportion_of_dialogue,age,gross,runtimeMinutes,genres
0,tt0112579,The Bridges of Madison County,1995-01-01,Betty,woman,311,0.048639,35.0,142.0,135,Drama
1,tt0112579,The Bridges of Madison County,1995-01-01,Carolyn Johnson,woman,873,0.136534,,142.0,135,Drama
2,tt0112579,The Bridges of Madison County,1995-01-01,Eleanor,woman,138,0.021583,,142.0,135,Drama
3,tt0112579,The Bridges of Madison County,1995-01-01,Francesca Johns,woman,2251,0.352049,46.0,142.0,135,Drama
4,tt0112579,The Bridges of Madison County,1995-01-01,Madge,woman,190,0.029715,46.0,142.0,135,Drama
...,...,...,...,...,...,...,...,...,...,...,...
23042,tt0101414,Beauty and the Beast,1991-01-01,Lumiere,man,1063,0.104636,56.0,452.0,84,Animation
23043,tt0101414,Beauty and the Beast,1991-01-01,Maurice,man,1107,0.108967,71.0,452.0,84,Animation
23044,tt0101414,Beauty and the Beast,1991-01-01,Monsieur D'Arqu,man,114,0.011222,58.0,452.0,84,Animation
23045,tt0101414,Beauty and the Beast,1991-01-01,Mrs. Potts,woman,564,0.055517,66.0,452.0,84,Animation


In [25]:
movies = (movies.groupby(['title','gender','gross', 'runtimeMinutes', 'release_year', 'genres'])
                  .agg({'proportion_of_dialogue': 'sum', 'words': 'sum', 'age': 'mean'})
                  .reset_index())
movies

Unnamed: 0,title,gender,gross,runtimeMinutes,release_year,genres,proportion_of_dialogue,words,age
0,(500) Days of Summer,man,37.0,95,2009-01-01,Comedy,0.689838,12762,38.333333
1,(500) Days of Summer,woman,37.0,95,2009-01-01,Comedy,0.310162,5738,29.600000
2,10 Things I Hate About You,man,65.0,97,1999-01-01,Comedy,0.543089,10688,31.285714
3,10 Things I Hate About You,woman,65.0,97,1999-01-01,Comedy,0.456911,8992,22.000000
4,12 Years a Slave,man,60.0,134,2013-01-01,Biography,0.824129,16176,42.384615
...,...,...,...,...,...,...,...,...,...
3280,Zodiac,woman,41.0,157,2007-01-01,Crime,0.096957,1421,32.333333
3281,eXistenZ,man,4.0,97,1999-01-01,Horror,0.602837,5695,42.000000
3282,eXistenZ,woman,4.0,97,1999-01-01,Horror,0.397163,3752,28.500000
3283,xXx,man,211.0,124,2002-01-01,Action,0.879541,7287,42.666667


In [31]:
movies_women = movies[movies.gender == "woman"]

In [34]:
alt.Chart(movies_women).mark_point().encode(
    x="release_year:T",
    y="proportion_of_dialogue:Q"
)

In [36]:
alt.Chart(movies_women).mark_line().encode(
    x="release_year:T",
    y=alt.Y("mean(proportion_of_dialogue):Q")#.scale(domain=(1980, 2015))
)