## Cargo el dataframe de actores

In [1]:
# Load the tab-separated actors file as the vertex table.
# header=True takes the column names from the first line; no schema is
# supplied, so the values come back as strings (see the Row printed below).
df_actors = (
    spark.read
    .format("csv")
    .options(delimiter='\t', header=True)
    .load('imdb_actors_key_noheader.tsv')
)

In [2]:
# Number of rows loaded into the actors DataFrame.
df_actors.count()

17577

In [3]:
# Peek at the first row to check the column names and value types.
df_actors.first()

Row(id=u'15629', name=u'Rudder, Michael (I)', movie_count=u'12', category=u'Thriller', detail=u'Action:1,Comedy:1,Drama:1,Fantasy:1,Horror:1,NULL:2,Romance:1,Sci-Fi:1,Thriller:2,War:1')

## Cargo el dataframe de aristas

In [4]:
# Load the tab-separated edge list (one row per pair of actors).
# header=True takes the column names from the first line of the file.
df_edges = (
    spark.read
    .format("csv")
    .options(delimiter='\t', header=True)
    .load('imdb_actor_edges.tsv')
)

In [5]:
# Number of rows loaded into the edges DataFrame.
df_edges.count()

287074

In [6]:
# Peek at the first edge row to check the column names (src, dst, ...).
df_edges.first()

Row(src=u'17776', dst=u'17778', times_acted_together=u'6')

## Creo el grafo

In [7]:
# Import only the name this notebook uses instead of a wildcard import,
# so it is clear where GraphFrame comes from.
from graphframes import GraphFrame

In [8]:
# Build the graph: actors are the vertices, co-appearance rows are the edges.
g = GraphFrame(v=df_actors, e=df_edges)

In [9]:
# Sanity check: vertex count of the graph (compare with df_actors.count()).
g.vertices.count()

17577

In [10]:
# Sanity check: edge count of the graph (compare with df_edges.count()).
g.edges.count()

287074

## Busco a nuestra estrella

In [11]:
# Look up the vertex whose name is exactly 'Bacon, Kevin' and keep its first Row.
kevin_bacon_rows = g.vertices.where("name = 'Bacon, Kevin'")
bacon = kevin_bacon_rows.first()

In [12]:
# Display the Row found for Kevin Bacon (its id is used in the queries below).
bacon

Row(id=u'3257', name=u'Bacon, Kevin', movie_count=u'43', category=u'Drama', detail=u'Adventure:1,Comedy:3,Documentary:10,Drama:8,Family:3,Music:2,Mystery:3,NULL:1,Short:9,Thriller:2,War:1')

## Actores que actuaron con Bacon

In [13]:
# Edges that touch vertex 3257 (the id shown for Kevin Bacon) at either endpoint.
all_edges = g.edges
bacon_actors = all_edges.where("src = 3257 or dst = 3257")

In [14]:
# Print the first rows of the edges touching Bacon (show() prints the top 20).
bacon_actors.show()

+-----+----+--------------------+
|  src| dst|times_acted_together|
+-----+----+--------------------+
| 8087|3257|                   4|
| 6895|3257|                   2|
|  708|3257|                   2|
| 8982|3257|                   2|
|10966|3257|                   2|
| 2662|3257|                   2|
| 7105|3257|                   3|
|  519|3257|                   2|
| 7762|3257|                   2|
| 8578|3257|                   3|
| 3169|3257|                   2|
| 8996|3257|                   2|
| 8999|3257|                   2|
| 9386|3257|                   2|
| 5612|3257|                   3|
| 3160|3257|                   2|
| 4248|3257|                   2|
| 3242|3257|                   4|
| 3133|3257|                   2|
| 1064|3257|                   2|
+-----+----+--------------------+
only showing top 20 rows



In [15]:
# Total number of edges with Bacon at one endpoint.
bacon_actors.count()

101

## Lo mismo pero con motif

In [16]:
# Same neighbourhood query expressed as a motif: match every single edge
# a -> b, then keep the matches where Bacon (id 3257) is either endpoint.
single_edge_matches = g.find("(a)-[e]->(b)")
motifs = single_edge_matches.where("a.id = 3257 or b.id = 3257")
motifs.show()

+--------------------+--------------+--------------------+
|                   a|             e|                   b|
+--------------------+--------------+--------------------+
|[8087,Sedgwick, K...| [8087,3257,4]|[3257,Bacon, Kevi...|
|[6895,Matlin, Mar...| [6895,3257,2]|[3257,Bacon, Kevi...|
|[708,Murphy, Eddi...|  [708,3257,2]|[3257,Bacon, Kevi...|
|[8982,Bonham Cart...| [8982,3257,2]|[3257,Bacon, Kevi...|
|[10966,Ramis, Har...|[10966,3257,2]|[3257,Bacon, Kevi...|
|[2662,Jackson, Ja...| [2662,3257,2]|[3257,Bacon, Kevi...|
|[7105,Allen, Kare...| [7105,3257,3]|[3257,Bacon, Kevi...|
|[519,Masterson, M...|  [519,3257,2]|[3257,Bacon, Kevi...|
|[7762,Quinlan, Ka...| [7762,3257,2]|[3257,Bacon, Kevi...|
|[8578,Linney, Lau...| [8578,3257,3]|[3257,Bacon, Kevi...|
|[3169,Willis, Bru...| [3169,3257,2]|[3257,Bacon, Kevi...|
|[8996,MacLaine, S...| [8996,3257,2]|[3257,Bacon, Kevi...|
|[8999,Schiffer, C...| [8999,3257,2]|[3257,Bacon, Kevi...|
|[9386,McRobbie, P...| [9386,3257,2]|[3257,Bacon, Kevi..

In [17]:
# Number of motif matches; expected to agree with bacon_actors.count().
motifs.count()

101

## Cargo las aristas invertidas

In [18]:
# Load the second edge file (the reversed edges, per the section heading).
df_edges2 = (
    spark.read
    .format("csv")
    .options(delimiter='\t', header=True)
    .load('imdb_actor_edges2.tsv')
)

In [19]:
# Row count of the second edge file (compare with df_edges.count()).
df_edges2.count()

287074

## Uno los dos conjuntos de aristas

In [20]:
# Stack both edge sets into one DataFrame.
# union() replaces unionAll(), which is deprecated since Spark 2.0; both
# append rows by column position and keep duplicates.
# NOTE(review): assumes df_edges2 has the same column order as df_edges - confirm.
df_edges_all = df_edges.union(df_edges2)

In [21]:
# Row count after stacking both edge sets.
df_edges_all.count()

574148

## Y creo un nuevo grafo con estas

In [22]:
# Second graph: same vertices, but with the combined edge set.
g2 = GraphFrame(v=df_actors, e=df_edges_all)

In [23]:
# Edge count of the second graph (compare with df_edges_all.count()).
g2.edges.count()

574148

## Busco los actores a grado 2 de Bacon

In [24]:
# Two-hop motif a -> b -> c, restricted to paths that start at Bacon (id 3257).
two_hop_matches = g2.find("(a)-[e]->(b); (b)-[e2]->(c)")
motifs2 = two_hop_matches.where("a.id = 3257")

In [25]:
# Print the first two-hop paths that start at Bacon.
motifs2.show()

+--------------------+-------------+--------------------+--------------+--------------------+
|                   a|            e|                   b|            e2|                   c|
+--------------------+-------------+--------------------+--------------+--------------------+
|[3257,Bacon, Kevi...|[3257,9079,2]|[9079,Fonda, Pete...| [9079,8964,2]|[8964,Sorvino, Mi...|
|[3257,Bacon, Kevi...|[3257,9079,2]|[9079,Fonda, Pete...|  [9079,503,2]|[503,Diaz, Camero...|
|[3257,Bacon, Kevi...|[3257,9079,2]|[9079,Fonda, Pete...| [9079,9049,2]|[9049,Davis, Geen...|
|[3257,Bacon, Kevi...|[3257,9079,2]|[9079,Fonda, Pete...| [9079,9098,2]|[9098,Stone, Shar...|
|[3257,Bacon, Kevi...|[3257,9079,2]|[9079,Fonda, Pete...|  [9079,659,2]|[659,Moore, Julia...|
|[3257,Bacon, Kevi...|[3257,9079,2]|[9079,Fonda, Pete...|  [9079,447,2]|[447,Duvall, Robe...|
|[3257,Bacon, Kevi...|[3257,9079,2]|[9079,Fonda, Pete...| [9079,7003,2]|[7003,Cassel, Sey...|
|[3257,Bacon, Kevi...|[3257,9079,2]|[9079,Fonda, Pete...| [9

In [26]:
# Number of two-hop paths found (paths, not distinct actors).
motifs2.count()

11063

In [None]:
# Variant that keeps the start vertex 'a' together with the 2-hop endpoint 'c'.
# union() replaces unionAll(), which is deprecated since Spark 2.0; both
# append rows by column position and keep duplicates.
# NOTE(review): grade2 is reassigned by the following cell, so this result
# is not what the later counts use.
grade2_a = motifs2.select("a.id", "a.name")
grade2_c = motifs2.select("c.id", "c.name")
grade2 = grade2_a.union(grade2_c)

In [28]:
# Keep only the id and name of the endpoint 'c' of each two-hop path.
endpoint_columns = ["c.id", "c.name"]
grade2 = motifs2.select(endpoint_columns)

In [29]:
# One row per two-hop path, so this matches motifs2.count().
grade2.count()

11063

In [30]:
# Peek at the first endpoint row (id, name).
grade2.first()

Row(id=u'8964', name=u'Sorvino, Mira')

In [31]:
# Drop the paths that lead back to Bacon himself, de-duplicate, and count.
# NOTE(review): this counts every actor reachable by some 2-hop path, which
# can include direct co-stars of Bacon - confirm that is the intended meaning.
endpoints_without_bacon = grade2.where("id != 3257")
endpoints_without_bacon.distinct().count()

2485

## Cuento los triángulos

In [32]:
# Cyclic three-vertex motif a -> b -> c -> a over the combined edge set.
# NOTE(review): every rotation of a cycle (a,b,c / b,c,a / c,a,b) is a
# separate match, so the row count is larger than the number of distinct
# triangles - confirm whether de-duplication is wanted.
triangle_pattern = "; ".join(["(a)-[e]->(b)", "(b)-[e2]->(c)", "(c)-[e3]->(a)"])
triangles = g2.find(triangle_pattern)
triangles.show()

+--------------------+---------------+--------------------+---------------+--------------------+---------------+
|                   a|              e|                   b|             e2|                   c|             e3|
+--------------------+---------------+--------------------+---------------+--------------------+---------------+
|[17776,Wood, T.J....|[17776,17778,6]|[17778,Black, Ric...|[17778,17777,6]|[17777,Sterne, Je...|[17777,17776,8]|
|[5578,Gale, Vince...|  [5578,9770,3]|[9770,Gray, G. Mi...|   [9770,814,2]|[814,Fletcher, Br...|   [814,5578,2]|
|[5578,Gale, Vince...|  [5578,9770,3]|[9770,Gray, G. Mi...|  [9770,9767,2]|[9767,Santiago, Z...|  [9767,5578,3]|
|[5578,Gale, Vince...|  [5578,9770,3]|[9770,Gray, G. Mi...|  [9770,4014,2]|[4014,Cubitt, Dav...|  [4014,5578,3]|
|[5578,Gale, Vince...|  [5578,9770,3]|[9770,Gray, G. Mi...| [9770,14813,2]|[14813,Armour, No...| [14813,5578,2]|
|[5578,Gale, Vince...|   [5578,929,2]|[929,Holden, Mark...|   [929,7902,3]|[7902,Bivens, J.B...|

In [33]:
# Number of motif matches (rows), not necessarily distinct triangles.
triangles.count()

21283836

## Camino mínimo desde Bacon hasta Henriksen, Gastón Pauls y Andrea Pietra

In [34]:
# Breadth-first search from Bacon (id 3257) to vertex 8473, with no edge
# filter and paths limited to at most 4 edges.
paths = g2.bfs(
    fromExpr="id = 3257",
    toExpr="id = 8473",
    edgeFilter=None,
    maxPathLength=4,
)

In [35]:
# Number of paths returned by the search.
paths.count()

2

In [12]:
# Print the paths found: from, intermediate vertices/edges, to.
paths.show()

+--------------------+-------------+--------------------+-------------+--------------------+
|                from|           e0|                  v1|           e1|                  to|
+--------------------+-------------+--------------------+-------------+--------------------+
|[3257,Bacon, Kevi...|[3257,7884,2]|[7884,Paxton, Bil...|[7884,8473,2]|[8473,Henriksen, ...|
|[3257,Bacon, Kevi...|[3257,8468,2]|[8468,Woodruff Jr...|[8468,8473,4]|[8473,Henriksen, ...|
+--------------------+-------------+--------------------+-------------+--------------------+



In [36]:
# Breadth-first search from Bacon (id 3257) to vertex 4763, with no edge
# filter and paths limited to at most 4 edges.
paths2 = g2.bfs(
    fromExpr="id = 3257",
    toExpr="id = 4763",
    edgeFilter=None,
    maxPathLength=4,
)

In [14]:
# Print the paths found from Bacon to vertex 4763.
paths2.show()

+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+
|                from|           e0|                  v1|           e1|                  v2|           e2|                  to|
+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+
|[3257,Bacon, Kevi...|[3257,7884,2]|[7884,Paxton, Bil...|[7884,8473,2]|[8473,Henriksen, ...|[8473,4763,2]|[4763,Pauls, Gast...|
|[3257,Bacon, Kevi...|[3257,8468,2]|[8468,Woodruff Jr...|[8468,8473,4]|[8473,Henriksen, ...|[8473,4763,2]|[4763,Pauls, Gast...|
+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+



In [37]:
# Breadth-first search from Bacon (id 3257) to vertex 9611, with no edge
# filter and paths limited to at most 4 edges.
paths3 = g2.bfs(
    fromExpr="id = 3257",
    toExpr="id = 9611",
    edgeFilter=None,
    maxPathLength=4,
)

In [16]:
# Print the paths found from Bacon to vertex 9611.
paths3.show()

+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+
|                from|           e0|                  v1|           e1|                  v2|           e2|                  v3|           e3|                  to|
+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+
|[3257,Bacon, Kevi...|[3257,7884,2]|[7884,Paxton, Bil...|[7884,8473,2]|[8473,Henriksen, ...|[8473,4763,2]|[4763,Pauls, Gast...|[4763,9611,2]|[9611,Pietra, And...|
|[3257,Bacon, Kevi...|[3257,8468,2]|[8468,Woodruff Jr...|[8468,8473,4]|[8473,Henriksen, ...|[8473,4763,2]|[4763,Pauls, Gast...|[4763,9611,2]|[9611,Pietra, And...|
+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+-------------+--------------------+



## Intento encontrar el camino mínimo entre Bacon y Francella

In [None]:
# Same search toward vertex 5222 (presumably Francella, per the section
# heading - confirm), with the path-length cap raised to 5 edges.
# Note that this reuses the name `paths` from the earlier Henriksen search.
paths = g2.bfs(
    fromExpr="id = 3257",
    toExpr="id = 5222",
    edgeFilter=None,
    maxPathLength=5,
)

In [None]:
# Peek at one vertex row to recall the available columns (e.g. category).
g2.vertices.first()

In [8]:
# Vertices whose category is not 'Adult'.
# NOTE(review): under SQL comparison rules a null category would also be
# dropped by this predicate - confirm whether the data has nulls.
all_vertices = g2.vertices
v2 = all_vertices.where("category != 'Adult'")

In [9]:
# Single-edge motif matches where neither endpoint is in the 'Adult' category.
edge_matches = g2.find("(a)-[e]->(b)")
paths_no_adults = edge_matches.where(
    "a.category != 'Adult' and b.category != 'Adult'"
)

In [12]:
# Flatten the matched edge struct back into plain edge columns, then count.
edge_columns = ["e.src", "e.dst", "e.times_acted_together"]
e2 = paths_no_adults.select(edge_columns)
e2.count()

441772

In [13]:
# Third graph: only the non-'Adult' vertices and the edges between them.
g3 = GraphFrame(v=v2, e=e2)

In [14]:
# Vertex count of the filtered graph (compare with g.vertices.count()).
g3.vertices.count()

15698