# Super Heroes Popularity

In [1]:
from pyspark.sql import SparkSession 
from pyspark.sql import types as data_types
from pyspark.sql.functions import col, size, split, sum, min, max

In [2]:
spark = SparkSession.builder.appName("SuperHeroPopularity").getOrCreate()

## Loading Data

In [3]:
name_schema = [
    data_types.StructField("id", data_types.StringType(), True),
    data_types.StructField("name", data_types.StringType(), True)
]
df_names = spark.read.schema( data_types.StructType(fields=name_schema)).option("sep"," ").csv("marvel-names.txt")

In [4]:
df_names.show(5)

+---+--------------------+
| id|                name|
+---+--------------------+
|  1|24-HOUR MAN/EMMANUEL|
|  2|3-D MAN/CHARLES CHAN|
|  3|    4-D MAN/MERCURIO|
|  4|             8-BALL/|
|  5|                   A|
+---+--------------------+
only showing top 5 rows



In [5]:
graph_schema = [
    data_types.StructField("value", data_types.StringType(), True)
]
df_marvel_lines = spark.read.schema(data_types.StructType(fields=graph_schema)).text("marvel-graph.txt")

In [6]:
df_marvel_lines.show(5)

+--------------------+
|               value|
+--------------------+
|5988 748 1722 375...|
|5989 4080 4264 44...|
|5982 217 595 1194...|
|5983 1165 3836 43...|
|5980 2731 3712 15...|
+--------------------+
only showing top 5 rows



## Making Tranformations to get a Usefull Dataframe with Heroes Connections Sum

In [7]:
graph_value_column = col("value")
graph_value_spliced_column = split(graph_value_column, " ")
graph_value_spliced_column_size = size(graph_value_spliced_column)

In [13]:
connections_df = df_marvel_lines.withColumn("id", graph_value_spliced_column[0]).withColumn("connections", graph_value_spliced_column_size -1).groupBy("id").agg(sum("connections").alias("connections"))

In [14]:
connections_df.show(5)

+----+-----------+
|  id|connections|
+----+-----------+
| 691|          7|
|1159|         12|
|3959|        143|
|1572|         36|
|2294|         15|
+----+-----------+
only showing top 5 rows



## Checking the Most Popular and Unpopular Heroes

### Most Unpopular

In [17]:
least_popular_value = connections_df.agg(min("connections").alias("min")).first()
least_popular_value = least_popular_value.asDict(0)['min']
most_unpopular = connections_df.filter(connections_df["connections"] == least_popular_value)
most_unpopular = most_unpopular.join(df_names, ["id"])
most_unpopular.show()

+----+-----------+--------------------+
|  id|connections|                name|
+----+-----------+--------------------+
| 467|          1|        BERSERKER II|
| 577|          1|              BLARE/|
|3490|          1|MARVEL BOY II/MARTIN|
|3489|          1|MARVEL BOY/MARTIN BU|
|2139|          1|      GIURESCU, RADU|
+----+-----------+--------------------+
only showing top 5 rows



In [19]:
most_popular_value = connections_df.agg(max("connections").alias("max")).first()
most_popular_value = most_popular_value.asDict(0)['max']
most_popular = connections_df.filter(connections_df["connections"] == most_popular_value)
most_popular = most_popular.join(df_names, ["id"])
most_popular.show()

+---+-----------+---------------+
| id|connections|           name|
+---+-----------+---------------+
|859|       1937|CAPTAIN AMERICA|
+---+-----------+---------------+

