In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark.sql.functions import explode
from pyspark.sql.functions import desc
from pyspark.sql.functions import monotonically_increasing_id

In [2]:
spark = SparkSession.builder.appName("WordFrequency").getOrCreate()
df_count = spark.read.text("Marvel+Graph.txt")
df_count.show()

+--------------------+
|               value|
+--------------------+
|5988 748 1722 375...|
|5989 4080 4264 44...|
|5982 217 595 1194...|
|5983 1165 3836 43...|
|5980 2731 3712 15...|
|5981 3569 5353 40...|
|5986 2658 3712 26...|
|5987 2614 5716 17...|
|5984 590 4898 745...|
|5985 3233 2254 21...|
|6294 4898 1127 32...|
|270 2658 3003 380...|
|271 4935 5716 430...|
|272 2717 4363 408...|
|273 1165 5013 511...|
|274 3920 5310 402...|
|275 4366 3373 158...|
|276 2277 5251 480...|
|277 1068 3495 619...|
|278 1145 667 2650...|
+--------------------+
only showing top 20 rows



In [3]:
df_nomes = spark.read.text("Marvel+Names.txt")
df_nomes.show()

+--------------------+
|               value|
+--------------------+
|1 "24-HOUR MAN/EM...|
|2 "3-D MAN/CHARLE...|
|3 "4-D MAN/MERCURIO"|
|         4 "8-BALL/"|
|               5 "A"|
|           6 "A'YIN"|
|    7 "ABBOTT, JACK"|
|         8 "ABCISSA"|
|            9 "ABEL"|
|10 "ABOMINATION/E...|
|11 "ABOMINATION |...|
|    12 "ABOMINATRIX"|
|        13 "ABRAXAS"|
|     14 "ADAM 3,031"|
|        15 "ABSALOM"|
|16 "ABSORBING MAN...|
|17 "ABSORBING MAN...|
|           18 "ACBA"|
|19 "ACHEBE, REVER...|
|       20 "ACHILLES"|
+--------------------+
only showing top 20 rows



In [4]:
words = df_count.select(explode(split(df_count.value, " ")).alias("word"))

In [6]:
filtered_words = words.filter(words.word != '')
word_freq = filtered_words.groupBy("word").count()
word_freq = word_freq.sort(desc("count"))
word_freq.show()

+----+-----+
|word|count|
+----+-----+
| 859| 1937|
|5306| 1745|
|2664| 1532|
|5716| 1429|
|6306| 1397|
|3805| 1389|
|2557| 1374|
|4898| 1348|
|5736| 1292|
| 403| 1283|
|6066| 1266|
|2650| 1247|
|2399| 1179|
|1289| 1107|
|5467| 1098|
| 133| 1097|
|6148| 1096|
| 154| 1095|
|5046| 1083|
|1602| 1082|
+----+-----+
only showing top 20 rows



In [16]:
first_row = list(word_freq.first())
last_row = list(word_freq.collect()[-1])

first_row

['859', 1937]

In [12]:
df_split = df_nomes.withColumn("ID", split(df_nomes["value"], ' "')[0].cast("int")).withColumn("Name", split(df_nomes["value"], ' "')[1])
df_split.show()

+--------------------+---+--------------------+
|               value| ID|                Name|
+--------------------+---+--------------------+
|1 "24-HOUR MAN/EM...|  1|24-HOUR MAN/EMMAN...|
|2 "3-D MAN/CHARLE...|  2|3-D MAN/CHARLES C...|
|3 "4-D MAN/MERCURIO"|  3|   4-D MAN/MERCURIO"|
|         4 "8-BALL/"|  4|            8-BALL/"|
|               5 "A"|  5|                  A"|
|           6 "A'YIN"|  6|              A'YIN"|
|    7 "ABBOTT, JACK"|  7|       ABBOTT, JACK"|
|         8 "ABCISSA"|  8|            ABCISSA"|
|            9 "ABEL"|  9|               ABEL"|
|10 "ABOMINATION/E...| 10|ABOMINATION/EMIL ...|
|11 "ABOMINATION |...| 11|ABOMINATION | MUT...|
|    12 "ABOMINATRIX"| 12|        ABOMINATRIX"|
|        13 "ABRAXAS"| 13|            ABRAXAS"|
|     14 "ADAM 3,031"| 14|         ADAM 3,031"|
|        15 "ABSALOM"| 15|            ABSALOM"|
|16 "ABSORBING MAN...| 16|ABSORBING MAN/CAR...|
|17 "ABSORBING MAN...| 17|ABSORBING MAN | M...|
|           18 "ACBA"| 18|              

In [14]:
mais_popular = df_split.filter(df_split.ID == first_row[0])
mais_popular.show()

+--------------------+---+----------------+
|               value| ID|            Name|
+--------------------+---+----------------+
|859 "CAPTAIN AMER...|859|CAPTAIN AMERICA"|
+--------------------+---+----------------+



In [17]:
menos_popular = df_split.filter(df_split.ID == last_row[0])
menos_popular.show()

+--------------------+----+--------------------+
|               value|  ID|                Name|
+--------------------+----+--------------------+
|2117 "GERVASE, LA...|2117|GERVASE, LADY ALY...|
+--------------------+----+--------------------+

