In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz
!tar xf spark-3.0.3-bin-hadoop3.2.tgz
!pip install -q findspark

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.3-bin-hadoop3.2"

In [4]:
!pip install pyspark==3.0.2

Collecting pyspark==3.0.2
  Downloading pyspark-3.0.2.tar.gz (204.8 MB)
[K     |████████████████████████████████| 204.8 MB 58 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 54.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.2-py2.py3-none-any.whl size=205186690 sha256=afcffdd213021d1f8b9e84a2dbe484bdefcbd017299c2779b0026251f3349ee8
  Stored in directory: /root/.cache/pip/wheels/9a/39/f6/970565f38054a830e9a8593f388b36e14d75dba6c6fdafc1ec
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.2


In [129]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
import re

spark = SparkSession.builder.master("local[*]").appName("Big_Data_Application_ICP_2").getOrCreate()
sc = spark.sparkContext
sqlContext = HiveContext(sc)

In [144]:
from google.colab import files
files.upload()

Saving icp2.txt to icp2.txt


{'icp2.txt': b"It's not only writers who can benefit from this free online tool. If you're a programmer who's working on a project where blocks of text are needed, this tool can be a great way to get that. It's a good way to test your programming and that the tool being created is working well.\r\n\r\nAbove are a few examples of how the random paragraph generator can be beneficial. The best way to see if this random paragraph picker will be useful for your intended purposes is to give it a try. Generate a number of paragraphs to see if they are beneficial to your current project.\r\n\r\nIf you do find this paragraph tool useful, please do us a favor and let us know how you're using it. It's greatly beneficial for us to know the different ways this tool is being used so we can improve it with updates. This is especially true since there are times when the generators we create get used in completely unanticipated ways from when we initially created them. If you have the time, please send

In [173]:
with open('icp.txt') as f:
    lines = f.read().splitlines()
lines = ''.join(lines)
s = re.sub('[^0-9a-zA-Z]+', ' ', lines)
s = re.sub(' +', ',', s)
my_list = s.split(',')
my_list = [word.capitalize() for word in my_list]
lines

'As the Labor Day holiday nears, many people are planning travel and get-togethers to see family and friends.Unfortunately, this is occurring at the same time Covid-19 rates are climbing. The rates of new coronavirus infections are higher than they have been since January. Hospitalizations are also at their highest levels since January. In many parts of the United States, both infections and hospitalizations are higher than they were during Labor Day weekend in 2020.How should people think about Covid-19 safety now, compared to last year? Is it safe to see family and friends? What if extended family members want to stay in a house together -- what are some steps they should take to reduce risk? And how does the start of school affect our risk?To help navigate these questions, we spoke with CNN Medical Analyst Dr.Leana Wen. Wen is an emergency physician and visiting professor of health policy and management at the George Washington University Milken Institute School of Public Health. Sh

In [174]:
from pyspark.sql.types import StringType
df = sqlContext.createDataFrame(my_list, StringType())
df = df.withColumnRenamed("value","Word")
df = df.withColumn("First_Letter", df.Word.substr(0, 1))
df.show()
df.count()

+-------------+------------+
|         Word|First_Letter|
+-------------+------------+
|           As|           A|
|          The|           T|
|        Labor|           L|
|          Day|           D|
|      Holiday|           H|
|        Nears|           N|
|         Many|           M|
|       People|           P|
|          Are|           A|
|     Planning|           P|
|       Travel|           T|
|          And|           A|
|          Get|           G|
|    Togethers|           T|
|           To|           T|
|          See|           S|
|       Family|           F|
|          And|           A|
|      Friends|           F|
|Unfortunately|           U|
+-------------+------------+
only showing top 20 rows



258

In [175]:
df1 = df.filter(df.First_Letter.cast("int").isNull())   #Get rid of any numbers
df1 = df1.filter(df.First_Letter != '')                 #Get rid of 1 occurance of empty row
df1.count()

251

In [176]:
df1 = df1.select('Word', 'First_Letter').distinct()
df1.groupBy("First_Letter").count().orderBy("count", ascending=False).show(100)
df1.count()

+------------+-----+
|First_Letter|count|
+------------+-----+
|           S|   17|
|           T|   16|
|           A|   13|
|           W|   10|
|           P|   10|
|           H|   10|
|           C|    8|
|           D|    7|
|           L|    7|
|           M|    7|
|           I|    6|
|           B|    5|
|           O|    5|
|           R|    5|
|           U|    5|
|           V|    4|
|           N|    4|
|           F|    4|
|           E|    2|
|           G|    2|
|           J|    2|
|           Q|    1|
|           Y|    1|
+------------+-----+



151

In [177]:
final = df1.groupby("First_Letter").agg(F.collect_set("Word")).orderBy("First_Letter")
final.withColumnRenamed("collect_set(Word)","Words").show(26, truncate=False)

+------------+---------------------------------------------------------------------------------------------------------------+
|First_Letter|Words                                                                                                          |
+------------+---------------------------------------------------------------------------------------------------------------+
|A           |[Also, Analyst, As, At, An, Against, Angeles, And, About, Author, Are, A, Affect]                              |
|B           |[Be, Book, Been, Both, By]                                                                                     |
|C           |[Cnn, County, Covid, Compared, Climbing, Centers, Control, Coronavirus]                                        |
|D           |[Does, During, Day, Doctor, Different, Disease, Dr]                                                            |
|E           |[Emergency, Extended]                                                                            