<a href="https://colab.research.google.com/github/kiat/BigDataAnalytics/blob/master/Colab_SocialMedia_Example_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Spark installation on Colab
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://mirror.cc.columbia.edu/pub/software/apache/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
!tar xf spark-3.0.0-bin-hadoop3.2.tgz
!pip install -q findspark
!rm -rf spark-3.0.0-bin-hadoop3.2.tgz

In [2]:
# Set JAVA_HOME and SPARK_HOME
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.0.0-bin-hadoop3.2"

import findspark
findspark.init("spark-3.0.0-bin-hadoop3.2")# SPARK_HOME


import sys
import requests
from operator import add

from pyspark import SparkConf,SparkContext
from pyspark.streaming import StreamingContext

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

from pyspark.sql.types import *
from pyspark.sql import functions as func
from pyspark.sql.functions import *

spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [3]:
# Download the data
! wget -q https://s3.amazonaws.com/metcs777/person_knows_person.csv
! wget -q https://s3.amazonaws.com/metcs777/person_likes_post.csv
! wget -q https://s3.amazonaws.com/metcs777/post_hasCreator_person.csv
! wget -q https://s3.amazonaws.com/metcs777/comment_hasCreator_person.csv
! wget -q https://s3.amazonaws.com/metcs777/comment_replyOf_post.csv


In [4]:
! ls -la 


total 33572
drwxr-xr-x  1 root root     4096 Jul 17 16:46 .
drwxr-xr-x  1 root root     4096 Jul 17 16:43 ..
-rw-r--r--  1 root root  7398677 Dec 12  2018 comment_hasCreator_person.csv
-rw-r--r--  1 root root  3392677 Dec 12  2018 comment_replyOf_post.csv
drwxr-xr-x  1 root root     4096 Jul 15 16:11 .config
-rw-r--r--  1 root root   232246 Dec 12  2018 person_knows_person.csv
-rw-r--r--  1 root root 21372777 Dec 12  2018 person_likes_post.csv
-rw-r--r--  1 root root  1950946 Dec 12  2018 post_hasCreator_person.csv
drwxr-xr-x  1 root root     4096 Jul 10 16:29 sample_data
drwxr-xr-x 13 1000 1000     4096 Jun  6 13:34 spark-3.0.0-bin-hadoop3.2


In [5]:
pwd

'/content'

In [6]:
# Set your file path here 
path="file:///content/"

# You have 5 files 

# comment_hasCreator_person.csv
# comment_replyOf_post.csv
# person_knows_person.csv
# person_likes_post.csv
# post_hasCreator_person.csv

fileCommentHasCreator= path+"comment_hasCreator_person.csv"
fileComment_replyOf_post = path+"comment_replyOf_post.csv"
filePerson_knows_person= path+"person_knows_person.csv"
filePerson_likes_post = path + "person_likes_post.csv"
filePost_hasCreator_person = path + "post_hasCreator_person.csv"

#Create RDDs 

In [7]:
def getRDD(mfile, sc):
		lines = sc.textFile(mfile)
		# Removing the Header Line 
		linesHeader = lines.first()
		header = sc.parallelize([linesHeader])
		linesWithOutHeader = lines.subtract(header)
		myRDD = linesWithOutHeader.map(lambda x: x.split('|'))
		return myRDD

In [8]:
	# Create RDDs from files 
	commentHasCreator = getRDD(fileCommentHasCreator, sc)
	comment_replyOf_post = getRDD(fileComment_replyOf_post, sc)
	person_knows_person = getRDD(filePerson_knows_person, sc)
	person_likes_post = getRDD(filePerson_likes_post, sc)
	post_hasCreator_person = getRDD(filePost_hasCreator_person, sc)

In [9]:
commentHasCreator.take(1)

[['30', '457']]

# Create Dataframes

In [10]:
commentHasCreatorDD = sqlContext.read.format('csv').options(header='true', inferSchema='true',  sep ="|").load(fileCommentHasCreator)
comment_replyOf_postDD = sqlContext.read.format('csv').options(header='true', inferSchema='true',  sep ="|").load(fileComment_replyOf_post)
person_knows_personDD = sqlContext.read.format('csv').options(header='true', inferSchema='true',  sep ="|").load(filePerson_knows_person)
person_likes_postDD = sqlContext.read.format('csv').options(header='true', inferSchema='true',  sep ="|").load(filePerson_likes_post)
post_hasCreator_personDD = sqlContext.read.format('csv').options(header='true', inferSchema='true',  sep ="|").load(filePost_hasCreator_person)


In [11]:
commentHasCreatorDD.show()
comment_replyOf_postDD.show()
person_knows_personDD.show()
person_likes_postDD.show()
post_hasCreator_personDD.show()


+----------+---------+
|Comment.id|Person.id|
+----------+---------+
|         0|       74|
|        10|      832|
|        20|      913|
|        30|      457|
|        40|      956|
|        50|       41|
|        60|      453|
|        70|      832|
|        80|        6|
|        90|        6|
|       100|      103|
|       110|      547|
|       120|      962|
|       130|       99|
|       140|      452|
|       150|       99|
|       160|        6|
|       170|       48|
|       180|      941|
|       190|       40|
+----------+---------+
only showing top 20 rows

+----------+-------+
|Comment.id|Post.id|
+----------+-------+
|         0|      0|
|        10|      0|
|        30|      0|
|        70|      0|
|       100|     10|
|       110|     10|
|       140|     10|
|       150|     10|
|       180|     10|
|       240|     20|
|       250|     20|
|       330|     20|
|       340|     30|
|       350|     30|
|       420|     30|
|       470|     40|
|       480|     40|
| 