---

# **Experiment Name:** Facebook Dataset Handling
# **Experiment No:** 05
# **Experiment Date:** 17 Sep, 2023

---



In [None]:
# Firstly installing all the tools once again like Lab 01

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar -xf spark-3.4.1-bin-hadoop3.tgz

!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()


# **For Facebook Dataset:**

In [None]:
# Upload the dataset into the Colab runtime. It must be a .csv file named
# "lab-5.csv", because that is the path the read cell loads.
import os
from google.colab import files

# Only open the interactive upload prompt when the file is missing, so that
# "Restart & Run All" does not stall on the widget once the file is present.
# The value returned by files.upload() is kept in `uploaded` instead of being
# left as a bare expression.
if not os.path.exists("lab-5.csv"):
    uploaded = files.upload()

TypeError: ignored

In [None]:
# Load the uploaded CSV into a Spark DataFrame.
# The variable name "df" is only a convention; any other name (e.g. "bigdata")
# would work, as long as the later cells use the same name.
#   header      -> take the column names from the first line of the file
#   inferschema -> let Spark infer each column's type from the data
#   mode        -> "failfast": raise an error on a malformed record
df = (
    spark.read
    .options(header="true", inferschema="true", mode="failfast")
    .csv("lab-5.csv")
)

In [None]:
# Preview the DataFrame with show()'s default settings written out explicitly:
# the first 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

+-------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
| userid|age|dob_day|dob_year|dob_month|gender|tenure|friend_count|friendships_initiated|likes|likes_received|mobile_likes|mobile_likes_received|www_likes|www_likes_received|
+-------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
|2094382| 14|     19|    1999|       11|  male|   266|           0|                    0|    0|             0|           0|                    0|        0|                 0|
|1192601| 14|      2|    1999|       11|female|     6|           0|                    0|    0|             0|           0|                    0|        0|                 0|
|2083884| 14|     16|    1999|       11|  male|    13|           0|                    0|    0|             0|           0|  

In [None]:
# Print the first 1000 rows of the dataset instead of the default preview.
df.show(n=1000)

+-------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
| userid|age|dob_day|dob_year|dob_month|gender|tenure|friend_count|friendships_initiated|likes|likes_received|mobile_likes|mobile_likes_received|www_likes|www_likes_received|
+-------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
|2094382| 14|     19|    1999|       11|  male|   266|           0|                    0|    0|             0|           0|                    0|        0|                 0|
|1192601| 14|      2|    1999|       11|female|     6|           0|                    0|    0|             0|           0|                    0|        0|                 0|
|2083884| 14|     16|    1999|       11|  male|    13|           0|                    0|    0|             0|           0|  

In [None]:
# Register the DataFrame as a temporary view named "fb" so the Spark SQL
# queries in the following cells can refer to it as a table.
df.createOrReplaceTempView("fb")

In [None]:
# Query every column of the "fb" view through Spark SQL and preview the result.
(
    spark
    .sql("Select * from fb")
    .show()
)

+-------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
| userid|age|dob_day|dob_year|dob_month|gender|tenure|friend_count|friendships_initiated|likes|likes_received|mobile_likes|mobile_likes_received|www_likes|www_likes_received|
+-------+---+-------+--------+---------+------+------+------------+---------------------+-----+--------------+------------+---------------------+---------+------------------+
|2094382| 14|     19|    1999|       11|  male|   266|           0|                    0|    0|             0|           0|                    0|        0|                 0|
|1192601| 14|      2|    1999|       11|female|     6|           0|                    0|    0|             0|           0|                    0|        0|                 0|
|2083884| 14|     16|    1999|       11|  male|    13|           0|                    0|    0|             0|           0|  

In [None]:
# Total number of rows in the "fb" view.
(
    spark
    .sql("Select count(*) from fb")
    .show()
)

+--------+
|count(1)|
+--------+
|   99003|
+--------+



In [None]:
# Average value of the age column over the whole "fb" view.
(
    spark
    .sql("Select avg(age) from fb")
    .show()
)

+-----------------+
|         avg(age)|
+-----------------+
|37.28022383160106|
+-----------------+



In [None]:
# Largest value of the age column in the "fb" view.
(
    spark
    .sql("Select max(age) from fb")
    .show()
)

+--------+
|max(age)|
+--------+
|     113|
+--------+



In [None]:
# Smallest value of the age column in the "fb" view.
(
    spark
    .sql("Select min(age) from fb")
    .show()
)

+--------+
|min(age)|
+--------+
|      13|
+--------+



In [None]:
# Average age computed separately for each distinct value of the gender column.
(
    spark
    .sql("Select avg(age), gender from fb group by gender")
    .show()
)

+------------------+------+
|          avg(age)|gender|
+------------------+------+
| 74.77714285714286|    NA|
|39.459904605753465|female|
| 35.67024618431386|  male|
+------------------+------+



In [None]:
# Bring the overall average age back to Python as a plain value:
# collect() returns the result rows as a list, [0] picks the first row
# and the second [0] picks its first column.
x = (
    spark
    .sql("Select avg(age) from fb")
    .collect()[0][0]
)
# The collected value is an ordinary Python number, so normal arithmetic applies.
print(x + 5)

42.28022383160106


In [None]:
# Average likes_received per gender, aliased as avg_like and sorted by that
# average (ascending, the SQL default).
(
    spark
    .sql("Select avg(likes_received) as avg_like, gender from fb group by gender order by avg_like")
    .show()
)

+------------------+------+
|          avg_like|gender|
+------------------+------+
| 67.91154778570697|  male|
|157.38285714285715|    NA|
| 251.4354349878273|female|
+------------------+------+



In [None]:
# Average mobile_likes and www_likes, restricted to rows whose age is
# between 20 and 40 (both bounds included).
(
    spark
    .sql("Select avg(mobile_likes), avg(www_likes) from fb where age >= 20 AND age <= 40")
    .show()
)

+-----------------+------------------+
|avg(mobile_likes)|    avg(www_likes)|
+-----------------+------------------+
|97.57308600187058|31.245178817975326|
+-----------------+------------------+

