In [None]:
import pandas as pd

!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!java -version
!pip install pyspark


In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (or reuse one that is already
# active in the kernel, which is what getOrCreate() does on re-runs).
spark = (
    SparkSession.builder
    .appName("Spark Dataframe Intro")
    .getOrCreate()
)

# Load both CSV files with a header row and with schema inference enabled.
# Inference is turned on for *both* files so that the shared key `movieId`
# gets the same inferred type on each side; previously `tags` was read as
# all-string while `ratings` was inferred, leaving the join to rely on an
# implicit string-to-number cast.
tags = spark.read.csv('/content/tags.csv', header=True, inferSchema=True)
ratings = spark.read.csv('/content/ratings.csv', header=True, inferSchema=True)

In [None]:
# Preview the first 20 tag rows (long cell values are truncated for display).
tags.show(n=20, truncate=True)

+------+-------+-----------------+----------+
|userId|movieId|              tag| timestamp|
+------+-------+-----------------+----------+
|     2|  60756|            funny|1445714994|
|     2|  60756|  Highly quotable|1445714996|
|     2|  60756|     will ferrell|1445714992|
|     2|  89774|     Boxing story|1445715207|
|     2|  89774|              MMA|1445715200|
|     2|  89774|        Tom Hardy|1445715205|
|     2| 106782|            drugs|1445715054|
|     2| 106782|Leonardo DiCaprio|1445715051|
|     2| 106782|  Martin Scorsese|1445715056|
|     7|  48516|     way too long|1169687325|
|    18|    431|        Al Pacino|1462138765|
|    18|    431|         gangster|1462138749|
|    18|    431|            mafia|1462138755|
|    18|   1221|        Al Pacino|1461699306|
|    18|   1221|            Mafia|1461699303|
|    18|   5995|        holocaust|1455735472|
|    18|   5995|       true story|1455735479|
|    18|  44665|     twist ending|1456948283|
|    18|  52604|  Anthony Hopkins|

In [None]:
# Preview the first 20 rating rows (long cell values are truncated for display).
ratings.show(n=20, truncate=True)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [None]:
# Both inputs carry `userId` and `timestamp`. Joining on `movieId` alone would
# leave two columns with each of those names in the result, and such columns
# cannot be referenced by name afterwards (the reference is ambiguous).
# Give each side's copy a distinct name before joining.
tags_for_join = (
    tags
    .withColumnRenamed('userId', 'tag_userId')
    .withColumnRenamed('timestamp', 'tag_timestamp')
)
ratings_for_join = (
    ratings
    .withColumnRenamed('userId', 'rating_userId')
    .withColumnRenamed('timestamp', 'rating_timestamp')
)

# Inner join on the movie only: every tag row is paired with every rating row
# for the same movie, regardless of which user wrote the tag or the rating.
joinedDF = tags_for_join.join(ratings_for_join, 'movieId', 'inner')
joinedDF.show(50)

+-------+------+--------------------+----------+------+------+---------+
|movieId|userId|                 tag| timestamp|userId|rating|timestamp|
+-------+------+--------------------+----------+------+------+---------+
|      1|   567|                 fun|1525286013|     1|   4.0|964982703|
|      1|   474|               pixar|1137206825|     1|   4.0|964982703|
|      1|   336|               pixar|1139045764|     1|   4.0|964982703|
|      3|   289|                 old|1143424860|     1|   4.0|964981247|
|      3|   289|               moldy|1143424860|     1|   4.0|964981247|
|     47|   474|       serial killer|1137206452|     1|   5.0|964983815|
|     47|   424|        twist ending|1457842458|     1|   5.0|964983815|
|     47|   424|             mystery|1457842470|     1|   5.0|964983815|
|     50|   474|               heist|1137206826|     1|   5.0|964982931|
|     50|   424|        twist ending|1457842306|     1|   5.0|964982931|
|     50|   424|              tricky|1457842340|   

In [None]:
# Expose the joined frame to Spark SQL under the name `joinedDF`.
joinedDF.createOrReplaceTempView('joinedDF')

# Per-tag sum of all ratings paired with that tag ("toplam" = total).
tag_totals_query = 'select tag , sum(rating) as toplam from joinedDF group by tag'
toplamlar = spark.sql(tag_totals_query)

In [None]:
# Preview the first 20 per-tag rating totals.
toplamlar.show(n=20, truncate=True)

+--------------------+------+
|                 tag|toplam|
+--------------------+------+
|              ransom| 416.0|
|              freaks| 364.5|
|wrongful imprison...|1404.0|
|        Heartwarming| 365.0|
|               anime|1369.0|
|  intelligent sci-fi| 245.5|
|               1970s| 174.5|
|                 art|  73.5|
|             lyrical|  89.5|
|                hope|  61.5|
|          creativity|   5.0|
|       John Travolta|1288.5|
|intertwining stor...|1288.5|
|        conversation|1288.5|
|              sequel|1763.5|
|               macho| 218.5|
|          Emma Stone| 205.5|
|           Wolverine| 283.0|
|               mafia| 452.5|
|          television| 241.0|
+--------------------+------+
only showing top 20 rows



In [None]:
# Number of (tag, rating) pairs per tag, most frequent tags first.
joinedDF2 = (
    joinedDF
    .groupBy('tag')
    .count()
    .orderBy("count", ascending=False)
)

In [None]:
# Combine the per-tag row count with the per-tag rating total.
# A join does not guarantee that the row order of its inputs is kept, so the
# descending-by-count ordering is applied to the joined result itself.
joinedDF3 = (
    joinedDF2
    .join(toplamlar, 'tag', 'inner')
    .sort("count", ascending=False)
)

In [None]:
# Print up to 200 rows of the per-tag count / total table.
joinedDF3.show(n=200)

+--------------------+-----+------+
|                 tag|count|toplam|
+--------------------+-----+------+
|               1970s|   46| 174.5|
|        Heartwarming|   88| 365.0|
|               anime|  342|1369.0|
|                 art|   20|  73.5|
|          creativity|    1|   5.0|
|              freaks|   97| 364.5|
|                hope|   18|  61.5|
|  intelligent sci-fi|   65| 245.5|
|             lyrical|   25|  89.5|
|              ransom|  106| 416.0|
|wrongful imprison...|  317|1404.0|
|          Emma Stone|   53| 205.5|
|       John Travolta|  307|1288.5|
|           Wolverine|   76| 283.0|
|        conversation|  307|1288.5|
|intertwining stor...|  307|1288.5|
|               macho|   61| 218.5|
|               mafia|  124| 452.5|
|               scifi|   18|  54.0|
|              sequel|  478|1763.5|
|          television|   67| 241.0|
|                70mm|    2|   4.5|
|              Russia|    6|  24.0|
|                 cia|   28| 111.5|
|                lies|  115|

In [None]:
# Make the per-tag count / total table queryable from Spark SQL as `joinedDF3`.
joinedDF3.createOrReplaceTempView(name='joinedDF3')

In [None]:
# Add the mean rating per tag (total / number of rows) to the table.
# The columns are listed explicitly instead of `select *`: this cell rebinds
# `joinedDF3`, so if the temp view is registered again from the new frame and
# this cell is re-run, `select *` would carry the previous `average_rating`
# along and produce a second column with the same name.
# `count` is back-quoted so it is always read as the column, not the function.
joinedDF3 = spark.sql(
    'select tag, `count`, toplam, toplam/`count` as average_rating from joinedDF3'
)

In [None]:
# Preview the first 20 tags with their count, rating total and average rating.
joinedDF3.show(n=20, truncate=True)

+--------------------+-----+------+------------------+
|                 tag|count|toplam|    average_rating|
+--------------------+-----+------+------------------+
|               1970s|   46| 174.5|3.7934782608695654|
|        Heartwarming|   88| 365.0|4.1477272727272725|
|               anime|  342|1369.0| 4.002923976608187|
|                 art|   20|  73.5|             3.675|
|          creativity|    1|   5.0|               5.0|
|              freaks|   97| 364.5|3.7577319587628866|
|                hope|   18|  61.5|3.4166666666666665|
|  intelligent sci-fi|   65| 245.5| 3.776923076923077|
|             lyrical|   25|  89.5|              3.58|
|              ransom|  106| 416.0|3.9245283018867925|
|wrongful imprison...|  317|1404.0| 4.429022082018927|
|          Emma Stone|   53| 205.5|3.8773584905660377|
|       John Travolta|  307|1288.5| 4.197068403908795|
|           Wolverine|   76| 283.0| 3.723684210526316|
|        conversation|  307|1288.5| 4.197068403908795|
|intertwin