In [1]:
!pip3 install pyspark==3.0.0

Collecting pyspark==3.0.0
  Downloading pyspark-3.0.0.tar.gz (204.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.7/204.7 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9 (from pyspark==3.0.0)
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.0-py2.py3-none-any.whl size=205044159 sha256=f1c4655b585c9257f65bd563cba6bc3fccedca8ab64630df60146a3153bd8078
  Stored in directory: /root/.cache/pip/wheels/b1/bb/8b/ca24d3f756f2ed967225b0871898869db676eb5846df5adc56
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0

In [2]:
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql as sql
import xml.etree.ElementTree as ET
from pyspark.sql.functions import explode
from pyspark.sql import Window
from pyspark.sql.functions import row_number

In [3]:
# Ask PySpark to fetch the spark-xml package when it launches the JVM; the
# 'xml' data source used to read the posts file comes from this package.
# This has to run before the SparkContext is created.
spark_submit_args = '--packages com.databricks:spark-xml_2.12:0.14.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = spark_submit_args

In [4]:
# Reuse the SparkContext that is already running (or let PySpark create a
# default one); if that fails, start an explicit single-threaded local context.
try:
    sc = SparkContext.getOrCreate()
except Exception:
    # Only ordinary errors trigger the fallback: a bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit.
    conf = SparkConf().setAppName("Lab2").setMaster('local[1]')
    sc = SparkContext(conf=conf)

# Applied once, whichever branch produced the context.
sc.setLogLevel("ERROR")
spark = SparkSession(sc)

sc

In [5]:
import requests

# Base URL of the course data repository that hosts both input files.
DATA_URL = "https://git.ai.ssau.ru/tk/big_data/raw/branch/bachelor/data"


def download(file_name, timeout=60):
    """Download `file_name` from DATA_URL into the current working directory.

    The HTTP status is checked before anything is written, so an error page
    is never saved as if it were data, and a failed request does not leave an
    empty file behind.

    Args:
        file_name: name of the file, both remotely and locally.
        timeout: seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeouts.
    """
    response = requests.get(f"{DATA_URL}/{file_name}", timeout=timeout)
    response.raise_for_status()
    with open(file_name, "wb") as f:
        f.write(response.content)


for file_name in ("posts_sample.xml", "programming-languages.csv"):
    download(file_name)

In [6]:
# Load the downloaded posts file through the spark-xml data source: every
# <row> element found under the <posts> root becomes one DataFrame row.
postsSample = (
    spark.read.format('xml')
    .options(rootTag='posts', rowTag='row')
    .load("posts_sample.xml")
)

In [7]:
# Read the list of programming languages; the first CSV line holds the column
# names and the column types are inferred from the data.
languages = spark.read.csv(
    "programming-languages.csv",
    header=True,
    inferSchema=True,
)

languages

DataFrame[name: string, wikipedia_url: string]

In [8]:
# Keep only the attributes needed for the report: creation date, view count
# and the tag string of each post.
post_columns = ["_CreationDate", "_ViewCount", "_Tags"]
postsSample_DF = postsSample.select(*post_columns)

In [9]:
def _parse_post(row):
    """Convert one post row into a (year, views, tags) tuple.

    The year is the text before the first '-' of the creation date. The tag
    string has its first and last character stripped and is split on '><'
    (presumably it looks like '<tag1><tag2>' -- confirm against the data).
    """
    year = str(row["_CreationDate"]).split('-')[0]
    tags = row["_Tags"][1:-1].split('><')
    return (year, row["_ViewCount"], tags)


# Posts without tags cannot be attributed to a language, so drop them first.
tagged_posts = postsSample_DF.filter(col("_Tags").isNotNull())
parsedPosts_sample = tagged_posts.rdd.map(_parse_post).toDF(["year", "views", "languages"])

# Restrict the report to posts created from 2010 to 2020, both inclusive.
parsedPosts_sample = parsedPosts_sample.filter(col("year").between(2010, 2020))

parsedPosts_sample.show()

+----+-----+--------------------+
|year|views|           languages|
+----+-----+--------------------+
|2010| 3650|[c++, character-e...|
|2010|  617|[sharepoint, info...|
|2010| 1315|[iphone, app-stor...|
|2010|  973|[symfony1, schema...|
|2010|  132|              [java]|
|2010|  419|[visual-studio-20...|
|2010|  869|[cakephp, file-up...|
|2010| 1303|[git, cygwin, putty]|
|2010|  748|  [drupal, drupal-6]|
|2010| 1258|[php, wordpress, ...|
|2010|14972|[c#, winforms, da...|
|2010|  274|[c#, asp.net, exc...|
|2010|  804|    [sql, xml, blob]|
|2010| 6019|[.htaccess, codei...|
|2010| 5456|[wcf, web-service...|
|2010|  316|[mod-rewrite, apa...|
|2010|15477|[sql, database, d...|
|2010| 9649|         [ruby, rvm]|
|2010|20199|  [android, eclipse]|
|2010|  735|[iphone, uiimagev...|
+----+-----+--------------------+
only showing top 20 rows



In [10]:
# Lower-cased language names, collected to the driver so they can be used in
# an `isin` filter. Rows with a missing name are skipped: calling .lower() on
# None would otherwise abort the whole cell.
names = [
    row["name"].lower()
    for row in languages.select("name").collect()
    if row["name"] is not None
]

# One output row per (post, tag) pair, keeping only tags that are language names.
# NOTE: this rebinds parsedPosts_sample and replaces its "languages" column,
# so re-running this cell on its own fails; re-run the parsing cell first.
parsedPosts_sample = (
    parsedPosts_sample
    .select("year", "views", explode("languages").alias("language"))
    .filter(col("language").isin(names))
)

parsedPosts_sample.show()

+----+-----+-----------+
|year|views|   language|
+----+-----+-----------+
|2010|  132|       java|
|2010| 1258|        php|
|2010| 9649|       ruby|
|2010| 2384|          c|
|2010| 1987|        php|
|2010| 3321|     python|
|2010|  128| javascript|
|2010|  477|applescript|
|2010| 1748|        php|
|2010|  998|        php|
|2010| 2095| javascript|
|2010|  447|        sed|
|2010| 6558|     python|
|2010|  214|       java|
|2010|  214|       ruby|
|2010|  852|objective-c|
|2010|  179| javascript|
|2010| 6709|          r|
|2010|   78|        php|
|2010| 1280| javascript|
+----+-----+-----------+
only showing top 20 rows



In [11]:
# Total views per (year, language); the aggregated column is named "sum(views)".
views_per_year_language = parsedPosts_sample.groupBy("year", "language")
parsedPosts_sample = views_per_year_language.sum("views")
parsedPosts_sample.show()

+----+-----------+----------+
|year|   language|sum(views)|
+----+-----------+----------+
|2013|     erlang|      2302|
|2017| typescript|     29031|
|2017|        sed|        93|
|2013| javascript|    609571|
|2013|         f#|      4317|
|2012| powershell|     17311|
|2019|        php|      3753|
|2017|    haskell|      4040|
|2013| autohotkey|      4421|
|2013|applescript|      2059|
|2019|      xpath|        27|
|2015|     racket|       762|
|2017|         go|      1356|
|2018|      perl6|       109|
|2015|       dart|       402|
|2015|       rust|       117|
|2012|         f#|      1222|
|2018|     python|     99996|
|2017|     prolog|        76|
|2016|      latex|       461|
+----+-----------+----------+
only showing top 20 rows



In [12]:
# Rank the languages inside each year by total views, most viewed first.
window = Window.partitionBy("year").orderBy(col("sum(views)").desc())

# Keep the ten best-ranked languages of every year and give the aggregated
# column a plain name.
postsSample_10 = (
    parsedPosts_sample
    .withColumn("row_number", row_number().over(window))
    .filter(col("row_number") <= 10)
    .withColumnRenamed("sum(views)", "views")
)

# Sort by the renamed column: the pre-rename name "sum(views)" is no longer
# part of this DataFrame's schema, so it must not be referenced here.
postsSample_10 = postsSample_10.orderBy(col("year").asc(), col("views").desc())
postsSample_10.show()

+----+-----------+-------+----------+
|year|   language|  views|row_number|
+----+-----------+-------+----------+
|2010|        php|1189629|         1|
|2010|       java| 563211|         2|
|2010| javascript| 316131|         3|
|2010|objective-c|  97009|         4|
|2010|       ruby|  76215|         5|
|2010|          c|  66587|         6|
|2010|     python|  60672|         7|
|2010|     matlab|  51865|         8|
|2010|applescript|  32305|         9|
|2010|     delphi|  13065|        10|
|2011| javascript| 809078|         1|
|2011|       java| 389834|         2|
|2011|        php| 246770|         3|
|2011|          c| 238277|         4|
|2011|objective-c| 218934|         5|
|2011|     python| 203180|         6|
|2011|       bash|  60805|         7|
|2011|       ruby|  39223|         8|
|2011|       perl|  28502|         9|
|2011|     matlab|  18816|        10|
+----+-----------+-------+----------+
only showing top 20 rows



In [13]:
# Persist the report as Parquet, replacing the output of any previous run.
output_path = "top_10_languages_between_2010_and_2020.parquet"
postsSample_10.write.parquet(output_path, mode='overwrite')

In [14]:
# Shut down the SparkContext now that the report has been written; cells that
# use `sc` or `spark` cannot be re-run after this without recreating the context.
sc.stop()