In [1]:
import pyspark
import os
import sys

from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StructType, StructField, StringType, DateType

In [2]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.13.0 pyspark-shell'
sc = SparkSession.builder.appName("lab2").master("local[1]").getOrCreate()
sc

24/03/28 18:36:43 WARN Utils: Your hostname, dmitriy-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.71.128 instead (on interface ens33)
24/03/28 18:36:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/dmitriy/.ivy2/cache
The jars for the packages stored in: /home/dmitriy/.ivy2/jars
:: loading settings :: url = jar:file:/home/dmitriy/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.databricks#spark-xml_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-713e2517-0d0b-4cc2-a51b-c2f097fd1380;1.0
	confs: [default]
	found com.databricks#spark-xml_2.12;0.13.0 in central
	found commons-io#commons-io;2.8.0 in central
	found org.glassfish.jaxb#txw2;2.3.4 in central
	found org.apache.ws.xmlschema#xmlschema-core;2.2.5 in central
:: resolution report :: resolve 126ms :: artifacts dl 3ms
	:: modules in use:
	com.dat

In [4]:
programming_languages_df = sc.read.csv("programming-languages.csv")
programming_languages_list = [str(x[0]) for x in programming_languages_df.collect()]

for index, language in enumerate(programming_languages_list[:10], start=1):
    print(f"{index}. {language}")

1. name
2. A# .NET
3. A# (Axiom)
4. A-0 System
5. A+
6. A++
7. ABAP
8. ABC
9. ABC ALGOL
10. ABSET


In [5]:
posts = sc.read.format("xml").options(rowTag="row").load('posts_sample.xml')
posts.take(1)

                                                                                

[Row(_AcceptedAnswerId=7, _AnswerCount=13, _Body="<p>I want to use a track-bar to change a form's opacity.</p>\n\n<p>This is my code:</p>\n\n<pre><code>decimal trans = trackBar1.Value / 5000;\nthis.Opacity = trans;\n</code></pre>\n\n<p>When I build the application, it gives the following error:</p>\n\n<blockquote>\n  <p>Cannot implicitly convert type <code>'decimal'</code> to <code>'double'</code></p>\n</blockquote>\n\n<p>I tried using <code>trans</code> and <code>double</code> but then the control doesn't work. This code worked fine in a past VB.NET project.</p>\n", _ClosedDate=None, _CommentCount=2, _CommunityOwnedDate=datetime.datetime(2012, 11, 1, 0, 42, 47, 213000), _CreationDate=datetime.datetime(2008, 8, 1, 5, 42, 52, 667000), _FavoriteCount=48, _Id=4, _LastActivityDate=datetime.datetime(2019, 7, 19, 9, 39, 54, 173000), _LastEditDate=datetime.datetime(2019, 7, 19, 9, 39, 54, 173000), _LastEditorDisplayName='Rich B', _LastEditorUserId=3641067, _OwnerDisplayName=None, _OwnerUserId

In [6]:
def detect_language(post):
    tags_lower = post._Tags.lower()
    detected_language = next((lang for lang in programming_languages_list if f"<{lang.lower()}>" in tags_lower), None)
    return (post._Id, detected_language) if detected_language is not None else None
    
def is_post_in_year(post, year):
    creation_date = post._CreationDate
    return datetime(year, 1, 1) <= creation_date <= datetime(year, 12, 31)

def process_posts_for_year(year, posts_rdd):
    year_results = posts_rdd\
        .filter(lambda post: post._Tags is not None and is_post_in_year(post, year))\
        .map(detect_language)\
        .filter(lambda result: result is not None)\
        .map(lambda result: (result[1], 1))\
        .reduceByKey(lambda a, b: a + b)\
        .sortBy(lambda x: x[1], ascending=False)\
        .take(10)
    
    df = sc.createDataFrame(year_results, ["Programming_Language", f"Number_of_Mentions_{year}"])
    return df

final_results = {}
for year in range(2010, 2020):
    final_results[year] = process_posts_for_year(year, posts.rdd)
    final_results[year].show()


                                                                                

+--------------------+-----------------------+
|Programming_Language|Number_of_Mentions_2010|
+--------------------+-----------------------+
|                Java|                     52|
|          JavaScript|                     44|
|                 PHP|                     42|
|              Python|                     25|
|         Objective-C|                     22|
|                   C|                     20|
|                Ruby|                     11|
|              Delphi|                      7|
|                   R|                      3|
|                Bash|                      3|
+--------------------+-----------------------+



                                                                                

+--------------------+-----------------------+
|Programming_Language|Number_of_Mentions_2011|
+--------------------+-----------------------+
|                 PHP|                     97|
|                Java|                     91|
|          JavaScript|                     82|
|              Python|                     35|
|         Objective-C|                     33|
|                   C|                     24|
|                Ruby|                     17|
|              Delphi|                      8|
|                Perl|                      8|
|                Bash|                      7|
+--------------------+-----------------------+



                                                                                

+--------------------+-----------------------+
|Programming_Language|Number_of_Mentions_2012|
+--------------------+-----------------------+
|                 PHP|                    136|
|          JavaScript|                    129|
|                Java|                    124|
|              Python|                     65|
|         Objective-C|                     45|
|                   C|                     27|
|                Ruby|                     25|
|                Bash|                      9|
|                   R|                      9|
|              MATLAB|                      6|
+--------------------+-----------------------+



                                                                                

+--------------------+-----------------------+
|Programming_Language|Number_of_Mentions_2013|
+--------------------+-----------------------+
|          JavaScript|                    195|
|                Java|                    191|
|                 PHP|                    173|
|              Python|                     87|
|         Objective-C|                     40|
|                   C|                     35|
|                Ruby|                     30|
|                   R|                     25|
|                Bash|                     11|
|               Scala|                     10|
+--------------------+-----------------------+



                                                                                

+--------------------+-----------------------+
|Programming_Language|Number_of_Mentions_2014|
+--------------------+-----------------------+
|          JavaScript|                    235|
|                Java|                    228|
|                 PHP|                    154|
|              Python|                    103|
|                   C|                     52|
|         Objective-C|                     49|
|                   R|                     28|
|                Ruby|                     20|
|              MATLAB|                     16|
|                Bash|                     13|
+--------------------+-----------------------+



                                                                                

+--------------------+-----------------------+
|Programming_Language|Number_of_Mentions_2015|
+--------------------+-----------------------+
|          JavaScript|                    268|
|                Java|                    208|
|                 PHP|                    147|
|              Python|                    119|
|                   R|                     43|
|                   C|                     38|
|         Objective-C|                     30|
|                Ruby|                     20|
|              MATLAB|                     16|
|               Scala|                     13|
+--------------------+-----------------------+



                                                                                

+--------------------+-----------------------+
|Programming_Language|Number_of_Mentions_2016|
+--------------------+-----------------------+
|          JavaScript|                    271|
|                Java|                    178|
|              Python|                    140|
|                 PHP|                    126|
|                   R|                     50|
|                   C|                     32|
|                Ruby|                     22|
|                Bash|                     16|
|               Scala|                     16|
|              MATLAB|                     15|
+--------------------+-----------------------+



                                                                                

+--------------------+-----------------------+
|Programming_Language|Number_of_Mentions_2017|
+--------------------+-----------------------+
|          JavaScript|                    245|
|                Java|                    205|
|              Python|                    185|
|                 PHP|                    122|
|                   R|                     53|
|                   C|                     24|
|         Objective-C|                     19|
|                Ruby|                     15|
|          TypeScript|                     14|
|          PowerShell|                     14|
+--------------------+-----------------------+



                                                                                

+--------------------+-----------------------+
|Programming_Language|Number_of_Mentions_2018|
+--------------------+-----------------------+
|              Python|                    214|
|          JavaScript|                    194|
|                Java|                    145|
|                 PHP|                     99|
|                   R|                     63|
|                   C|                     24|
|               Scala|                     22|
|          TypeScript|                     21|
|          PowerShell|                     13|
|                Bash|                     12|
+--------------------+-----------------------+



                                                                                

+--------------------+-----------------------+
|Programming_Language|Number_of_Mentions_2019|
+--------------------+-----------------------+
|              Python|                    162|
|          JavaScript|                    131|
|                Java|                     95|
|                 PHP|                     59|
|                   R|                     36|
|                   C|                     14|
|                  Go|                      9|
|              MATLAB|                      9|
|              Kotlin|                      9|
|                Dart|                      9|
+--------------------+-----------------------+



In [9]:
for year in final_results.keys():
    final_results[year].write.format("parquet").save(f"top_{year}")