In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, regexp_replace, year, count, row_number, lower, to_timestamp
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType
import xml.etree.ElementTree as ET
from google.colab import files

In [23]:
# создание сессии Spark
spark = SparkSession.builder \
    .appName("lab2") \
    .getOrCreate()

In [25]:
xml_file = '/content/drive/MyDrive/Colab Notebooks/posts_sample.xml'
tree = ET.parse(xml_file)
root = tree.getroot()
data = []

# парсинг
for row in root.findall('row'):
    # парсинг полей
    creation_date = row.attrib.get('CreationDate')
    tags = row.attrib.get('Tags')

    if (creation_date and tags) is not None:
        tags_clean = tags.replace('<', '').replace('>', ' ').strip().split()
        for tag in tags_clean:
            data.append((creation_date, tag))

In [33]:
# структура данных для схемы DataFrame
schema = StructType([
    StructField("CreationDate", StringType(), True),
    StructField("Tag", StringType(), True)])

posts_df = spark.createDataFrame(data, schema=schema)

# извлечение года из временных данных
posts_df = posts_df.withColumn("Year", year(to_timestamp(col("CreationDate"))))
posts_df = posts_df.drop("CreationDate")

posts_df.show()

+-------------------+----+
|                Tag|Year|
+-------------------+----+
|                 c#|2008|
|     floating-point|2008|
|    type-conversion|2008|
|             double|2008|
|            decimal|2008|
|               html|2008|
|                css|2008|
|internet-explorer-7|2008|
|                 c#|2008|
|               .net|2008|
|           datetime|2008|
|                 c#|2008|
|           datetime|2008|
|               time|2008|
|           datediff|2008|
| relative-time-span|2008|
|               html|2008|
|            browser|2008|
|           timezone|2008|
|         user-agent|2008|
+-------------------+----+
only showing top 20 rows



In [None]:
csv_file = '/content/drive/MyDrive/Colab Notebooks/programming-languages.csv'
languages_df = spark.read.csv(csv_file, header=True)

# приведение тегов к нижнему регистру
languages_df = languages_df.withColumn("name", lower(col("name")))

# inner join по тегам и названиям языков
joined_df = posts_df.join(languages_df, posts_df.Tag == languages_df.name,
                            "inner")

filtered_df.show()


+-----------+----+-----------+--------------------+
|        Tag|Year|       name|       wikipedia_url|
+-----------+----+-----------+--------------------+
|       java|2010|       java|https://en.wikipe...|
|        php|2010|        php|https://en.wikipe...|
|       ruby|2010|       ruby|https://en.wikipe...|
|          c|2010|          c|https://en.wikipe...|
|        php|2010|        php|https://en.wikipe...|
|     python|2010|     python|https://en.wikipe...|
| javascript|2010| javascript|https://en.wikipe...|
|applescript|2010|applescript|https://en.wikipe...|
|        php|2010|        php|https://en.wikipe...|
|        php|2010|        php|https://en.wikipe...|
| javascript|2010| javascript|https://en.wikipe...|
|        sed|2010|        sed|https://en.wikipe...|
|     python|2010|     python|https://en.wikipe...|
|       java|2010|       java|https://en.wikipe...|
|       ruby|2010|       ruby|https://en.wikipe...|
|objective-c|2010|objective-c|https://en.wikipe...|
| javascript

In [49]:
# группировка данных по Year и Tag с подсчетом количества упоминаний
tag_counts = filtered_df.groupBy("Year", "Tag").agg(count("*").alias("Count"))

# определение окна для ранжирования
window_spec = Window.partitionBy("Year").orderBy(col("Count").desc())

# топ-10 языков по количеству упоминаний, сгруппированные по годам
top10_df = tag_counts.withColumn("Rank", row_number().over(window_spec)) \
                     .filter(col("Rank") <= 10) \
                     .orderBy("Year", "Rank")

top10_df.show()

+----+-----------+-----+----+
|Year|        Tag|Count|Rank|
+----+-----------+-----+----+
|2008|       java|    5|   1|
|2008|       ruby|    4|   2|
|2008|          c|    2|   3|
|2008| javascript|    2|   4|
|2008|        x++|    1|   5|
|2008|     python|    1|   6|
|2008|         io|    1|   7|
|2008|     groovy|    1|   8|
|2008|        php|    1|   9|
|2009|       java|   28|   1|
|2009|     python|   23|   2|
|2009|        php|   22|   3|
|2009| javascript|   12|   4|
|2009|       ruby|    8|   5|
|2009|     delphi|    7|   6|
|2009|          c|    6|   7|
|2009|objective-c|    6|   8|
|2009|    haskell|    4|   9|
|2009|       bash|    3|  10|
|2010|       java|   52|   1|
+----+-----------+-----+----+
only showing top 20 rows



In [50]:
# сохранение DataFrame в Parquet
top10_df.write.mode("overwrite").parquet("top_languages.parquet")
! zip -r top_languages.zip top_languages.parquet
files.download("top_languages.zip")

  adding: top_languages.parquet/ (stored 0%)
  adding: top_languages.parquet/._SUCCESS.crc (stored 0%)
  adding: top_languages.parquet/part-00000-f0d189e4-a272-43d4-a7cd-2b35e9f89445-c000.snappy.parquet (deflated 36%)
  adding: top_languages.parquet/.part-00000-f0d189e4-a272-43d4-a7cd-2b35e9f89445-c000.snappy.parquet.crc (stored 0%)
  adding: top_languages.parquet/_SUCCESS (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>