In [11]:
import datetime
import sys

from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import IntegerType, StringType, FloatType, StructType, StructField

In [2]:
spark = (SparkSession.builder
         .appName('ikhlebushkin_nosql_task1')
         .master('yarn')
         .config('spark.sql.catalog.cassandra', 'com.datastax.spark.connector.datasource.CassandraCatalog')
         .getOrCreate())

In [3]:
pubg_kills = spark.read.format('csv').option('header', 'true').load('/data/pubg')
pubg_kills = pubg_kills.fillna(0)

                                                                                

In [4]:
x_diff = pubg_kills.killer_position_x - pubg_kills.victim_position_x
y_diff = pubg_kills.killer_position_y - pubg_kills.victim_position_y
pubg_kills = pubg_kills.withColumn('distance', f.sqrt(f.pow(x_diff, 2) + f.pow(y_diff, 2)))

In [5]:
pubg_kills = pubg_kills.withColumn('placement_diff', pubg_kills.victim_placement - pubg_kills.killer_placement)

In [6]:
pubg_kills.show()

[Stage 1:>                                                          (0 + 1) / 1]

+------------+----------------+----------------+-----------------+-----------------+-------+--------------------+----+---------------+----------------+-----------------+-----------------+------------------+--------------+
|   killed_by|     killer_name|killer_placement|killer_position_x|killer_position_y|    map|            match_id|time|    victim_name|victim_placement|victim_position_x|victim_position_y|          distance|placement_diff|
+------------+----------------+----------------+-----------------+-----------------+-------+--------------------+----+---------------+----------------+-----------------+-----------------+------------------+--------------+
|     Grenade| KrazyPortuguese|             5.0|         657725.1|         146275.2|MIRAMAR|2U4GBNA0YmnLSqvEy...| 823|KrazyPortuguese|             5.0|         657725.1|         146275.2|               0.0|           0.0|
|      SCAR-L|nide2Bxiaojiejie|            31.0|         93091.37|         722236.4|MIRAMAR|2U4GBNA0YmnLSqvEy...

                                                                                

In [6]:
now = datetime.datetime.now()
write_timestamp = f'{now.year}-{now.month}-{now.day}'

In [36]:
avg_dist = pubg_kills.groupby(['killed_by', 'time']).agg(f.avg('distance').alias('distance'))
avg_dist = avg_dist.withColumn('write_timestamp', f.lit(write_timestamp))
(avg_dist.write.format('org.apache.spark.sql.cassandra')
               .mode('append')
               .options(table='dist_avg', keyspace='miptstudent2024_07')
               .save())

                                                                                

In [42]:
max_dist = pubg_kills.groupby(['killed_by', 'time']).agg(f.max('distance').alias('distance'))
max_dist = max_dist.withColumn('write_timestamp', f.lit(write_timestamp))
(max_dist.write.format('org.apache.spark.sql.cassandra')
               .mode('append')
               .options(table='dist_max', keyspace='miptstudent2024_07')
               .save())

                                                                                

In [45]:
placement_diff = pubg_kills.groupby(['killed_by', 'time']).agg(f.avg('placement_diff').alias('placement_diff'))
placement_diff = placement_diff.withColumn('write_timestamp', f.lit(write_timestamp))
(placement_diff.write.format('org.apache.spark.sql.cassandra')
               .mode('append')
               .options(table='placement_diff_avg', keyspace='miptstudent2024_07')
               .save())

                                                                                

In [29]:
old_map_kills = spark.sql('SELECT * FROM cassandra.miptstudent2024_07.map_kills_all')
map_kills_new = pubg_kills.groupby(['map', 'killed_by', 'time']).agg(f.count('victim_name').alias('kills_count'))
map_kills_new = map_kills_new.fillna({'map': 'unknown_map'})

In [59]:
map_kills_joined = old_map_kills.withColumnRenamed('kills_count', 'kills_count_left').join(map_kills_new, on=['killed_by', 'map', 'time'])
map_kills_joined = map_kills_joined.fillna({'kills_count': 0})
map_kills_joined = map_kills_joined.withColumn(
    'kills_count', map_kills_joined['kills_count_left'] + map_kills_joined['kills_count']
).drop('kills_count_left')
(map_kills_joined.write
                 .format('org.apache.spark.sql.cassandra')
                 .mode('append')
                 .options(table='map_kills_all', keyspace='miptstudent2024_07')
                 .save())

                                                                                

In [64]:
df = spark.sql("SELECT killed_by, map, SUM(kills_count) FROM cassandra.miptstudent2024_07.map_kills_all WHERE killed_by = 'Mini 14' AND map = 'ERANGEL' AND time >= 10 AND time <= 1000 GROUP BY killed_by, map;")

In [65]:
df.show()

[Stage 27:>                                                         (0 + 1) / 1]

+---------+-------+----------------+
|killed_by|    map|sum(kills_count)|
+---------+-------+----------------+
|  Mini 14|ERANGEL|          733486|
+---------+-------+----------------+



                                                                                

In [13]:
spark.stop()