In [20]:

from pyspark.sql.functions import sum, when, col, lit, min, max, count, countDistinct, isnan, greatest
from pyspark.sql.functions import row_number, regexp_replace, regexp_extract, substring, concat, replace, split, explode, map_from_arrays, regexp_extract_all, coalesce
from pyspark.sql.functions import dateadd, format_number
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
import pandas as pd
from functools import reduce
from pyspark.sql import functions as F


In [21]:
spark = SparkSession \
    .builder \
    .appName("App Ad").getOrCreate()
#    .config("spark.some.config.option", "some-value") \

spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [22]:
file = "data/2.csv"

df = spark.read.format("csv") \
       .option("sep", ";")\
       .option("header", "false") \
       .option("inferSchema", "true") \
       .load(file) 

column_names = []
for i in range(len(df.columns)):
    column_names.append("set" + str(i))
df = df.toDF(*column_names)

In [23]:
print(df.count())
print(len(df.columns))


100
6


In [24]:
df.show(truncate=False)

+---------------------------------+-------------------------+--------------------------+--------------------------+-------------------------+------------------------+
|set0                             |set1                     |set2                      |set3                      |set4                     |set5                    |
+---------------------------------+-------------------------+--------------------------+--------------------------+-------------------------+------------------------+
|Game 1: 1 green, 1 blue, 1 red   | 1 green, 8 red, 7 blue  | 6 blue, 10 red           | 4 red, 9 blue, 2 green   | 1 green, 3 blue         | 4 red, 1 green, 10 blue|
|Game 2: 9 red, 7 green, 3 blue   | 15 green, 2 blue, 5 red | 10 red, 3 blue, 13 green |NULL                      |NULL                     |NULL                    |
|Game 3: 3 red, 1 blue, 4 green   | 6 red, 3 green, 2 blue  | 6 red, 16 blue, 1 green  |NULL                      |NULL                     |NULL                    

In [25]:
df = df.withColumn("game_no", split(df["set0"], ":")[0])
df = df.withColumn("set0", split(df["set0"], ":")[1])
df = df.withColumn('game_no',regexp_replace('game_no','Game ',"").cast(DoubleType()))

df.show(truncate=False)

+-------------------------+-------------------------+--------------------------+--------------------------+-------------------------+------------------------+-------+
|set0                     |set1                     |set2                      |set3                      |set4                     |set5                    |game_no|
+-------------------------+-------------------------+--------------------------+--------------------------+-------------------------+------------------------+-------+
| 1 green, 1 blue, 1 red  | 1 green, 8 red, 7 blue  | 6 blue, 10 red           | 4 red, 9 blue, 2 green   | 1 green, 3 blue         | 4 red, 1 green, 10 blue|1.0    |
| 9 red, 7 green, 3 blue  | 15 green, 2 blue, 5 red | 10 red, 3 blue, 13 green |NULL                      |NULL                     |NULL                    |2.0    |
| 3 red, 1 blue, 4 green  | 6 red, 3 green, 2 blue  | 6 red, 16 blue, 1 green  |NULL                      |NULL                     |NULL                    |3.0    

In [26]:
for i in column_names:
    df = df.withColumn(i + '_array', map_from_arrays(regexp_extract_all(i, lit(r"(\d+)\s(\w+)"), 2), regexp_extract_all(i, lit(r"(\d+)\s(\w+)"), 1)))

# df = df.withColumn("set0_map", map_from_arrays(regexp_extract_all("set0", lit(r"(\d+)\s(\w+)"), 2), regexp_extract_all("set0", lit(r"(\d+)\s(\w+)"), 1)))\
#     .withColumn("set1_map", map_from_arrays(regexp_extract_all("set1", lit(r"(\d+)\s(\w+)"), 2), regexp_extract_all("set1", lit(r"(\d+)\s(\w+)"), 1)))\
#     .withColumn("set2_map", map_from_arrays(regexp_extract_all("set2", lit(r"(\d+)\s(\w+)"), 2), regexp_extract_all("set2", lit(r"(\d+)\s(\w+)"), 1)))\
#     .withColumn("set3_map", map_from_arrays(regexp_extract_all("set3", lit(r"(\d+)\s(\w+)"), 2), regexp_extract_all("set3", lit(r"(\d+)\s(\w+)"), 1)))\
#     .withColumn("set4_map", map_from_arrays(regexp_extract_all("set4", lit(r"(\d+)\s(\w+)"), 2), regexp_extract_all("set4", lit(r"(\d+)\s(\w+)"), 1)))\
#     .withColumn("set5_map", map_from_arrays(regexp_extract_all("set5", lit(r"(\d+)\s(\w+)"), 2), regexp_extract_all("set5", lit(r"(\d+)\s(\w+)"), 1)))

df.show(truncate=False)

+-------------------------+-------------------------+--------------------------+--------------------------+-------------------------+------------------------+-------+-----------------------------------+-----------------------------------+------------------------------------+------------------------------------+-----------------------------------+----------------------------------+
|set0                     |set1                     |set2                      |set3                      |set4                     |set5                    |game_no|set0_array                         |set1_array                         |set2_array                          |set3_array                          |set4_array                         |set5_array                        |
+-------------------------+-------------------------+--------------------------+--------------------------+-------------------------+------------------------+-------+-----------------------------------+------------------------------

In [27]:
for i in column_names:
    df = df.withColumn(i + '_pass', when((((col(i + '_array')['red'] <= 12)|(col(i + '_array')['red']).isNull())\
                                           & ((col(i + '_array')['green'] <= 13)|(col(i + '_array')['green']).isNull()) \
                                            & ((col(i + '_array')['blue'] <= 14)|(col(i + '_array')['blue']).isNull())), True).otherwise(False))

# df = df.withColumn('set0_pass', when(((col('set0_map')['red'] <= 12) & (col('set0_map')['green'] <= 13) & (col('set0_map')['blue'] <= 14)), True).otherwise(False))\
#     .withColumn('set1_pass', when(((col('set1_map')['red'] <= 12) & (col('set1_map')['green'] <= 13) & (col('set1_map')['blue'] <= 14)), True).otherwise(False))\
#     .withColumn('set2_pass', when(((col('set2_map')['red'] <= 12) & (col('set2_map')['green'] <= 13) & (col('set2_map')['blue'] <= 14)), True).otherwise(False))\
#     .withColumn('set3_pass', when(((col('set3_map')['red'] <= 12) & (col('set3_map')['green'] <= 13) & (col('set3_map')['blue'] <= 14)), True).otherwise(False))\
#     .withColumn('set4_pass', when(((col('set4_map')['red'] <= 12) & (col('set4_map')['green'] <= 13) & (col('set4_map')['blue'] <= 14)), True).otherwise(False))\
#     .withColumn('set5_pass', when(((col('set5_map')['red'] <= 12) & (col('set5_map')['green'] <= 13) & (col('set5_map')['blue'] <= 14)), True).otherwise(False))

df = df.withColumn('all_pass', when(reduce(lambda x, y: x & y, [col(c + '_pass') == True for c in column_names]), lit(1)).otherwise(lit(0)))

In [28]:
df.show(truncate=False)

+-------------------------+-------------------------+--------------------------+--------------------------+-------------------------+------------------------+-------+-----------------------------------+-----------------------------------+------------------------------------+------------------------------------+-----------------------------------+----------------------------------+---------+---------+---------+---------+---------+---------+--------+
|set0                     |set1                     |set2                      |set3                      |set4                     |set5                    |game_no|set0_array                         |set1_array                         |set2_array                          |set3_array                          |set4_array                         |set5_array                        |set0_pass|set1_pass|set2_pass|set3_pass|set4_pass|set5_pass|all_pass|
+-------------------------+-------------------------+--------------------------+--------------

In [29]:
df.groupBy('all_pass').agg(sum(col('game_no'))).show()

+--------+------------+
|all_pass|sum(game_no)|
+--------+------------+
|       1|      3035.0|
|       0|      2015.0|
+--------+------------+



#### Part 2

In [30]:
colors = ['red', 'blue','green']
for color in colors:
    df = df.withColumn(color + '_max', 
                       greatest(*[col(c + '_array')[color].cast(DoubleType()) 
                                  for c in column_names]))
    
df = df.fillna(0)
df = df.withColumn('power', col('red_max') * col('blue_max') * col('green_max'))

In [31]:
print(df.agg(sum(col('power'))))

+----------+
|sum(power)|
+----------+
|   66027.0|
+----------+



In [32]:
powerSum = 0
sum = 0
with open('data/2.csv') as input:
	for line in input:
		game = line.strip().split(':')
		moves = game[1].split(';')
		gameID = int(game[0].strip().split(' ')[1])
		minRed = 0
		minGreen = 0
		minBlue = 0
		for move in moves:
			pieces = move.strip().split(',')
			for piece in pieces:
				count = int(piece.strip().split(' ')[0])
				color = piece.strip().split(' ')[1]
				if color == 'red' and count > minRed:
					minRed = count
				if color == 'blue' and count > minBlue:
					minBlue = count
				if color == 'green' and count > minGreen:
					minGreen = count	
		if minGreen <= 13 and minBlue <= 14 and minRed <= 12:
			sum += gameID
		power = minRed * minGreen * minBlue
		powerSum += power

print(f'Part 1 = {sum}')
print(f'Part 2 = {powerSum}')

Part 1 = 3035
Part 2 = 66027
