In [2]:

from pyspark.sql.functions import *
from pyspark.sql.functions import row_number,lit,when, sum, countDistinct, min, max, dateadd,format_number,count,regexp_replace,isnan, regexp_extract, col,substring, concat, replace
from pyspark.sql.types import IntegerType, DoubleType, DateType
from pyspark.sql.window import Window

import pandas as pd
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
# Extra settings can be chained in before getOrCreate(), e.g.
#    .config("spark.some.config.option", "some-value")
spark = SparkSession.builder.appName("App Ad").getOrCreate()

# Turn on eager evaluation for DataFrames in the REPL/notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [4]:
# Path of the puzzle input, relative to the notebook's working directory.
file = "data/1.csv"

# Read the file as a header-less CSV and let Spark infer the column types;
# without a header the columns get Spark's default names (_c0, ...).
df = (
    spark.read.format("csv")
    .options(header="false", inferSchema="true")
    .load(file)
)

In [5]:
# Sanity check of the load: column count on the first line, row count on the second.
print(len(df.columns), df.count(), sep="\n")

1
1000


In [6]:
# Preview the raw input; show() prints the first 20 rows and truncates long values.
df.show()

+--------------------+
|                 _c0|
+--------------------+
|          xt36five77|
|     two8five6zfrtjj|
|eightthree8fiveqj...|
|       7chmvlhnpfive|
|1tcrgthmeight5mss...|
|eightoneqxspfzjk4...|
|fdbtmkhdfzrck9kxc...|
|               9six9|
| goneightczdzjk18589|
|41two3eightfscdmq...|
|               t8929|
|fourtwoxsxqqmqf3s...|
|         bcbsfd14cjg|
|95three6threendpq...|
|tdmvthreeonefive8574|
|   5eight82sixtwonev|
|      ninemg2shhmsqh|
|              thmlz4|
|xtxjmm2tbbntrmdqx...|
|vf19fourddfsvmzei...|
+--------------------+
only showing top 20 rows



In [8]:
# Part 1: for every row keep only the digit characters of the raw line, take the
# first and the last of them, join the pair into a two-digit number and sum it.
df = (
    df.withColumn("numbers", regexp_replace(col("_c0"), "[^0-9]", ""))
    .withColumn("first", substring("numbers", 1, 1))
    .withColumn("last", substring("numbers", -1, 1))
    .withColumn("calc", concat("first", "last").cast(DoubleType()))
)
print(df.agg(sum("calc")))

sum(calc)
54573.0


#### Part 2

In [10]:
# Part 2: spelled-out digits count as well. Each word gets its digit inserted
# *inside* the word, so the first and last letters stay in place and words that
# share a letter (e.g. "oneight") are both still recognised.
# The mapping is applied in insertion order.
digit_markers = {
    "one": "o1ne",
    "two": "tw2o",
    "three": "th3ree",
    "four": "fo4ur",
    "five": "fi5ve",
    "six": "s6ix",
    "seven": "se7ven",
    "eight": "ei8ght",
    "nine": "ni9ne",
}

# Start again from the raw line so the cell does not depend on Part 1's result.
df = df.withColumn("numbers", col("_c0"))
for word, marked in digit_markers.items():
    df = df.withColumn("numbers", regexp_replace("numbers", word, marked))

# From here on the steps are the same as in Part 1: strip non-digits, take the
# first and last digit, combine them and sum.
df = (
    df.withColumn("numbers", regexp_replace(col("numbers"), "[^0-9]", ""))
    .withColumn("first", substring("numbers", 1, 1))
    .withColumn("last", substring("numbers", -1, 1))
    .withColumn("calc", concat("first", "last").cast(DoubleType()))
)
print(df.agg(sum("calc")))

+---------+
|sum(calc)|
+---------+
|  54591.0|
+---------+



#### Reference

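**Note:** restart the kernel before executing the cell below — the PySpark imports at the top of the notebook shadow Python built-ins such as `sum`, `min` and `max`.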

In [1]:
# Puzzle lines with trailing whitespace (including the newline) removed.
# NOTE: this module-level name shadows the built-in input(); part1() and part2()
# read it.
input = list()

with open('data/1.csv', 'r') as handle:
    input.extend(raw_line.rstrip() for raw_line in handle)

def part1(lines=None):
    """Sum the two-digit values formed by the first and last digit of each line.

    Only the characters 0-9 count as digits. A line with a single digit uses
    that digit twice (e.g. "treb7uchet" -> 77).

    Args:
        lines: Iterable of strings to process. When omitted, the module-level
            ``input`` list filled by the loading code above is used, so the
            original zero-argument call ``part1()`` keeps working.

    Returns:
        The integer total over all lines. Lines without any digit (blank lines
        included) contribute nothing, which matches the Spark pipeline, where
        such rows become null and are ignored by the aggregate.
    """
    if lines is None:
        lines = input

    # A running total is used instead of sum(): in a kernel where the PySpark
    # imports have been executed, the name `sum` no longer refers to the built-in.
    total = 0
    for line in lines:
        digits = [char for char in line if char in "0123456789"]
        if not digits:
            # Nothing to combine; int("") would raise ValueError here.
            continue
        total += int(digits[0] + digits[-1])

    return total

def part2(lines=None):
    """Sum first/last-digit values, also recognising spelled-out digits.

    Each line is scanned left to right. At every position either a digit
    character (0-9) or a digit word from ``num_words`` starting at that
    position counts as a digit. Every position is checked independently, so
    words that overlap (e.g. "oneight") are both recognised.

    Args:
        lines: Iterable of strings to process. When omitted, the module-level
            ``input`` list filled by the loading code above is used, so the
            original zero-argument call ``part2()`` keeps working.

    Returns:
        The integer total over all lines. Lines in which no digit or digit word
        is found (blank lines included) contribute nothing.
    """
    num_words = {
        "zero": "0",
        "one": "1",
        "two": "2",
        "three": "3",
        "four": "4",
        "five": "5",
        "six": "6",
        "seven": "7",
        "eight": "8",
        "nine": "9",
    }

    if lines is None:
        lines = input

    # A running total is used instead of sum(): in a kernel where the PySpark
    # imports have been executed, the name `sum` no longer refers to the built-in.
    total = 0
    for line in lines:
        found = []

        for index, char in enumerate(line):
            if char in "0123456789":
                found.append(char)
                continue
            for word, digit in num_words.items():
                # startswith(word, index) is False when the word would run past
                # the end of the line, so no explicit bounds check is needed.
                if line.startswith(word, index):
                    found.append(digit)
                    break

        if not found:
            # Nothing to combine; int("") would raise ValueError here.
            continue
        total += int(found[0] + found[-1])

    return total


# Report both answers; the f-string already converts the integers to text.
print(f"Part 1: {part1()}")
print(f"Part 2: {part2()}")

Part 1: 54573
Part 2: 54591
