In [1]:
# Run findspark.init() in this first cell, ahead of the later cell that imports
# pyspark (presumably this is what makes the local Spark install importable --
# confirm against the findspark documentation).
import findspark
findspark.init()

In [3]:
import pyspark
import random

In [5]:
# Monte Carlo estimate of pi: draw `num_samples` random points in the unit
# square and count how many land inside the quarter circle x^2 + y^2 < 1.
# That fraction approximates pi / 4.
sc = pyspark.SparkContext(appName="Pi")
try:
    num_samples = 10000

    def inside(p):
        """Return True when a freshly drawn random point lies inside the unit circle.

        `p` is the RDD element handed over by ``filter`` (here a sample index).
        It is intentionally unused: every call draws its own (x, y) pair.
        """
        x, y = random.random(), random.random()
        return x*x + y*y < 1

    count = sc.parallelize(range(0, num_samples)).filter(inside).count()
    pi = 4 * count / num_samples
    print(pi)
finally:
    # Stop the context even when the job fails; otherwise it stays active and
    # the next cell that constructs a SparkContext cannot start a new one.
    sc.stop()

3.1448


In [7]:
# Obtain a SparkContext named "newsc". getOrCreate returns the already-active
# context when there is one (e.g. when this cell is re-run) instead of raising,
# and only builds a new context from the given configuration otherwise.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("newsc"))
# Distribute a small local list as an RDD; as the last expression of the cell,
# its repr is what the notebook displays.
sc.parallelize([1,2,3])

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:489

In [8]:
# Echo the `sc` object so the notebook renders it as this cell's output.
sc

In [11]:
from pyspark.sql import SparkSession

# Build the SparkSession used by the DataFrame / SQL cells below.
# getOrCreate() hands back an existing session when one is already running.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [12]:
# Echo the `spark` session object so the notebook renders it as this cell's output.
spark

In [28]:
# reading CSV with custom schema.
from pyspark.sql.types import *

# Explicit schema for the account file: (column name, Spark type) pairs in
# file order. Every column is declared nullable.
_account_columns = [
    ("account_id", IntegerType()),
    ("district_id", IntegerType()),
    ("frequency", StringType()),
    ("date", IntegerType()),
]
customSchema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _account_columns]
)

In [32]:
# Read the ';'-delimited account file (first line is a header) from S3,
# applying the hand-written `customSchema` instead of inferring column types.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('delimiter', ';')
    .load('s3://justinngbucket/Data/berka_dataset/account.asc', schema=customSchema)
)

In [33]:
# Print the leading rows of the loaded frame as a text table.
df.show()

+----------+-----------+------------------+------+
|account_id|district_id|         frequency|  date|
+----------+-----------+------------------+------+
|       576|         55|  POPLATEK MESICNE|930101|
|      3818|         74|  POPLATEK MESICNE|930101|
|       704|         55|  POPLATEK MESICNE|930101|
|      2378|         16|  POPLATEK MESICNE|930101|
|      2632|         24|  POPLATEK MESICNE|930102|
|      1972|         77|  POPLATEK MESICNE|930102|
|      1539|          1|POPLATEK PO OBRATU|930103|
|       793|         47|  POPLATEK MESICNE|930103|
|      2484|         74|  POPLATEK MESICNE|930103|
|      1695|         76|  POPLATEK MESICNE|930103|
|      1726|         48|  POPLATEK MESICNE|930103|
|      2881|         70|  POPLATEK MESICNE|930104|
|      2357|         19|  POPLATEK MESICNE|930104|
|      2177|         62|  POPLATEK MESICNE|930104|
|       485|          6|POPLATEK PO OBRATU|930104|
|       652|         21|  POPLATEK MESICNE|930105|
|      9635|         70|  POPLA

In [34]:
# Print column names, types and nullability to confirm the custom schema was applied.
df.printSchema()

root
 |-- account_id: integer (nullable = true)
 |-- district_id: integer (nullable = true)
 |-- frequency: string (nullable = true)
 |-- date: integer (nullable = true)



In [35]:
# Read the same account file again, this time letting Spark infer the column
# types from the data. Note that this rebinds `df`, replacing the frame that
# was loaded with the explicit schema.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('delimiter', ';')
    .option('inferschema', 'true')
    .load('s3://justinngbucket/Data/berka_dataset/account.asc')
)

In [36]:
# Print the inferred schema so it can be compared with the hand-written one.
df.printSchema()

root
 |-- account_id: integer (nullable = true)
 |-- district_id: integer (nullable = true)
 |-- frequency: string (nullable = true)
 |-- date: integer (nullable = true)



In [37]:
# Project only the `date` column and print its leading rows.
df.select('date').show()

+------+
|  date|
+------+
|930101|
|930101|
|930101|
|930101|
|930102|
|930102|
|930103|
|930103|
|930103|
|930103|
|930103|
|930104|
|930104|
|930104|
|930104|
|930105|
|930105|
|930106|
|930106|
|930107|
+------+
only showing top 20 rows



In [38]:
# Column expressions: the account id as-is, and the integer `date` value
# incremented by one (plain integer arithmetic, not calendar arithmetic).
account_id_col = df['account_id']
date_plus_one_col = df['date'] + 1
df.select(account_id_col, date_plus_one_col).show()

+----------+----------+
|account_id|(date + 1)|
+----------+----------+
|       576|    930102|
|      3818|    930102|
|       704|    930102|
|      2378|    930102|
|      2632|    930103|
|      1972|    930103|
|      1539|    930104|
|       793|    930104|
|      2484|    930104|
|      1695|    930104|
|      1726|    930104|
|      2881|    930105|
|      2357|    930105|
|      2177|    930105|
|       485|    930105|
|       652|    930106|
|      9635|    930106|
|      1844|    930107|
|      1926|    930107|
|      2393|    930108|
+----------+----------+
only showing top 20 rows



In [39]:
# Keep only the rows whose `date` column equals 930101, then print them.
is_930101 = df['date'] == 930101
df.filter(is_930101).show()

+----------+-----------+----------------+------+
|account_id|district_id|       frequency|  date|
+----------+-----------+----------------+------+
|       576|         55|POPLATEK MESICNE|930101|
|      3818|         74|POPLATEK MESICNE|930101|
|       704|         55|POPLATEK MESICNE|930101|
|      2378|         16|POPLATEK MESICNE|930101|
+----------+-----------+----------------+------+



In [40]:
# Count how many rows share each `date` value and print the leading groups.
(
    df
    .groupBy("date")
    .count()
    .show()
)

+------+-----+
|  date|count|
+------+-----+
|941115|    3|
|950810|    1|
|960223|    4|
|960415|    5|
|960713|    6|
|971109|    5|
|971217|    1|
|930709|    4|
|931030|    1|
|941024|    1|
|950827|    6|
|951104|    1|
|970617|    1|
|970708|    4|
|930805|    2|
|950101|    1|
|950108|    1|
|950828|    1|
|970612|    2|
|930102|    2|
+------+-----+
only showing top 20 rows



In [41]:
# Register `df` under the name "accounts" so it can be referenced from
# spark.sql queries; an existing view with that name is replaced.
df.createOrReplaceTempView("accounts")

In [42]:
# Query the "accounts" temp view with SQL; the result is itself a DataFrame.
sqlDF = spark.sql("SELECT * FROM accounts")
# Print the leading rows of the query result.
sqlDF.show()

+----------+-----------+------------------+------+
|account_id|district_id|         frequency|  date|
+----------+-----------+------------------+------+
|       576|         55|  POPLATEK MESICNE|930101|
|      3818|         74|  POPLATEK MESICNE|930101|
|       704|         55|  POPLATEK MESICNE|930101|
|      2378|         16|  POPLATEK MESICNE|930101|
|      2632|         24|  POPLATEK MESICNE|930102|
|      1972|         77|  POPLATEK MESICNE|930102|
|      1539|          1|POPLATEK PO OBRATU|930103|
|       793|         47|  POPLATEK MESICNE|930103|
|      2484|         74|  POPLATEK MESICNE|930103|
|      1695|         76|  POPLATEK MESICNE|930103|
|      1726|         48|  POPLATEK MESICNE|930103|
|      2881|         70|  POPLATEK MESICNE|930104|
|      2357|         19|  POPLATEK MESICNE|930104|
|      2177|         62|  POPLATEK MESICNE|930104|
|       485|          6|POPLATEK PO OBRATU|930104|
|       652|         21|  POPLATEK MESICNE|930105|
|      9635|         70|  POPLA