In [1]:
import findspark

# Must run before the pyspark import below: findspark.init() is what makes
# the pyspark package importable in this kernel.
findspark.init()

from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start one named 'test'.
spark = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)

In [2]:
# Display the session object created in the first cell as this cell's output.
spark

In [76]:
# Read the CSV data at 's3://hzhang502/test'. The header row supplies the
# column names and Spark infers each column's type from the data.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('s3://hzhang502/test')
)

In [7]:
# Preview the first five rows of the freshly loaded data.
df.show(n=5)

+------------+--------+--------------------+------------+--------+----------+-------------------+-----+----------+--------+--------+--------+------------+--------------+
|        ISIN|Mnemonic|        SecurityDesc|SecurityType|Currency|SecurityID|               Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|TradedVolume|NumberOfTrades|
+------------+--------+--------------------+------------+--------+----------+-------------------+-----+----------+--------+--------+--------+------------+--------------+
|DE000BFB0019|     B4B|  METRO AG   ST O.N.|Common stock|     EUR|   2532386|2018-10-08 00:00:00|07:00|     13.53|   13.53|  13.525|  13.525|        2614|             5|
|DE0006969603|     PUM|             PUMA SE|Common stock|     EUR|   2505067|2018-10-08 00:00:00|07:00|     421.5|   421.5|   420.5|   421.5|          95|             4|
|NL0011375019|     SNH|STEINHOFF INT.HLD...|Common stock|     EUR|   2506267|2018-10-08 00:00:00|07:00|     0.141|   0.141|   0.141|   0.141|      264

In [77]:
# Print the column names and the types Spark inferred while loading the CSV.
df.printSchema()

root
 |-- ISIN: string (nullable = true)
 |-- Mnemonic: string (nullable = true)
 |-- SecurityDesc: string (nullable = true)
 |-- SecurityType: string (nullable = true)
 |-- Currency: string (nullable = true)
 |-- SecurityID: integer (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Time: string (nullable = true)
 |-- StartPrice: double (nullable = true)
 |-- MaxPrice: double (nullable = true)
 |-- MinPrice: double (nullable = true)
 |-- EndPrice: double (nullable = true)
 |-- TradedVolume: integer (nullable = true)
 |-- NumberOfTrades: integer (nullable = true)



In [78]:
# Columns that are not referenced by any later cell of this notebook.
drop = ['ISIN', 'Currency', 'SecurityID', 'TradedVolume', 'NumberOfTrades']

# DataFrame.drop accepts several column names at once, so a single call
# replaces the per-column loop and no longer leaves a stray `col` variable
# behind in the notebook namespace.
df = df.drop(*drop)

In [79]:
# Preview the first five rows after the unused columns were removed.
df.show(n=5)

+--------+--------------------+------------+-------------------+-----+----------+--------+--------+--------+
|Mnemonic|        SecurityDesc|SecurityType|               Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|
+--------+--------------------+------------+-------------------+-----+----------+--------+--------+--------+
|     B4B|  METRO AG   ST O.N.|Common stock|2018-10-08 00:00:00|07:00|     13.53|   13.53|  13.525|  13.525|
|     PUM|             PUMA SE|Common stock|2018-10-08 00:00:00|07:00|     421.5|   421.5|   420.5|   421.5|
|     SNH|STEINHOFF INT.HLD...|Common stock|2018-10-08 00:00:00|07:00|     0.141|   0.141|   0.141|   0.141|
|     IGY|INNOGY SE  INH. O.N.|Common stock|2018-10-08 00:00:00|07:00|     38.31|   38.45|   38.31|   38.45|
|     IFX|INFINEON TECH.AG ...|Common stock|2018-10-08 00:00:00|07:00|      19.6|    19.6|   19.55|   19.56|
+--------+--------------------+------------+-------------------+-----+----------+--------+--------+--------+
only showing top 5 rows

In [80]:
# Number of rows in df (this triggers a Spark job over the whole dataset).
df.count()

96351

In [81]:
# Replace the timestamp-typed `Date` column with the same values cast to a
# plain date.
date_only = df['Date'].cast('date')
df_new = df.withColumn('Date', date_only)

In [82]:
# Keep only the rows whose SecurityType is exactly 'Common stock'.
is_common_stock = df_new['SecurityType'] == 'Common stock'
df_new = df_new.filter(is_common_stock)

In [83]:
# Import the functions submodule explicitly instead of relying on it having
# been loaded as a side effect of `import pyspark`. This form still binds the
# top-level name `pyspark`, exactly as the plain import did.
import pyspark.sql.functions

# Split the `Time` strings (e.g. '07:00') on ':' and expose the two pieces as
# the string columns `Hour` and `Minute`.
split_col = pyspark.sql.functions.split(df_new['Time'], ':')
df1 = (
    df_new
    .withColumn('Hour', split_col.getItem(0))
    .withColumn('Minute', split_col.getItem(1))
)

In [84]:
# Preview the first five rows with the new `Hour` and `Minute` columns.
df1.show(n=5)

+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+
|Mnemonic|        SecurityDesc|SecurityType|      Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|Hour|Minute|
+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+
|     B4B|  METRO AG   ST O.N.|Common stock|2018-10-08|07:00|     13.53|   13.53|  13.525|  13.525|  07|    00|
|     PUM|             PUMA SE|Common stock|2018-10-08|07:00|     421.5|   421.5|   420.5|   421.5|  07|    00|
|     SNH|STEINHOFF INT.HLD...|Common stock|2018-10-08|07:00|     0.141|   0.141|   0.141|   0.141|  07|    00|
|     IGY|INNOGY SE  INH. O.N.|Common stock|2018-10-08|07:00|     38.31|   38.45|   38.31|   38.45|  07|    00|
|     IFX|INFINEON TECH.AG ...|Common stock|2018-10-08|07:00|      19.6|    19.6|   19.55|   19.56|  07|    00|
+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+

In [47]:
# Print df1's column names and types.
# NOTE(review): the recorded output of this cell lists a `SecurityID` column
# and a lowercase `date` column, while the df1.show(5) output recorded just
# above has no SecurityID and a `Date` column. This cell's execution count
# (In [47]) is also lower than that of the cells before it, so the output
# most likely comes from an earlier run. Restart the kernel and run all cells
# to refresh it.
df1.printSchema()

root
 |-- Mnemonic: string (nullable = true)
 |-- SecurityDesc: string (nullable = true)
 |-- SecurityType: string (nullable = true)
 |-- SecurityID: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- Time: string (nullable = true)
 |-- StartPrice: double (nullable = true)
 |-- MaxPrice: double (nullable = true)
 |-- MinPrice: double (nullable = true)
 |-- EndPrice: double (nullable = true)
 |-- Hour: string (nullable = true)
 |-- Minute: string (nullable = true)



In [49]:
# Turn the two time-part columns into integers so that later cells can
# compare them numerically. Hour is converted first, then Minute.
for part in ('Hour', 'Minute'):
    df1 = df1.withColumn(part, df1[part].cast('int'))

In [52]:
# Discard every row whose hour is before 8.
from_eight_onwards = df1['Hour'] >= 8
df2 = df1.filter(from_eight_onwards)

In [66]:
# Print df2's column names and types; Hour and Minute should now be integers.
# NOTE(review): the recorded output of this cell still lists a `SecurityID`
# column and a lowercase `date` column, although `SecurityID` is dropped and
# the column is named `Date` in the cells above. The execution count (In [66])
# is out of order as well, so the output is probably left over from an earlier
# run. Restart the kernel and run all cells to refresh it.
df2.printSchema()

root
 |-- Mnemonic: string (nullable = true)
 |-- SecurityDesc: string (nullable = true)
 |-- SecurityType: string (nullable = true)
 |-- SecurityID: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- Time: string (nullable = true)
 |-- StartPrice: double (nullable = true)
 |-- MaxPrice: double (nullable = true)
 |-- MinPrice: double (nullable = true)
 |-- EndPrice: double (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Minute: integer (nullable = true)



In [73]:
# Keep rows up to and including 20:00: any hour through 19, plus the rows at
# exactly hour 20, minute 0.
through_nineteen = df2['Hour'] <= 19
exactly_twenty = (df2['Hour'] == 20) & (df2['Minute'] == 0)
df3 = df2.filter(through_nineteen | exactly_twenty)

In [91]:
# Persist the filtered rows as CSV under 's3://hzhang502/result'.
# - header=True writes the column names, as the input CSV was read with a
#   header row; without it the output files carry no column names.
# - mode='overwrite' replaces a previous result. With the default save mode
#   the write raises when the target path already exists, so the notebook
#   could not be re-run from top to bottom.
df3.write.csv('s3://hzhang502/result', mode='overwrite', header=True)