In [72]:
# Locate the Spark installation before anything from pyspark is imported;
# presumably this is what makes `pyspark` importable below, so the call
# order matters -- TODO confirm for this environment.
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Reuse the active session if there is one, otherwise create one named 'test'.
spark = SparkSession.builder.appName('test').getOrCreate()

In [73]:
spark

In [74]:
# Load the raw trade records: the first CSV row is the header and the column
# types are inferred from the data.
df = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('s3://hzhang502/test')
)

In [75]:
df.show(5)

+------------+--------+--------------------+------------+--------+----------+-------------------+-----+----------+--------+--------+--------+------------+--------------+
|        ISIN|Mnemonic|        SecurityDesc|SecurityType|Currency|SecurityID|               Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|TradedVolume|NumberOfTrades|
+------------+--------+--------------------+------------+--------+----------+-------------------+-----+----------+--------+--------+--------+------------+--------------+
|DE0007472060|     WDI|         WIRECARD AG|Common stock|     EUR|   2505101|2018-07-26 00:00:00|07:00|    161.55|   161.7|   159.8|   160.2|        8823|            45|
|DE0007164600|     SAP|         SAP SE O.N.|Common stock|     EUR|   2505077|2018-07-26 00:00:00|07:00|    101.66|  101.84|  101.66|  101.84|       53051|            46|
|DE0006047004|     HEI|HEIDELBERGCEMENT ...|Common stock|     EUR|   2505002|2018-07-26 00:00:00|07:00|     70.88|   70.88|    70.8|   70.82|        3

In [76]:
df.printSchema()

root
 |-- ISIN: string (nullable = true)
 |-- Mnemonic: string (nullable = true)
 |-- SecurityDesc: string (nullable = true)
 |-- SecurityType: string (nullable = true)
 |-- Currency: string (nullable = true)
 |-- SecurityID: integer (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Time: string (nullable = true)
 |-- StartPrice: double (nullable = true)
 |-- MaxPrice: double (nullable = true)
 |-- MinPrice: double (nullable = true)
 |-- EndPrice: double (nullable = true)
 |-- TradedVolume: integer (nullable = true)
 |-- NumberOfTrades: integer (nullable = true)



In [77]:
# Columns that none of the later cells read.
drop = ['ISIN', 'Currency', 'SecurityID', 'TradedVolume', 'NumberOfTrades']
# Remove them in a single call. The previous version looped with `col` as the
# loop variable, which left a plain string bound to `col`; re-running the cell
# after `from pyspark.sql.functions import col` (used by later cells) would
# shadow that function and break every `col(...)` call.
df = df.drop(*drop)

In [78]:
df.show(5)

+--------+--------------------+------------+-------------------+-----+----------+--------+--------+--------+
|Mnemonic|        SecurityDesc|SecurityType|               Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|
+--------+--------------------+------------+-------------------+-----+----------+--------+--------+--------+
|     WDI|         WIRECARD AG|Common stock|2018-07-26 00:00:00|07:00|    161.55|   161.7|   159.8|   160.2|
|     SAP|         SAP SE O.N.|Common stock|2018-07-26 00:00:00|07:00|    101.66|  101.84|  101.66|  101.84|
|     HEI|HEIDELBERGCEMENT ...|Common stock|2018-07-26 00:00:00|07:00|     70.88|   70.88|    70.8|   70.82|
|     JEN|    JENOPTIK AG O.N.|Common stock|2018-07-26 00:00:00|07:00|      34.2|   34.26|   34.16|   34.16|
|     DBK|DEUTSCHE BANK AG ...|Common stock|2018-07-26 00:00:00|07:00|     10.33|   10.38|   10.29|  10.378|
+--------+--------------------+------------+-------------------+-----+----------+--------+--------+--------+
only showing top 5 

In [79]:
df.count()

136707

In [178]:
df.groupBy('Mnemonic').count().orderBy('count').show(25)

+--------+-----+
|Mnemonic|count|
+--------+-----+
|    null|    1|
|    I8IC|    2|
|    SPYL|    2|
|    DVEU|    2|
|    ZPR3|    2|
|    VG82|    2|
|    H4ZN|    2|
|    UNIN|    2|
|    TNE2|    2|
|    FVUI|    2|
|    INDM|    2|
|     BYG|    2|
|    VWSA|    2|
|    I8IE|    2|
|    GMMV|    2|
|    DPWA|    2|
|    I8IF|    2|
|     BRH|    2|
|    ERCA|    2|
|    SPPL|    2|
|    SYBX|    2|
|    IESE|    2|
|    NGLD|    2|
|    ASMF|    2|
|    WSOJ|    2|
+--------+-----+
only showing top 25 rows



In [80]:
# The inferred 'Date' column is a timestamp at midnight; keep only the date.
df_new = df.withColumn('Date', df.Date.cast('date'))

In [81]:
# Restrict the data set to rows whose SecurityType is 'Common stock'.
df_new = df_new.where(df_new.SecurityType == 'Common stock')

In [82]:
import pyspark
import pyspark.sql.functions as fct
# Break the 'HH:MM' Time string at the colon and expose both parts as columns.
split_col = fct.split(df_new['Time'], ':')
df1 = (
    df_new
    .withColumn('Hour', split_col[0])
    .withColumn('Minute', split_col[1])
)

In [83]:
df1.show(5)

+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+
|Mnemonic|        SecurityDesc|SecurityType|      Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|Hour|Minute|
+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+
|     WDI|         WIRECARD AG|Common stock|2018-07-26|07:00|    161.55|   161.7|   159.8|   160.2|  07|    00|
|     SAP|         SAP SE O.N.|Common stock|2018-07-26|07:00|    101.66|  101.84|  101.66|  101.84|  07|    00|
|     HEI|HEIDELBERGCEMENT ...|Common stock|2018-07-26|07:00|     70.88|   70.88|    70.8|   70.82|  07|    00|
|     JEN|    JENOPTIK AG O.N.|Common stock|2018-07-26|07:00|      34.2|   34.26|   34.16|   34.16|  07|    00|
|     DBK|DEUTSCHE BANK AG ...|Common stock|2018-07-26|07:00|     10.33|   10.38|   10.29|  10.378|  07|    00|
+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+

In [84]:
df1.printSchema()

root
 |-- Mnemonic: string (nullable = true)
 |-- SecurityDesc: string (nullable = true)
 |-- SecurityType: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Time: string (nullable = true)
 |-- StartPrice: double (nullable = true)
 |-- MaxPrice: double (nullable = true)
 |-- MinPrice: double (nullable = true)
 |-- EndPrice: double (nullable = true)
 |-- Hour: string (nullable = true)
 |-- Minute: string (nullable = true)



In [85]:
# Hour and Minute come out of split() as strings; cast both to integers so
# the time filters below compare numerically.
for part in ('Hour', 'Minute'):
    df1 = df1.withColumn(part, df1[part].cast('int'))

In [86]:
# Discard everything recorded before 09:00.
df2 = df1.where(df1.Hour >= 9)

In [87]:
df2.printSchema()

root
 |-- Mnemonic: string (nullable = true)
 |-- SecurityDesc: string (nullable = true)
 |-- SecurityType: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Time: string (nullable = true)
 |-- StartPrice: double (nullable = true)
 |-- MaxPrice: double (nullable = true)
 |-- MinPrice: double (nullable = true)
 |-- EndPrice: double (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Minute: integer (nullable = true)



In [88]:
# Keep rows up to and including 17:30. The first clause must be a strict
# `< 17`: with `<= 17` every minute of hour 17 passed the filter and the
# 17:30 cut-off in the second clause never had any effect.
df3 = df2.filter((df2['Hour'] < 17) | ((df2['Hour'] == 17) & (df2['Minute'] <= 30)))

In [89]:
df3.show(10)

+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+
|Mnemonic|        SecurityDesc|SecurityType|      Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|Hour|Minute|
+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+
|     FRA|FRAPORT AG FFM.AI...|Common stock|2018-07-25|14:00|     85.64|   85.66|   85.64|   85.66|  14|     0|
|     ADJ|ADO PROPERTIES S....|Common stock|2018-07-25|14:00|     48.26|   48.26|   48.26|   48.26|  14|     0|
|    SOBA|AT + T INC.      ...|Common stock|2018-07-25|14:00|     26.28|   26.28|   26.28|   26.28|  14|     0|
|     O2D|TELEFONICA DTLD H...|Common stock|2018-07-25|14:00|      3.94|   3.943|    3.94|   3.943|  14|     0|
|     DWS|DWS GROUP GMBH+CO...|Common stock|2018-07-25|14:00|    27.345|  27.395|  27.345|  27.395|  14|     0|
|     CON| CONTINENTAL AG O.N.|Common stock|2018-07-25|14:00|     193.5|   193.5|   193.5|   193.5|  14|

In [90]:
df3.createOrReplaceTempView('df3')

In [91]:
#dates = spark.sql("Select distinct Date from df3")

In [92]:
#dates.show(100)

+----------+
|      Date|
+----------+
|2018-07-25|
|2018-07-26|
+----------+



In [93]:
#time = spark.sql("Select distinct Time from df3")

In [177]:
#time.show(1)

+-----+
| Time|
+-----+
|09:10|
+-----+
only showing top 1 row



In [49]:
#dates.createOrReplaceTempView('date')

In [96]:
# TODO: turn this into a function (original note, romanized Chinese: "xie cheng function").
# Select the rows of a single instrument, Mnemonic 'AMZ', and preview them.
stock1 = df3.filter(df3['Mnemonic'] == 'AMZ')
stock1.show(10)

+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+
|Mnemonic|        SecurityDesc|SecurityType|      Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|Hour|Minute|
+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-25|14:00|   1569.95| 1570.75| 1569.64| 1570.75|  14|     0|
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-25|14:01|    1570.5|  1570.5|  1570.5|  1570.5|  14|     1|
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-25|14:08|   1566.01| 1566.02| 1566.01| 1566.02|  14|     8|
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-25|14:13|   1566.32| 1566.32| 1566.32| 1566.32|  14|    13|
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-25|14:15|    1566.6|  1566.6| 1565.32| 1565.32|  14|    15|
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-25|14:17|   1567.78| 1567.78| 1567.78| 1567.78|  14|

In [135]:
stock1.createOrReplaceTempView('stock1')

In [97]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

# Rank each day's rows by MaxPrice, highest first. The ordering column is
# taken from `stock1` itself; it used to be `df['MaxPrice']`, a reference into
# an earlier DataFrame that only resolved because the column passes through
# the intermediate steps unchanged.
window = Window.partitionBy(stock1['Date']).orderBy(stock1['MaxPrice'].desc())

# rank() gives tied rows the same rank, so a day may contribute several rows.
max_df = stock1.select('*', rank().over(window).alias('Max')).filter(col('Max') <= 1)

In [98]:
max_df.show()

+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+---+
|Mnemonic|        SecurityDesc|SecurityType|      Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|Hour|Minute|Max|
+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+---+
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-25|15:35|   1573.08| 1573.08| 1573.08| 1573.08|  15|    35|  1|
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-26|13:32|   1576.74| 1578.27| 1576.74| 1578.27|  13|    32|  1|
+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+---+



In [100]:
max_df.createOrReplaceTempView('max')

In [101]:
# Rank each day's rows by MinPrice, lowest first. The ordering column is
# taken from `stock1` itself; it used to be `df['MinPrice']`, a reference into
# an earlier DataFrame that only resolved because the column passes through
# the intermediate steps unchanged.
window1 = Window.partitionBy(stock1['Date']).orderBy(stock1['MinPrice'].asc())

# rank() gives tied rows the same rank, so a day may contribute several rows.
min_df = stock1.select('*', rank().over(window1).alias('Min')).filter(col('Min') <= 1)

In [161]:
min_df.show()

+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+---+
|Mnemonic|        SecurityDesc|SecurityType|      Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|Hour|Minute|Min|
+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+---+
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-25|13:32|    1560.0|  1560.0| 1558.64| 1558.64|  13|    32|  1|
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-26|13:53|   1556.75| 1556.75| 1556.75| 1556.75|  13|    53|  1|
+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+---+



In [102]:
min_df.createOrReplaceTempView('min')

In [131]:
from pyspark.sql.functions import rank, desc, col
# Number each day's rows in chronological order; time_rank == 1 marks the
# earliest (Hour, Minute) of the day.
ranked = stock1.withColumn(
    "time_rank",
    rank().over(
        Window.partitionBy(stock1['Date']).orderBy("Hour", "Minute")
    ),
)

In [128]:
#ranked.createOrReplaceTempView('ranked')

In [129]:
from pyspark.sql.functions import rank, desc, col
# Number each day's rows in reverse chronological order; time_rankdown == 1
# marks the latest (Hour, Minute) of the day.
rankdown = ranked.withColumn(
    "time_rankdown",
    rank().over(
        Window.partitionBy(ranked['Date']).orderBy(desc("Hour"), desc("Minute"))
    ),
)

In [132]:
#rankdown.createOrReplaceTempView('rankdown')

In [133]:
# Keep only the first and the last row of every day.
is_first = col('time_rank') == 1
is_last = col('time_rankdown') == 1
start_end = rankdown.filter(is_first | is_last)
start_end.show(10)

+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+---------+-------------+
|Mnemonic|        SecurityDesc|SecurityType|      Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|Hour|Minute|time_rank|time_rankdown|
+--------+--------------------+------------+----------+-----+----------+--------+--------+--------+----+------+---------+-------------+
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-25|15:35|   1573.08| 1573.08| 1573.08| 1573.08|  15|    35|      124|            1|
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-25|09:01|   1567.61| 1567.61| 1567.61| 1567.61|   9|     1|        1|          124|
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-26|15:35|   1566.98| 1566.98| 1566.98| 1566.98|  15|    35|      202|            1|
|     AMZ|AMAZON.COM INC.  ...|Common stock|2018-07-26|09:00|   1566.49| 1566.49| 1566.49| 1566.49|   9|     0|        1|          202|
+--------+--------------------+------------+----

In [134]:
start_end.createOrReplaceTempView('start_end')

In [158]:
# Step 1 of the daily summary: one row per (Mnemonic, Date) with that day's
# top-ranked MaxPrice. The join matches every `stock1` row of the date against
# the rank-1 rows of `max`; the GROUP BY collapses the resulting duplicates.
final = spark.sql("""SELECT s.Mnemonic, s.Date, a.MaxPrice
FROM max a INNER JOIN stock1 s On a.Max == 1 AND s.Date == a.Date
GROUP BY s.Mnemonic, s.Date, a.MaxPrice
""")

In [159]:
final.show()

+--------+----------+--------+
|Mnemonic|      Date|MaxPrice|
+--------+----------+--------+
|     AMZ|2018-07-26| 1578.27|
|     AMZ|2018-07-25| 1573.08|
+--------+----------+--------+



In [160]:
final.createOrReplaceTempView('final')

In [163]:
# Step 2: attach the day's rank-1 MinPrice from the `min` view, matching on
# Date. `final` here refers to the temp view registered from the previous
# result; the Python name is then rebound to the widened result.
final = spark.sql("""SELECT f.Mnemonic, f.Date, f.MaxPrice, i.MinPrice
FROM min i INNER JOIN final f On i.Min == 1 AND i.Date == f.Date
GROUP BY f.Mnemonic, f.Date, f.MaxPrice, i.MinPrice
""")

In [164]:
final.show()

+--------+----------+--------+--------+
|Mnemonic|      Date|MaxPrice|MinPrice|
+--------+----------+--------+--------+
|     AMZ|2018-07-26| 1578.27| 1556.75|
|     AMZ|2018-07-25| 1573.08| 1558.64|
+--------+----------+--------+--------+



In [166]:
final.createOrReplaceTempView('final')

In [167]:
# Step 3: attach StartPrice from the chronologically first row of each day
# (time_rank == 1 in `start_end`), matching on Date.
final = spark.sql("""SELECT f.Mnemonic, f.Date, f.MaxPrice, f.MinPrice, s.StartPrice
FROM start_end s INNER JOIN final f On s.time_rank == 1 AND s.Date == f.Date
GROUP BY f.Mnemonic, f.Date, f.MaxPrice, f.MinPrice, s.StartPrice
""")

In [168]:
final.show()

+--------+----------+--------+--------+----------+
|Mnemonic|      Date|MaxPrice|MinPrice|StartPrice|
+--------+----------+--------+--------+----------+
|     AMZ|2018-07-26| 1578.27| 1556.75|   1566.49|
|     AMZ|2018-07-25| 1573.08| 1558.64|   1567.61|
+--------+----------+--------+--------+----------+



In [169]:
# Re-register the current result so the SQL below can refer to it as `final`.
final.createOrReplaceTempView('final')
# Step 4: attach EndPrice from the chronologically last row of each day
# (time_rankdown == 1 in `start_end`), matching on Date.
final = spark.sql("""SELECT f.Mnemonic, f.Date, f.MaxPrice, f.MinPrice, f.StartPrice, s.EndPrice
FROM start_end s INNER JOIN final f On s.time_rankdown == 1 AND s.Date == f.Date
GROUP BY f.Mnemonic, f.Date, f.MaxPrice, f.MinPrice, f.StartPrice, s.EndPrice
""")
final.show()

+--------+----------+--------+--------+----------+--------+
|Mnemonic|      Date|MaxPrice|MinPrice|StartPrice|EndPrice|
+--------+----------+--------+--------+----------+--------+
|     AMZ|2018-07-26| 1578.27| 1556.75|   1566.49| 1566.98|
|     AMZ|2018-07-25| 1573.08| 1558.64|   1567.61| 1573.08|
+--------+----------+--------+--------+----------+--------+



In [172]:
final = final.orderBy('Date')

In [None]:
# Write the summary with a header row and replace any earlier export. The
# default save mode raises when the path already exists, so the cell could not
# be re-run, and this path is later read back with header=true, where a
# header-less file would lose its first data row to the header.
final.write.csv('s3://hzhang502/amz', header=True, mode='overwrite')

In [192]:
final.show()

+--------+----------+--------+--------+----------+--------+
|Mnemonic|      Date|MaxPrice|MinPrice|StartPrice|EndPrice|
+--------+----------+--------+--------+----------+--------+
|     AMZ|2018-07-25| 1573.08| 1558.64|   1567.61| 1573.08|
|     AMZ|2018-07-26| 1578.27| 1556.75|   1566.49| 1566.98|
+--------+----------+--------+--------+----------+--------+



In [193]:
# Export the ordered summary as CSV with a header row, replacing whatever is
# already stored under this prefix.
writer = final.write.format("csv").mode("Overwrite").option("header","true")
writer.save("s3://hzhang502/amz")

In [179]:
# Mnemonics of the 20 stocks to analyze.
stocks = [
    'AMZ', 'EBA', 'NFC', 'FB2A', 'MSF', 'TWR',
    'DBK', 'DAI', 'CBK', 'ALV',
    'BMW', 'AIR', 'VOW3',
    'SIE', 'PHIA', 'ADS', 'CON',
    'BAS', 'BAYN', '1COV',
]

In [181]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, desc, col

def create_csv(stockname):
    """Write the per-day price summary of one stock to S3 as CSV.

    For every date on which ``stockname`` appears in ``df3`` the export holds
    the day's top-ranked ``MaxPrice``, its bottom-ranked ``MinPrice``, the
    ``StartPrice`` of the chronologically first row and the ``EndPrice`` of
    the chronologically last row, ordered by ``Date``.

    Args:
        stockname: Value of the ``Mnemonic`` column to select; it is also
            appended to ``'s3://hzhang502/'`` to form the output location.

    Returns:
        None. The result is written with a header row, overwriting any
        existing data at the output location.

    Side effects:
        Re-registers the temp views ``stock1``, ``max``, ``min``,
        ``start_end`` and ``final`` on the shared ``spark`` session, replacing
        any views of the same names.
    """
    stock1 = df3.filter(df3['Mnemonic'] == stockname)
    stock1.createOrReplaceTempView('stock1')

    # Rows holding each day's highest MaxPrice / lowest MinPrice. The ordering
    # columns come from `stock1` itself rather than from the unrelated global
    # `df`, so the function no longer depends on that earlier DataFrame.
    window = Window.partitionBy(stock1['Date']).orderBy(stock1['MaxPrice'].desc())
    max_df = stock1.select('*', rank().over(window).alias('Max')).filter(col('Max') <= 1)
    max_df.createOrReplaceTempView('max')
    window1 = Window.partitionBy(stock1['Date']).orderBy(stock1['MinPrice'].asc())
    min_df = stock1.select('*', rank().over(window1).alias('Min')).filter(col('Min') <= 1)
    min_df.createOrReplaceTempView('min')

    # First and last row of each day: time_rank == 1 is the earliest
    # (Hour, Minute), time_rankdown == 1 the latest.
    ranked = stock1.withColumn("time_rank",rank().over(Window.partitionBy(stock1['Date']).orderBy("Hour", "Minute")))
    rankdown = ranked.withColumn("time_rankdown",rank().over(Window.partitionBy(ranked['Date']).orderBy(desc("Hour"), desc("Minute"))))
    start_end = rankdown.filter((col('time_rank') == 1) | (col('time_rankdown') == 1))
    start_end.createOrReplaceTempView('start_end')

    # Build the summary one column at a time. After each step the result is
    # re-registered as the `final` view so the next query can join against it.
    final = spark.sql("""SELECT s.Mnemonic, s.Date, a.MaxPrice
FROM max a INNER JOIN stock1 s On a.Max == 1 AND s.Date == a.Date
GROUP BY s.Mnemonic, s.Date, a.MaxPrice
""")
    final.createOrReplaceTempView('final')
    final = spark.sql("""SELECT f.Mnemonic, f.Date, f.MaxPrice, i.MinPrice
FROM min i INNER JOIN final f On i.Min == 1 AND i.Date == f.Date
GROUP BY f.Mnemonic, f.Date, f.MaxPrice, i.MinPrice
""")
    final.createOrReplaceTempView('final')
    final = spark.sql("""SELECT f.Mnemonic, f.Date, f.MaxPrice, f.MinPrice, s.StartPrice
FROM start_end s INNER JOIN final f On s.time_rank == 1 AND s.Date == f.Date
GROUP BY f.Mnemonic, f.Date, f.MaxPrice, f.MinPrice, s.StartPrice
""")
    final.createOrReplaceTempView('final')
    final = spark.sql("""SELECT f.Mnemonic, f.Date, f.MaxPrice, f.MinPrice, f.StartPrice, s.EndPrice
FROM start_end s INNER JOIN final f On s.time_rankdown == 1 AND s.Date == f.Date
GROUP BY f.Mnemonic, f.Date, f.MaxPrice, f.MinPrice, f.StartPrice, s.EndPrice
""")
    final = final.orderBy('Date')

    # Save under the stock's own prefix. The earlier version left `.save(...)`
    # dangling on a separate line (a syntax error) and pointed it at a fixed,
    # unrelated location, so `path` was never used and every stock would have
    # overwritten the same output.
    path = 's3://hzhang502/' + stockname
    final.write.format("csv").option("header","true").mode("Overwrite").save(path)

In [182]:
# Run the export once for every mnemonic in `stocks`.
for s in stocks:
    create_csv(s)

In [194]:
# Read the exported AMZ summary back in to check it; the header row supplies
# the column names and the types are inferred again.
testpd = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('s3://hzhang502/amz')
)

In [195]:
testpd.show()

+--------+-------------------+--------+--------+----------+--------+
|Mnemonic|               Date|MaxPrice|MinPrice|StartPrice|EndPrice|
+--------+-------------------+--------+--------+----------+--------+
|     AMZ|2018-07-25 00:00:00| 1573.08| 1558.64|   1567.61| 1573.08|
|     AMZ|2018-07-26 00:00:00| 1578.27| 1556.75|   1566.49| 1566.98|
+--------+-------------------+--------+--------+----------+--------+



In [196]:
import pandas as pd

In [198]:
# Empty placeholder; `dfpd` is reassigned from `testpd.toPandas()` in a later cell.
dfpd = pd.DataFrame()


In [201]:
# Turn on Arrow-based transfer for the Spark -> pandas conversion that follows.
# NOTE(review): newer Spark releases rename this key to
# "spark.sql.execution.arrow.pyspark.enabled" -- confirm against the cluster version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [None]:
dfpd = testpd.toPandas()

In [None]:
Tech: AMZ EBA NFC FB2A MSF TWR
Finance: DBK DAI CBK ALV (Allianz)
Industry: BMW AIR VOW3
Manufacturing: SIE PHIA ADS CON (car parts)
Chemical: BAS (BASF) BAYN 1COV