# Apple Stock

### Introduction:

We are going to use Apple's stock price.


### Step 1. Import the necessary libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Reuse the running Spark session if there is one; otherwise start a new one
# registered under the application name "Apple_Stock".
spark = (
    SparkSession.builder
    .appName("Apple_Stock")
    .getOrCreate()
)

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/09_Time_Series/Apple_Stock/appl_1980_2014.csv)

### Step 3. Assign it to a variable apple

In [2]:
from pyspark import SparkFiles

url = 'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/09_Time_Series/Apple_Stock/appl_1980_2014.csv'

# Have Spark fetch the remote CSV, then resolve where the downloaded copy lives.
spark.sparkContext.addFile(url)
local_csv = SparkFiles.get("appl_1980_2014.csv")

# The first line holds the column names; column types are inferred from the data.
apple = spark.read.options(header=True, inferSchema=True).csv(local_csv)


### Step 4.  Check out the type of the columns

In [3]:
# List every column name together with its Spark SQL type.
apple.dtypes

[('Date', 'string'),
 ('Open', 'double'),
 ('High', 'double'),
 ('Low', 'double'),
 ('Close', 'double'),
 ('Volume', 'int'),
 ('Adj Close', 'double')]

### Step 5. Convert the Date column to a date type

In [7]:
# Parse the string-typed 'Date' column into Spark's date type.
apple = apple.withColumn('Date', F.to_date(F.col('Date')))

In [70]:
# Swap the space in the column name for an underscore.
apple = apple.withColumnRenamed(existing='Adj Close', new='Adj_Close')

In [71]:
# Re-check the column types after converting 'Date' and renaming 'Adj Close'.
apple.dtypes

[('Date', 'date'),
 ('Open', 'double'),
 ('High', 'double'),
 ('Low', 'double'),
 ('Close', 'double'),
 ('Volume', 'int'),
 ('Adj_Close', 'double')]

### Step 6.  Set the date as the index

In [72]:
# Not applicable: Spark DataFrames have no row index, so there is nothing to set.

### Step 7.  Are there any duplicate dates?

In [73]:
# Distinct dates minus total dates: 0 means every date occurs exactly once,
# a negative value means some dates are repeated.
date_col = apple.select('Date')
date_col.distinct().count() - date_col.count()

0

### Step 8.  Oops... it seems the data starts from the most recent date. Make the first entry the oldest date.

In [74]:
# Put the rows in chronological order, oldest date first.
apple = apple.sort(F.col('Date').asc())

In [76]:
# Fetch the first five rows (as Row objects) to confirm the oldest dates come first.
apple.head(5)

[Row(Date=datetime.date(1980, 12, 12), Open=28.75, High=28.87, Low=28.75, Close=28.75, Volume=117258400, Adj_Close=0.45),
 Row(Date=datetime.date(1980, 12, 15), Open=27.38, High=27.38, Low=27.25, Close=27.25, Volume=43971200, Adj_Close=0.42),
 Row(Date=datetime.date(1980, 12, 16), Open=25.37, High=25.37, Low=25.25, Close=25.25, Volume=26432000, Adj_Close=0.39),
 Row(Date=datetime.date(1980, 12, 17), Open=25.87, High=26.0, Low=25.87, Close=25.87, Volume=21610400, Adj_Close=0.4),
 Row(Date=datetime.date(1980, 12, 18), Open=26.63, High=26.75, Low=26.63, Close=26.63, Volume=18362400, Adj_Close=0.41)]

### Step 9. Get the last business day of each month

In [108]:

# Tag every row with a generated 64-bit id.
# NOTE(review): Spark documents these ids as unique and increasing but not
# necessarily consecutive - confirm before treating them as row numbers.
apple = apple.withColumn(colName='id', col=F.monotonically_increasing_id())

In [109]:
# Print the leading rows as a text table to inspect the new 'id' column.
apple.show()

+----------+-----+-----+-----+-----+---------+---------+---+
|      Date| Open| High|  Low|Close|   Volume|Adj_Close| id|
+----------+-----+-----+-----+-----+---------+---------+---+
|1980-12-12|28.75|28.87|28.75|28.75|117258400|     0.45|  0|
|1980-12-15|27.38|27.38|27.25|27.25| 43971200|     0.42|  1|
|1980-12-16|25.37|25.37|25.25|25.25| 26432000|     0.39|  2|
|1980-12-17|25.87| 26.0|25.87|25.87| 21610400|      0.4|  3|
|1980-12-18|26.63|26.75|26.63|26.63| 18362400|     0.41|  4|
|1980-12-19|28.25|28.38|28.25|28.25| 12157600|     0.44|  5|
|1980-12-22|29.63|29.75|29.63|29.63|  9340800|     0.46|  6|
|1980-12-23|30.88| 31.0|30.88|30.88| 11737600|     0.48|  7|
|1980-12-24| 32.5|32.63| 32.5| 32.5| 12000800|     0.51|  8|
|1980-12-26| 35.5|35.62| 35.5| 35.5| 13893600|     0.55|  9|
|1980-12-29| 36.0|36.13| 36.0| 36.0| 23290400|     0.56| 10|
|1980-12-30|35.25|35.25|35.12|35.12| 17220000|     0.55| 11|
|1980-12-31|34.25|34.25|34.13|34.13|  8937600|     0.53| 12|
|1981-01-02| 34.5|34.75|

In [115]:
def resample_data(pdf):
    """Average every numeric column per business month-end.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Rows to aggregate. Must contain a ``Date`` column that
        ``pandas.to_datetime`` can parse; all other columns must be numeric.

    Returns
    -------
    pandas.DataFrame
        One row per month that has data in ``pdf``. ``Date`` holds the last
        business day (Mon-Fri) of that month as ``datetime.date`` and every
        other column holds the mean of its values over the month. Columns keep
        the order they have in ``pdf``; ``pdf`` itself is left unmodified.
    """
    # Imported locally so the function is self-contained when Spark ships it
    # to the Python workers through applyInPandas.
    import pandas as pd

    # assign() builds a new frame, so the caller's data is not mutated.
    dated = pdf.assign(Date=pd.to_datetime(pdf['Date']))

    # An offset object is used instead of the 'BM' string alias, which newer
    # pandas releases deprecate.
    monthly = (
        dated.resample(pd.offsets.BMonthEnd(), on='Date')
        .mean()
        .dropna(how='all')  # months without any rows yield all-NaN bins
        .reset_index()
    )

    # Plain date objects map directly onto a Spark `date` column.
    monthly['Date'] = monthly['Date'].dt.date
    return monthly[list(pdf.columns)]

In [117]:
# Layout of the frames returned by resample_data. Volume is declared double
# because a monthly average is generally not a whole number; the helper 'id'
# column is left out because it is dropped before grouping.
schema = "Date date, Open double, High double, Low double, Close double, Volume double, Adj_Close double"

# Group by calendar month (dates truncated to the first of their month) so
# each pandas frame handed to resample_data holds one month of prices.
# Grouping by the unique 'id' would create one-row groups and nothing would
# ever be aggregated. The result is sorted because grouped output has no
# guaranteed order.
month_apple = (
    apple.drop('id')
    .groupBy(F.trunc('Date', 'month'))
    .applyInPandas(resample_data, schema)
    .orderBy('Date')
)

In [None]:
# TODO: confirm the result holds one averaged row per month; an earlier attempt
# did not work the way I expected.
# Only a preview is shown here: a bare collect() would pull the whole DataFrame
# to the driver just to throw the result away.
month_apple.show(1)

### Step 10.  What is the difference in days between the most recent day and the oldest?

### Step 11.  How many months do we have in the data?

### Step 12. Plot the 'Adj Close' value. Set the size of the figure to 13.5 x 9 inches

### BONUS: Create your own question and answer it.