## Library imports

In [1]:
from itertools import cycle

import matplotlib.pyplot as plt
import numpy as np
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.functions import col, split, explode,substring, length, udf
from pyspark.sql.types import *

%matplotlib inline

## Session starting


In [2]:
# Create the SparkSession used by every later cell, or reuse one that is
# already running (getOrCreate). The app name and the config key/value are
# passed through unchanged.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

## Loading the data into a DataFrame

In [3]:
# Read every row of the `investments` table into a DataFrame and preview
# the first five rows.
# NOTE(review): the table is not created in this notebook - it is assumed to
# already exist in the session's catalog; confirm before a fresh run.
sqlDF = spark.sql(sqlQuery="SELECT * FROM investments")
sqlDF.show(n=5)

+-----------------+----+---------------+
|raised_amount_usd|year|       category|
+-----------------+----+---------------+
|            1.5E7|2014|     E-Commerce|
|            1.4E7|2015|         Drones|
|            1.4E7|2015|  Manufacturing|
|        1870000.0|2011|             3D|
|        1870000.0|2011|Computer Vision|
+-----------------+----+---------------+
only showing top 5 rows



### Aggregate the raised amount by category and year

In [4]:
# Total amount raised per (category, year).
#
# The integer cast of YEAR is aliased explicitly: the name Spark generates for
# an un-aliased CAST expression is not guaranteed to be "YEAR", and the
# vectorization step below looks the column up by exactly that name.
# TOTAL keeps the raw sum; TOTAL_DEC is the same sum cast to DECIMAL(30) so it
# is displayed without scientific notation.
# No ORDER BY is applied here; append "ORDER BY YEAR DESC, TOTAL DESC" when a
# sorted view is needed.
SQLQUERY = """
    SELECT CATEGORY,
           CAST(YEAR AS INT) AS YEAR,
           SUM(RAISED_AMOUNT_USD) AS TOTAL,
           CAST(SUM(RAISED_AMOUNT_USD) AS DECIMAL(30)) AS TOTAL_DEC
    FROM INVESTMENTS
    GROUP BY CATEGORY, YEAR
"""

In [5]:
# Execute the aggregation query; from here on `sqlDF` holds the
# per-category, per-year totals instead of the raw table rows.
sqlDF = spark.sql(sqlQuery=SQLQUERY)
sqlDF.show(n=5)

+------------------+----+--------------+----------+
|          CATEGORY|YEAR|         TOTAL| TOTAL_DEC|
+------------------+----+--------------+----------+
|  Mobile Analytics|2010|       4.205E7|  42050000|
|Big Data Analytics|2013|  2.35683979E9|2356839790|
| Online Scheduling|2012|      9.5505E7|  95505000|
|          Creative|2012|  1.65391911E8| 165391911|
|              Apps|2008|6.3178257004E8| 631782570|
+------------------+----+--------------+----------+
only showing top 5 rows



## Vectorization of the feature matrix

In [6]:
# Spark ML estimators expect their inputs as a single vector column, so YEAR
# is wrapped into a one-element vector named FEATURES. Only the columns needed
# for training are kept: the category key, the feature vector and the label.
# (VectorAssembler is imported with the other libraries at the top of the
# notebook.)
vectorAssembler = VectorAssembler(inputCols=['YEAR'], outputCol='FEATURES')
featureDF = vectorAssembler.transform(sqlDF).select('CATEGORY', 'FEATURES', 'TOTAL')

featureDF.show(5)

+------------------+--------+--------------+
|          CATEGORY|FEATURES|         TOTAL|
+------------------+--------+--------------+
|  Mobile Analytics|[2010.0]|       4.205E7|
|Big Data Analytics|[2013.0]|  2.35683979E9|
| Online Scheduling|[2012.0]|      9.5505E7|
|          Creative|[2012.0]|  1.65391911E8|
|              Apps|[2008.0]|6.3178257004E8|
+------------------+--------+--------------+
only showing top 5 rows



In [7]:
# One row per distinct CATEGORY value present in featureDF.
f = (
    featureDF
    .select('CATEGORY')
    .distinct()
)


In [8]:
# Bring the distinct category names back to the driver as a plain Python list.
Categories = [categoryRow['CATEGORY'] for categoryRow in f.collect()]


In [9]:
# Number of distinct categories collected above (displayed as the cell output).
len(Categories)

837

## Preparation of prediction data

In [10]:
# Features matrix to predict the amount for the Year 2020

# Feature matrix used to predict the amount raised in PREDICTION_YEAR.
PREDICTION_YEAR = 2020

# Build the one-row DataFrame through the `spark` session created at the top
# of the notebook. The previous version went through `sc` and `sqlContext`,
# which this notebook never defines, so it failed on a fresh kernel.
testDF = spark.createDataFrame([Row(YEAR=PREDICTION_YEAR)])

# Reuse the assembler configured for the training data so the prediction row
# gets exactly the same YEAR -> FEATURES layout.
vectorDF = vectorAssembler.transform(testDF).select('FEATURES')

## Forecasting model training for each sector (Linear Regression)

In [11]:
# Fit one linear regression (FEATURES -> TOTAL) per category and summarise
# the models that fit reasonably well.
#
# A category is modelled only when it has strictly more than
# MIN_ROWS_PER_CATEGORY rows in featureDF, and its model is kept only when
# the training R2 is at least MIN_R2.
MIN_ROWS_PER_CATEGORY = 10
MIN_R2 = 0.5

columns = ['Category', 'Gradient', 'Slope', 'Intercept', 'RMSE', 'R2', 'Prediction']
summaryInfo = []

# Count the rows of every category in a single Spark job instead of running a
# separate count() job for each category inside the loop.
# The count column is read as row['count'] because `row.count` would resolve
# to the tuple method of Row rather than the column.
rowsPerCategory = {
    row['CATEGORY']: row['count']
    for row in featureDF.groupBy('CATEGORY').count().collect()
}

# The estimator holds only hyper-parameters, so one instance serves every fit.
lr = LinearRegression(featuresCol='FEATURES', labelCol='TOTAL',
                      maxIter=10, regParam=0.3, elasticNetParam=0.8)

for category in Categories:
    # A null category never matches the equality filter below, so it is
    # skipped here exactly as it was when its filtered count came back as 0.
    if category is None:
        continue
    if rowsPerCategory.get(category, 0) <= MIN_ROWS_PER_CATEGORY:
        continue

    categoryDF = featureDF.filter(featureDF.CATEGORY == category)
    lr_model = lr.fit(categoryDF)
    modelSummary = lr_model.summary

    # Written as a positive `>=` test so that a NaN R2 is rejected too.
    if modelSummary.r2 >= MIN_R2:
        row = (
            category,
            lr_model.coefficients,             # Gradient: full coefficient vector
            float(lr_model.coefficients[0]),   # Slope: the single YEAR coefficient
            lr_model.intercept,
            modelSummary.rootMeanSquaredError,
            modelSummary.r2,
            # Prediction for the single row held in vectorDF.
            lr_model.transform(vectorDF).take(1)[0].prediction,
        )
        summaryInfo.append(row)

summaryDF = spark.createDataFrame(summaryInfo, columns)


In [12]:
# Preview the first five per-category model summaries.
summaryDF.show(n=5)

+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|            Category|            Gradient|               Slope|           Intercept|                RMSE|                R2|          Prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|    Personal Finance|[3.09531844548871...| 3.095318445488712E8|-6.20846579539486...|1.1403333171358914E9|0.5257188832329653|  4.40774644923291E9|
|Reviews and Recom...|[9.38435051053559...| 9.384350510535592E7|-1.88076601008072...| 2.252600279853369E8|0.6748252913534014|1.4872793047460022E9|
|         Health Care|[1.08986421814503...|1.0898642181450326E9|-2.17925088637620...|2.0237455217204065E9| 0.874761105659529|2.227483427676025...|
|Application Perfo...|[1.46386390370929...| 1.463863903709295E8|-2.93690139320376...|4.7040592338191056E8|0.5668998218

## Saving to a persistent table

In [13]:
# Persist the summary as the table "summaryDF"; "overwrite" mode replaces any
# table of that name left behind by an earlier run.
(
    summaryDF.write
    .mode("overwrite")
    .saveAsTable("summaryDF")
)

In [14]:
# Read the saved summary table back, ranked by the predicted amount with the
# largest prediction first. Note that this rebinds both SQLQUERY and sqlDF,
# which earlier cells used for the aggregation query and its result.
SQLQUERY = """
            SELECT *
            FROM summaryDF
            ORDER BY Prediction DESC
            """

sqlDF = spark.sql(sqlQuery=SQLQUERY)
sqlDF.show(n=5)

+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|           Category|            Gradient|               Slope|           Intercept|                RMSE|                R2|          Prediction|
+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|      Biotechnology|[2.26309733431992...| 2.263097334319927E9|-4.52421180397470...| 5.284447122207029E9|0.8320372559570712| 4.72448113515459E10|
|         E-Commerce|[2.16222073950649...| 2.162220739506491E9|-4.32648268498422...|1.159654297718094...|0.5349317056080334|4.120320881888916E10|
|           Software|[1.85595799998956...| 1.855957999989563E9|-3.70932150071992...| 7.801318524147765E9| 0.660434615976291|3.971365925899658E10|
|             Mobile|[1.56890610209523...| 1.568906102095231E9|-3.13735007452775...| 5.523512202221243E9|0.7083421147009048|