In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode,substring, length, udf
from pyspark.sql.types import DecimalType, StringType
from itertools import cycle
from pyspark.ml.regression import LinearRegression
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from pyspark.sql import functions as F
from pyspark.sql import types as T

In [None]:
# Build the SparkSession used by every cell below; getOrCreate() returns the
# already-running session if one exists instead of creating a new one.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
# Load the investments CSV from HDFS through the SparkSession created above.
# The first row is treated as the header and column types are inferred.
investmentsPath = "hdfs://localhost:9000/user/lavish/data/investments.csv"
iv = spark.read.csv(investmentsPath, header='true', inferSchema='true')

In [None]:
# Year bounds for the analysis window.
# NOTE(review): neither value is referenced by any other cell visible in this notebook.
startYear, endYear = 1995, 2015

In [None]:
# Display the Python type of the object returned by spark.read.csv.
type(iv)

In [None]:
# List the column names of the loaded investments frame.
iv.columns

In [None]:
# Keep only the rows that actually carry a raised amount.
hasRaisedAmount = col('raised_amount_usd').isNotNull()
filteredIV = iv.where(hasRaisedAmount)

In [None]:
from pyspark.sql.types import DoubleType

# Project the three things the analysis needs:
#   * raised_amount_usd - passed through unchanged
#   * year              - last four characters of funded_at, cast to int
#                         (assumes funded_at ends with a 4-digit year - TODO confirm
#                         against the type/format inferSchema produced for this column)
#   * categoryArr       - company_category_list split on '|' into an array
#
# The split pattern is a regex: a literal pipe followed by optional whitespace.
# The previous pattern "[|]s*" matched literal 's' characters after the pipe,
# which stripped leading "s" letters from category names.
splittedCategoryIV = filteredIV.select(
    'raised_amount_usd',
    substring('funded_at', -4, 4).cast('int').alias('year'),
    split(col("company_category_list"), r"[|]\s*").alias("categoryArr"),
)

In [None]:
# Turn each element of categoryArr into its own row, so an investment tagged
# with several categories is counted once under each of them.
explodedIV = splittedCategoryIV.select(
    col('raised_amount_usd'),
    col('year'),
    explode(col('categoryArr')).alias('category'),
)

In [None]:
# Preview the first ten exploded rows.
explodedIV.show(n=10)

In [None]:
# Register the exploded frame as the temp view "investments" so it can be
# queried through spark.sql.
explodedIV.createOrReplaceTempView("investments")

In [None]:
# Sanity check: read the temp view back through SQL and show a few rows.
previewQuery = "SELECT * FROM investments"
sqlDF = spark.sql(previewQuery)
sqlDF.show(n=5)

### Year-wise totals per category

In [None]:
# Aggregation query over the "investments" view: one row per (category, year).
# TOTAL is the plain SUM of raised_amount_usd; TOTAL_DEC is the same sum cast
# to DECIMAL(30).
SQLQUERY =  """
            SELECT CATEGORY, 
            CAST(YEAR AS INT), 
            SUM(RAISED_AMOUNT_USD) AS TOTAL, 
            CAST(SUM(RAISED_AMOUNT_USD) AS DECIMAL(30)) AS TOTAL_DEC 
            FROM INVESTMENTS GROUP 
            BY CATEGORY, YEAR 
            """
# Optional ordering, currently disabled: ORDER BY YEAR DESC, TOTAL DESC

In [None]:
# Run the aggregation query. Note this rebinds sqlDF, which previously held
# the "SELECT * FROM investments" preview.
sqlDF = spark.sql(SQLQUERY)
sqlDF.show(5)

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the YEAR column into the single-element vector column FEATURES that the
# regression consumes, then keep only the columns needed for model fitting.
vectorAssembler = VectorAssembler(inputCols=['YEAR'], outputCol='FEATURES')
assembledDF = vectorAssembler.transform(sqlDF)
featureDF = assembledDF.select('CATEGORY', 'FEATURES', 'TOTAL')

featureDF.show(n=5)

In [None]:
# One row per distinct category present in the feature frame.
f = featureDF.select(col('CATEGORY')).distinct()


In [None]:
# Pull the distinct category names to the driver as a plain Python list.
# Despite the name, this holds every distinct category, not a ranked subset.
categoryRows = f.collect()
topCategories = [categoryRow['CATEGORY'] for categoryRow in categoryRows]


In [None]:
# Number of distinct categories collected into topCategories.
len(topCategories)

In [None]:
# Features matrix used to predict the raised amount for the year 2020.
#
# The frame is built directly from the notebook's `spark` session. The earlier
# RDD-based version relied on `sc`, `Row` and `sqlContext`, none of which are
# defined or imported anywhere in this notebook, so it raised NameError on a
# fresh kernel.
predictionYear = 2020

l = [(predictionYear,)]
testDF = spark.createDataFrame(l, ['YEAR'])

# Same assembler configuration as the training features: YEAR -> FEATURES.
vectorAssembler = VectorAssembler(inputCols = ['YEAR'], outputCol = 'FEATURES')
vectorDF = vectorAssembler.transform(testDF).select('FEATURES')

In [None]:
# Fit one linear regression (FEATURES = year -> TOTAL) per category and record
# the fits whose R2 reaches the threshold, together with the prediction for the
# single row in vectorDF.
minRowsExclusive = 10   # a category is modelled only if it has MORE than this many rows
minR2 = .5              # fits with a lower R2 are discarded

# featureDF is filtered, counted and fitted once per category; cache it so its
# upstream lineage is not recomputed on every iteration.
featureDF.cache()

# The estimator settings are identical for every category, so build it once.
lr = LinearRegression(featuresCol='FEATURES', labelCol='TOTAL',
                      maxIter=10, regParam=0.3, elasticNetParam=0.8)

statInfo = {}
for category in topCategories:
    categoryDF = featureDF.filter(featureDF.CATEGORY == category)
    if categoryDF.count() <= minRowsExclusive:
        continue

    lr_model = lr.fit(categoryDF)
    summary = lr_model.summary
    if summary.r2 < minR2:
        continue

    # NOTE(review): the key 'Pediction' is misspelled; it is kept as-is so any
    # consumer reading statInfo by key keeps working - rename deliberately.
    statInfo[category] = {
        'Gradient': str(lr_model.coefficients),
        'Intercept': str(lr_model.intercept),
        'RMSE': str(summary.rootMeanSquaredError),
        'R2': str(summary.r2),
        'Pediction': lr_model.transform(vectorDF).first().prediction,
    }

print(statInfo)

In [None]:
# Display the per-category regression results collected in statInfo.
# (The cell previously referenced the undefined name `startInfo`.)
statInfo