In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode,substring, length, udf
from pyspark.sql.types import DecimalType, StringType
from itertools import cycle
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from pyspark.sql import functions as F
from pyspark.sql import types as T

In [3]:
# Create the SparkSession used by every later cell, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [4]:
# Read the investments CSV from HDFS through the SparkSession created above.
# The first line is treated as a header and column types are inferred from the data.
iv = spark.read.csv(
    path="hdfs://localhost:9000/user/lavish/data/investments.csv",
    header='true',
    inferSchema='true',
)

In [5]:
# Year bounds for the analysis.
# NOTE(review): neither value is referenced by the cells visible in this section;
# presumably they bound a later year filter. Confirm before relying on them.
startYear, endYear = 1995, 2015

In [6]:
# Sanity check: the CSV reader should have returned a Spark DataFrame.
type(iv)

pyspark.sql.dataframe.DataFrame

In [7]:
# List the column names picked up from the CSV header.
iv.schema.names

['company_permalink',
 'company_name',
 'company_category_list',
 'company_country_code',
 'company_state_code',
 'company_region',
 'company_city',
 'investor_permalink',
 'investor_name',
 'investor_country_code',
 'investor_state_code',
 'investor_region',
 'investor_city',
 'funding_round_permalink',
 'funding_round_type',
 'funding_round_code',
 'funded_at',
 'raised_amount_usd']

In [8]:
# Keep only the rows that actually report a raised amount.
hasRaisedAmount = iv["raised_amount_usd"].isNotNull()
filteredIV = iv.filter(hasRaisedAmount)

In [9]:
from pyspark.sql.types import DoubleType
# Project the three columns the rest of the notebook works with:
#   raised_amount_usd - passed through unchanged
#   year              - last four characters of funded_at, cast to int
#   categoryArr       - company_category_list split into an array of names
# The split pattern is '|' followed by optional whitespace. Keep the backslash
# (raw string): without it `s*` matches literal 's' characters and would eat a
# leading lowercase "s" from the category that follows a pipe.
splittedCategoryIV = filteredIV.select(
    'raised_amount_usd',
    substring('funded_at', -4, 4).cast('int').alias('year'),
    split(col("company_category_list"), r"[|]\s*").alias("categoryArr"),
)

In [10]:
# Emit one output row per element of categoryArr, so an investment tagged with
# several categories is repeated once for each of them.
categoryCol = explode('categoryArr').alias('category')
explodedIV = splittedCategoryIV.select('raised_amount_usd', 'year', categoryCol)

In [11]:
# Preview the first 10 exploded rows.
explodedIV.show(10)

+-----------------+----+-----------+
|raised_amount_usd|year|   category|
+-----------------+----+-----------+
|        2000000.0|2008|Curated Web|
|          41250.0|2014|      Games|
|            2.0E7|2015|  Analytics|
|        3000000.0|2013|  Analytics|
|            2.0E7|2015|  Analytics|
|        1700000.0|2013|  Analytics|
|        8900000.0|2014|  Analytics|
|            2.0E7|2015|  Analytics|
|            2.0E7|2015|  Analytics|
|        8900000.0|2014|  Analytics|
+-----------------+----+-----------+
only showing top 10 rows



In [12]:
# Register the exploded frame as the temp view "investments" so it can be queried via spark.sql.
explodedIV.createOrReplaceTempView("investments")

In [13]:
# Smoke-test the temp view: read it back through Spark SQL and show the first rows.
# Note that sqlDF is rebound later to the year-wise aggregation result.
sqlDF = spark.sql("SELECT * FROM investments")
sqlDF.show()

+-----------------+----+-----------+
|raised_amount_usd|year|   category|
+-----------------+----+-----------+
|        2000000.0|2008|Curated Web|
|          41250.0|2014|      Games|
|            2.0E7|2015|  Analytics|
|        3000000.0|2013|  Analytics|
|            2.0E7|2015|  Analytics|
|        1700000.0|2013|  Analytics|
|        8900000.0|2014|  Analytics|
|            2.0E7|2015|  Analytics|
|            2.0E7|2015|  Analytics|
|        8900000.0|2014|  Analytics|
|        1700000.0|2013|  Analytics|
|        1700000.0|2013|  Analytics|
|        8900000.0|2014|  Analytics|
|        8900000.0|2014|  Analytics|
|        8900000.0|2014|  Analytics|
|        8900000.0|2014|  Analytics|
|         150000.0|2014|     Mobile|
|        1000050.0|2011|     Mobile|
|        1000050.0|2011|     Mobile|
|         150000.0|2014|     Mobile|
+-----------------+----+-----------+
only showing top 20 rows



### Year Wise

In [14]:
# Year-wise aggregation over the "investments" temp view: one output row per
# (CATEGORY, YEAR) pair.
#   TOTAL     - SUM of RAISED_AMOUNT_USD for the pair
#   TOTAL_DEC - the same sum cast to DECIMAL(30), a whole-number rendering of TOTAL
SQLQUERY =  """
            SELECT CATEGORY, 
            CAST(YEAR AS INT), 
            SUM(RAISED_AMOUNT_USD) AS TOTAL, 
            CAST(SUM(RAISED_AMOUNT_USD) AS DECIMAL(30)) AS TOTAL_DEC 
            FROM INVESTMENTS GROUP 
            BY CATEGORY, YEAR 
            """
# Optional ordering clause, currently not part of the query:
#  ORDER BY YEAR DESC, TOTAL DESC

In [15]:
# Run the year-wise aggregation. This rebinds sqlDF, replacing the earlier SELECT * result.
sqlDF = spark.sql(SQLQUERY)
sqlDF.show()

+--------------------+----+-----------------+----------+
|            CATEGORY|YEAR|            TOTAL| TOTAL_DEC|
+--------------------+----+-----------------+----------+
|      Interest Graph|2011|           3.28E7|  32800000|
|           Insurance|2015|  5.70529580149E9|5705295801|
|  Big Data Analytics|2013|     2.35683979E9|2356839790|
|           Aerospace|2014|4.6013734510098E8| 460137345|
|               Audio|2005|          1.058E8| 105800000|
|Cloud Infrastructure|2010|     1.38360796E8| 138360796|
|    Cloud Management|2010|         5.4528E8| 545280000|
|                Apps|2008|   6.3178257004E8| 631782570|
| Insurance Companies|2015|    1.523851749E7|  15238517|
|    Mobile Analytics|2010|          4.205E7|  42050000|
|          Networking|2009|     5.39435025E8| 539435025|
|               Local|2010|     1.50161942E8| 150161942|
|Information Techn...|2000|     6.25112115E8| 625112115|
|       Home & Garden|2007|         5.0605E8| 506050000|
|        Ad Targeting|2010|    

In [16]:
from pyspark.ml.feature import VectorAssembler

# Pack the YEAR column into a one-element vector column named FEATURES, then keep
# only the columns needed downstream: the category, the feature vector and the total.
vectorAssembler = VectorAssembler(inputCols=['YEAR'], outputCol='FEATURES')
assembledDF = vectorAssembler.transform(sqlDF)
featureDF = assembledDF.select('CATEGORY', 'FEATURES', 'TOTAL')

featureDF.show()

+--------------------+--------+-----------------+
|            CATEGORY|FEATURES|            TOTAL|
+--------------------+--------+-----------------+
|      Interest Graph|[2011.0]|           3.28E7|
|           Insurance|[2015.0]|  5.70529580149E9|
|  Big Data Analytics|[2013.0]|     2.35683979E9|
|           Aerospace|[2014.0]|4.6013734510098E8|
|               Audio|[2005.0]|          1.058E8|
|Cloud Infrastructure|[2010.0]|     1.38360796E8|
|    Cloud Management|[2010.0]|         5.4528E8|
|                Apps|[2008.0]|   6.3178257004E8|
| Insurance Companies|[2015.0]|    1.523851749E7|
|    Mobile Analytics|[2010.0]|          4.205E7|
|          Networking|[2009.0]|     5.39435025E8|
|               Local|[2010.0]|     1.50161942E8|
|Information Techn...|[2000.0]|     6.25112115E8|
|       Home & Garden|[2007.0]|         5.0605E8|
|        Ad Targeting|[2010.0]|       4.191857E8|
|Communications Ha...|[2011.0]|         5.5758E8|
|            Creative|[2012.0]|     1.65391911E8|


In [17]:
# Single-column frame holding each category name exactly once.
f = (
    featureDF
    .select('CATEGORY')
    .distinct()
)


In [18]:
# Pull the distinct category names onto the driver as a plain Python list.
# Despite the name, this is every distinct category in f, not a ranked subset.
topCategories = [categoryRow["CATEGORY"] for categoryRow in f.collect()]


In [19]:
# Number of distinct categories collected into topCategories.
len(topCategories)

837

In [29]:
# Count the categories that have more than 10 rows in featureDF, i.e. more than
# 10 distinct years of data, since featureDF holds one row per (category, year).
# A single grouped aggregation does this in one Spark job instead of running a
# separate filter + count job for every distinct category.
rowsPerCategory = (
    featureDF
    .filter(F.col('CATEGORY').isNotNull())  # a null category never matches an equality filter
    .groupBy('CATEGORY')
    .count()
)
count = rowsPerCategory.filter(F.col('count') > 10).count()

print(count)

388


In [30]:
# Restrict the feature frame to the rows of a single category, 'Software'.
isSoftware = featureDF["CATEGORY"] == 'Software'
categoryDF = featureDF.filter(isSoftware)

In [None]:
# Imported here so the cell runs on a fresh kernel; nothing earlier imports LinearRegression.
from pyspark.ml.regression import LinearRegression

# Elastic-net regularised linear regression of TOTAL on the FEATURES vector.
lr = LinearRegression(
    featuresCol='FEATURES',
    labelCol='TOTAL',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
# Fit on categoryDF, which carries both FEATURES and TOTAL for the selected
# category. The frame `f` only holds the distinct CATEGORY column and cannot be
# used for training.
lr_model = lr.fit(categoryDF)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

In [None]:
# Report goodness-of-fit figures from the fitted model's training summary.
trainingSummary = lr_model.summary
rmse = trainingSummary.rootMeanSquaredError
r2 = trainingSummary.r2
print("RMSE: %f" % rmse)
print("r2: %f" % r2)

In [None]:
# Group the collected rows by category into parallel lists:
#   dict[category] == {'Y': [years...], 'T': [totals...]}
# NOTE(review): the name `dict` shadows the Python builtin; it is kept here
# because cells outside this section may refer to it.
# NOTE(review): topCategoriesDFfrom1995 is not defined in the cells visible in
# this section. Confirm it is created earlier in the run.
dict = {}

for row in topCategoriesDFfrom1995.collect():
    series = dict.setdefault(row.CATEGORY, {'Y': [], 'T': []})
    series['Y'].append(row.YEAR)
    series['T'].append(row.TOTAL)