In [205]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode,substring, length
from pyspark.sql.types import DecimalType

import matplotlib.pyplot as plt
%matplotlib inline

In [206]:
# Build (or reuse, if one already exists) the SparkSession used by every
# later cell. The app name and config option are placeholder values.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [207]:
# Load the raw investments CSV from HDFS using the `spark` session created
# above. The first line of the file is treated as the header, and column
# types are inferred from the data.
iv = (
    spark.read
    .option("header", 'true')
    .option("inferSchema", 'true')
    .csv("hdfs://localhost:9000/user/lavish/data/investments.csv")
)

In [208]:
# Sanity check: confirm the reader returned a Spark DataFrame.
type(iv)

pyspark.sql.dataframe.DataFrame

In [209]:
# List the column names of the loaded investments data.
iv.schema.names

['company_permalink',
 'company_name',
 'company_category_list',
 'company_country_code',
 'company_state_code',
 'company_region',
 'company_city',
 'investor_permalink',
 'investor_name',
 'investor_country_code',
 'investor_state_code',
 'investor_region',
 'investor_city',
 'funding_round_permalink',
 'funding_round_type',
 'funding_round_code',
 'funded_at',
 'raised_amount_usd']

In [210]:
# Keep only the rows that have a raised amount; rows where
# raised_amount_usd is null are dropped here.
filteredIV = iv.filter(
    iv["raised_amount_usd"].isNotNull()
)

In [211]:
# Project the columns needed for the per-category / per-year aggregation:
#   - raised_amount_usd : passed through unchanged
#   - year              : the last four characters of funded_at
#                         (assumes the date string ends in a 4-digit year -- TODO confirm)
#   - categoryArr       : company_category_list split on '|' into an array
#
# split() takes a Java regular expression. The pattern must be "[|]\s*"
# (a pipe followed by optional whitespace). Without the backslash, "s*"
# matches literal 's' characters and silently strips a leading "s" from
# category names (e.g. "...|social" would yield "ocial").
splittedCategoryIV = filteredIV.select(
    'raised_amount_usd',
    substring('funded_at', -4, 4).alias('year'),
    split(col("company_category_list"), r"[|]\s*").alias("categoryArr"),
)

In [212]:
# Flatten the category array: explode() produces one output row per
# element of categoryArr, so each row carries one amount, year and category.
explodedIV = splittedCategoryIV.select(
    'raised_amount_usd',
    'year',
    explode('categoryArr').alias('category'),
)

In [None]:
# Preview the first 50 exploded rows.
explodedIV.show(n=50)

In [None]:
# Register the exploded DataFrame as the temp view "investments" so that it
# can be queried by name through spark.sql().
explodedIV.createOrReplaceTempView("investments")

In [None]:
# Sanity check that the "investments" view is queryable through Spark SQL.
sqlDF = spark.sql(
    "SELECT * FROM investments"
)
sqlDF.show()

In [None]:
# Aggregate query over the "investments" view: total raised amount per
# (category, year), ordered by year (newest first) and then by total
# (largest first). TOTAL_DEC is the same sum cast to DECIMAL(30)
# (presumably to get a non-exponent representation of large sums -- confirm).
SQLQUERY =  """
            SELECT CATEGORY, YEAR, SUM(RAISED_AMOUNT_USD) AS TOTAL, 
            CAST(SUM(RAISED_AMOUNT_USD) AS DECIMAL(30)) AS TOTAL_DEC 
            FROM INVESTMENTS GROUP 
            BY CATEGORY, YEAR 
            ORDER BY YEAR DESC, TOTAL DESC
            """


In [None]:
# Run the per-category / per-year aggregation. This rebinds sqlDF, which
# previously held the plain "SELECT *" preview of the view.
sqlDF = spark.sql(SQLQUERY)

In [None]:
# Preview the aggregated totals.
sqlDF.show()

In [None]:
# Number of rows in the aggregate, i.e. the number of (category, year) groups.
sqlDF.count()

In [None]:
# Persist the aggregated totals to HDFS as CSV with a header row,
# replacing any output left by a previous run.
(
    sqlDF.write
    .mode('overwrite')
    .option("header", "true")
    .csv('hdfs://localhost:9000/user/lavish/data/topcategories.csv')
)

In [None]:
# Read the CSV output written above back from HDFS to verify it, using the
# header row for column names and inferring the column types.
read = (
    spark.read
    .option("header", 'true')
    .option("inferSchema", 'true')
    .csv("hdfs://localhost:9000/user/lavish/data/topcategories.csv")
)

In [None]:
# Preview the first 100 rows of the re-loaded aggregate.
read.show(n=100)