In [1]:
import random

import numpy as np
import pandas as pd
from hops import featurestore
from pyspark.sql import Row
from pyspark.sql import functions as F

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
8,application_1561468620886_0010,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# Generate a synthetic "houses sold" data set: for every area, draw a fixed
# number of purchases whose price and bidder count are exponentially
# distributed and scaled down by the area id.
HOUSES_SOLD_SEED = 42  # fixed seed so Restart & Run All reproduces the same sample
HOUSES_SOLD_PER_AREA = 999  # same number of rows per area as range(1, 1000) produced
houses_sold_rng = np.random.RandomState(HOUSES_SOLD_SEED)

area_ids = list(range(1, 51))
house_purchased_amounts = []
house_purchases_bidders = []
house_purchases_area_ids = []
for area_id in area_ids:
    for purchase_index in range(HOUSES_SOLD_PER_AREA):
        # Exponential draws are never negative, so no abs() is required.
        house_purchased_amounts.append(houses_sold_rng.exponential() * 100000 / area_id)
        # int() truncates the scaled draw to a whole number of bidders.
        house_purchases_bidders.append(int(houses_sold_rng.exponential() * 10 / area_id))
        house_purchases_area_ids.append(area_id)

# One sequential id per generated purchase, starting at 0.
house_purchase_ids = list(range(len(house_purchases_bidders)))

houses_sold_data = pd.DataFrame({
    'area_id': house_purchases_area_ids,
    'house_purchase_id': house_purchase_ids,
    'number_of_bidders': house_purchases_bidders,
    'sold_for_amount': house_purchased_amounts,
})
# Hand the pandas frame over to Spark for the aggregation steps that follow.
houses_sold_data_spark_df = sqlContext.createDataFrame(houses_sold_data)

In [3]:
# Preview the first five rows of the generated purchases as a sanity check.
houses_sold_data_spark_df.show(5)

+-------+-----------------+-----------------+------------------+
|area_id|house_purchase_id|number_of_bidders|   sold_for_amount|
+-------+-----------------+-----------------+------------------+
|      1|                0|                0|22840.136793738166|
|      1|                1|                8| 78309.64488345955|
|      1|                2|                3|16265.972879369778|
|      1|                3|               11|17829.282600311155|
|      1|                4|                8| 36971.39422706205|
+-------+-----------------+-----------------+------------------+
only showing top 5 rows

In [4]:
# Print the column names and types of the generated purchases DataFrame.
houses_sold_data_spark_df.printSchema()

root
 |-- area_id: long (nullable = true)
 |-- house_purchase_id: long (nullable = true)
 |-- number_of_bidders: long (nullable = true)
 |-- sold_for_amount: double (nullable = true)

In [5]:
# Aggregate the purchases per area in a single pass. Only the two columns that
# are meaningful to sum are aggregated (a bare groupBy().sum() would also sum
# house_purchase_id), and the row count comes from the same aggregation
# instead of a separate count() followed by a join.
sum_count_houses_sold_df = houses_sold_data_spark_df.groupBy("area_id").agg(
    F.sum("number_of_bidders").alias("sum_number_of_bidders"),
    F.sum("sold_for_amount").alias("sum_sold_for_amount"),
    F.count("*").alias("num_rows"),
)

# Derive the per-area averages with column expressions so the work stays in
# the DataFrame API rather than going through a Python RDD map. The explicit
# select fixes the column order (area_id, averages, sums) instead of leaving
# it to how Row(**kwargs) orders its fields.
houses_sold_features_df = sum_count_houses_sold_df.select(
    "area_id",
    (F.col("sum_number_of_bidders") / F.col("num_rows")).alias("avg_num_bidders"),
    (F.col("sum_sold_for_amount") / F.col("num_rows")).alias("avg_sold_for"),
    "sum_number_of_bidders",
    "sum_sold_for_amount",
)

In [6]:
# Preview five rows of the per-area aggregate features.
houses_sold_features_df.show(5)

+-------+--------------------+------------------+---------------------+--------------------+
|area_id|     avg_num_bidders|      avg_sold_for|sum_number_of_bidders| sum_sold_for_amount|
+-------+--------------------+------------------+---------------------+--------------------+
|     26| 0.07707707707707707|4012.4700950228616|                   77|   4008457.624927839|
|     29|0.062062062062062065| 3472.587671736636|                   62|  3469115.0840648995|
|     19|  0.1871871871871872| 5143.836954980383|                  187|   5138693.118025403|
|     22| 0.13113113113113112| 4310.862103491131|                  131|    4306551.24138764|
|      7|   0.986986986986987|13948.184284438423|                  986|1.3934236100153985E7|
+-------+--------------------+------------------+---------------------+--------------------+
only showing top 5 rows

In [7]:
# Print the column names and types of the per-area feature DataFrame.
houses_sold_features_df.printSchema()

root
 |-- area_id: long (nullable = true)
 |-- avg_num_bidders: double (nullable = true)
 |-- avg_sold_for: double (nullable = true)
 |-- sum_number_of_bidders: long (nullable = true)
 |-- sum_sold_for_amount: double (nullable = true)

In [8]:
# Register the per-area aggregates in the feature store under the name
# "houses_sold_featuregroup". Every optional statistics flag is passed as
# False.
featuregroup_statistics_flags = {
    "descriptive_statistics": False,
    "feature_correlation": False,
    "feature_histograms": False,
    "cluster_analysis": False,
}
featurestore.create_featuregroup(
    houses_sold_features_df,
    "houses_sold_featuregroup",
    description="aggregate features of sold houses per area",
    **featuregroup_statistics_flags
)

Running sql: use demo_deep_learning_admin000_featurestore
Feature group created successfully