In [1]:
import numpy as np
import random
import pandas as pd
from pyspark.sql import Row
from hops import featurestore

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
7,application_1561468620886_0009,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# Synthetic "houses for sale" data set: 50 areas (ids 1..50), 99 houses each.
# The draws come from a dedicated, seeded generator so that Restart & Run All
# reproduces the same data instead of a fresh random sample on every run.
RANDOM_SEED = 42
rng = np.random.RandomState(RANDOM_SEED)

area_ids = list(range(1, 51))
house_sizes = []
house_worths = []
house_ages = []
house_area_ids = []
for i in area_ids:
    # range(1, 100) yields 99 houses per area; the index itself is not needed.
    for _house in range(1, 100):
        # Each value is |standard normal draw| * scale, divided by the area id,
        # so areas with a higher id get proportionally smaller values.
        house_sizes.append(abs(rng.normal() * 1000) / i)
        house_worths.append(abs(rng.normal() * 10000) / i)
        house_ages.append(abs(rng.normal() * 10000) / i)
        house_area_ids.append(i)
# One sequential id per generated house (0 .. number of houses - 1).
house_ids = list(range(len(house_area_ids)))
houses_for_sale_data = pd.DataFrame({
    'area_id': house_area_ids,
    'house_id': house_ids,
    'house_worth': house_worths,
    'house_age': house_ages,
    'house_size': house_sizes
})
# Hand the pandas frame over to Spark for the aggregation steps that follow.
houses_for_sale_data_spark_df = sqlContext.createDataFrame(houses_for_sale_data)

In [3]:
# Preview the first five rows of the generated houses-for-sale Spark DataFrame.
houses_for_sale_data_spark_df.show(5)

+-------+--------+------------------+------------------+------------------+
|area_id|house_id|       house_worth|         house_age|        house_size|
+-------+--------+------------------+------------------+------------------+
|      1|       0|3601.8802099057352|  7795.06845896484|456.58545181313565|
|      1|       1| 8263.477857564954|  9391.53780819243|1679.5500736138883|
|      1|       2| 7441.333488202338| 142.7748498258131| 549.3300029403817|
|      1|       3| 5953.851415439113| 97.83374519792382|2435.0608407543477|
|      1|       4| 12791.24528646675|1086.1390807961527| 741.7169220300896|
+-------+--------+------------------+------------------+------------------+
only showing top 5 rows

In [4]:
# Print the column names, types and nullability of the generated Spark DataFrame.
houses_for_sale_data_spark_df.printSchema()

root
 |-- area_id: long (nullable = true)
 |-- house_id: long (nullable = true)
 |-- house_worth: double (nullable = true)
 |-- house_age: double (nullable = true)
 |-- house_size: double (nullable = true)

In [5]:
# Per-area totals of every numeric column, and per-area row counts.
sum_houses_for_sale_df = houses_for_sale_data_spark_df.groupBy("area_id").sum()
count_houses_for_sale_df = houses_for_sale_data_spark_df.groupBy("area_id").count()

# Combine both aggregates on the area key, then replace the generated
# "sum(...)" / "count" column names with plain feature names.
sum_count_houses_for_sale_df = (
    sum_houses_for_sale_df
    .join(count_houses_for_sale_df, on="area_id")
    .withColumnRenamed("sum(house_age)", "sum_house_age")
    .withColumnRenamed("sum(house_worth)", "sum_house_worth")
    .withColumnRenamed("sum(house_size)", "sum_house_size")
    .withColumnRenamed("count", "num_rows")
)
def compute_average_features_house_for_sale(row):
    """Turn one aggregated per-area row into the final feature row.

    Args:
        row: object exposing ``area_id``, ``num_rows`` and the
            ``sum_house_worth`` / ``sum_house_size`` / ``sum_house_age``
            attributes.

    Returns:
        A ``Row`` holding the area id, the three sums unchanged, and each sum
        divided by the row count as the matching ``avg_house_*`` field.
    """
    # Convert the count to float once, before any of the divisions.
    house_count = float(row.num_rows)
    features = {
        "sum_house_worth": row.sum_house_worth,
        "sum_house_age": row.sum_house_age,
        "sum_house_size": row.sum_house_size,
        "area_id": row.area_id,
    }
    features["avg_house_worth"] = features["sum_house_worth"] / house_count
    features["avg_house_size"] = features["sum_house_size"] / house_count
    features["avg_house_age"] = features["sum_house_age"] / house_count
    return Row(**features)
# Derive the per-area averages with native column expressions rather than
# sending every row through Python with rdd.map(...).toDF(). This yields the
# same values as mapping compute_average_features_house_for_sale over the rows,
# but needs no schema inference and fixes the column order explicitly.
# The select list is the seven columns of the feature frame, in the order shown
# by its printSchema() output: area_id, the three averages, then the three sums.
houses_for_sale_features_df = sum_count_houses_for_sale_df.select(
    "area_id",
    (
        sum_count_houses_for_sale_df["sum_house_age"]
        / sum_count_houses_for_sale_df["num_rows"]
    ).alias("avg_house_age"),
    (
        sum_count_houses_for_sale_df["sum_house_size"]
        / sum_count_houses_for_sale_df["num_rows"]
    ).alias("avg_house_size"),
    (
        sum_count_houses_for_sale_df["sum_house_worth"]
        / sum_count_houses_for_sale_df["num_rows"]
    ).alias("avg_house_worth"),
    "sum_house_age",
    "sum_house_size",
    "sum_house_worth",
)

In [6]:
# Preview the first five rows of the per-area feature DataFrame.
houses_for_sale_features_df.show(5)

+-------+------------------+------------------+------------------+------------------+------------------+------------------+
|area_id|     avg_house_age|    avg_house_size|   avg_house_worth|     sum_house_age|    sum_house_size|   sum_house_worth|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+
|     26| 333.1296441436684| 30.61181421799333|299.42218274902274| 32979.83477022317|  3030.56960758134| 29642.79609215325|
|     29|300.66514391677435|  28.3565826537405|265.27163263740397| 29765.84924776066|2807.3016827203096|26261.891631102993|
|     19| 539.9744360853701| 42.61610624706339| 461.2294529010087|53457.469172451645| 4218.994518459276| 45661.71583719986|
|     22|355.27561487081687| 37.87464176584263|377.56902176073646| 35172.28587221087|3749.5895348184204| 37379.33315431291|
|      7|1179.1468181061875|106.41616578898419|1140.3828069179945|116735.53499251256|10535.200413109435|112897.89788488146|
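+-------+------------------+------------------+------------------+------------------+------------------+------------------+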

In [7]:
# Print the column names, types and nullability of the per-area feature DataFrame.
houses_for_sale_features_df.printSchema()

root
 |-- area_id: long (nullable = true)
 |-- avg_house_age: double (nullable = true)
 |-- avg_house_size: double (nullable = true)
 |-- avg_house_worth: double (nullable = true)
 |-- sum_house_age: double (nullable = true)
 |-- sum_house_size: double (nullable = true)
 |-- sum_house_worth: double (nullable = true)

In [8]:
# Register the per-area feature DataFrame in the feature store under the name
# "houses_for_sale_featuregroup". The four statistics-related options are all
# passed as False here.
# NOTE(review): presumably this skips computing those statistics at creation
# time; confirm against the hops featurestore documentation.
featurestore.create_featuregroup(
    houses_for_sale_features_df,
    "houses_for_sale_featuregroup",
    description="aggregate features of houses for sale per area",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False
)

Running sql: use demo_deep_learning_admin000_featurestore
Feature group created successfully