In [0]:
#@ install Spark and dependency
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark
!pip install folium



In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

In [0]:
import findspark
findspark.init("spark-2.4.4-bin-hadoop2.7")# SPARK_HOME
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Google Drive Mount

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


Dataset Path

---

/content/drive/Shared drives/718_project/dataset

In [0]:
cd /content/drive/Shared drives/718_project/dataset

/content/drive/Shared drives/718_project/dataset


In [0]:
ls

calendar.csv       listings.csv  [0m[01;34mspark-2.4.4-bin-hadoop2.7[0m/
listing_clean.csv  reviews.csv   spark-2.4.4-bin-hadoop2.7.tgz


Data Path

---
/content/drive/Shared drives/718_project/dataset/listings.csv


Load Module

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.functions import isnan, isnull, when, count, col
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from  pyspark.sql.functions import regexp_replace

Create data repository decorator

In [0]:
import csv
import pandas as pd

from collections.abc import Iterable
import pyspark.sql.functions as fn

def train_test_dataset_repository(path, target, features=None):
  param_features = features
  if target is None:
    raise AttributeError("Parameter 'target' missing! ")
  def build_dataset(func):
    def wrapper_func(self):
      # Raw data
      raw_data = spark.read\
        .option("header", "true")\
        .option("multiLine", "true")\
        .option('inferSchema', 'true')\
        .option('escape', '"')\
        .csv(path)
      columns = raw_data.schema.names

      # Select features
      features = param_features
      if features is None:
        features = columns
      if not isinstance(features, Iterable):
        features = [features]
      features.remove(target)

      # Recording raw_data
      self._raw_data = raw_data
      # Recording configuration
      self._config = dict({
          'path': path,
          'features': features,
          'target': target,
      })

      # Build X, Y
      return reset_dataset_from_raw(func)(self)
    return wrapper_func
  return build_dataset

def reset_dataset_from_raw(func):
  def wrapper_func(self):
    raw_data = self._raw_data
    # Restore config
    features = self._config['features']
    target = self._config['target']

    
    X = raw_data.select([fn.col(col_name) for col_name in features])
    Y = raw_data.select(fn.col(target))
    self._X = X
    self._Y = Y

    train_data, test_data = raw_data.randomSplit([0.7, 0.3])
    self._train_X = train_data.select([fn.col(col_name) for col_name in features])
    self._train_Y = train_data.select(fn.col(target))
    self._test_X = test_data.select([fn.col(col_name) for col_name in features])
    self._test_Y = test_data.select(fn.col(target))

    return func(self)
  return wrapper_func



Create data object, fill the raw/train/test data.

In [0]:
def warning(msg):
  flag = True
  def inject(func):
    def wrapper_func(self):
      x = 'Y'
      if flag:
        x = input(f"{func.__name__}是{msg}，确定要用？(Y/N)")
      if 'Y' == x:
        return func(self)
      raise PermissionError(f"{func.__name__}是{msg}，不能随便用！")
    return wrapper_func
  return inject

class Data:
  @train_test_dataset_repository('/content/drive/Shared drives/718_project/dataset/listing_clean.csv', 'price')
  def __init__(self):
    pass
  @property
  @warning("原始未分割数据")
  def raw_data(self):
    return self._raw_data
  @property
  @warning("原始自变量")
  def X(self):
    return self._X
  @property
  @warning("原始应变量")
  def Y(self):
    return self._Y
  @property
  def train_X(self):
    return self._train_X
  @property
  def train_Y(self):
    return self._train_Y
  @property
  @warning("测试自变量")
  def test_X(self):
    return self._test_X
  @property
  @warning("测试应变量")
  def test_Y(self):
    return self._test_Y



# EDA and Data Imputation

We will explore the whole dataset during the EDA. 

In [0]:
from pyspark.sql.dataframe import DataFrame

df = Data().raw_data

raw_data是原始未分割数据，确定要用？(Y/N)Y


In [0]:
# 数据信息查看
df_pd = df.toPandas()
df_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7628 entries, 0 to 7627
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   price   7628 non-null   float32
 1   price   7628 non-null   float32
dtypes: float32(2)
memory usage: 59.7 KB


## Numerical Data Cleaning

### Convert values in columns from string to number

- Display the type of each column

In [0]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- experiences_offered: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: timestamp (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: double (nullable = true)
 |-- host_is_superhost: integer (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_listings_count: integer (nullable = true)
 |-- host_total_listings_count: integer (nullable = true)
 |-- host_verifications: string (nullable = true)
 |-- host_identity_verified: integer (nullable = true)
 |-- street: string (nullable = tr

- Select boolean columns

In [0]:
bool_columns = ['host_is_superhost', 
                    'host_has_profile_pic', 
                    'host_identity_verified', 
                    'is_location_exact', 
                    'has_availability', 
                    'requires_license', 
                    'instant_bookable', 
                    'is_business_travel_ready', 
                    'require_guest_profile_picture', 
                    'require_guest_phone_verification']
df_bool = df.select(bool_columns)
df_bool.limit(5).toPandas().T

AnalysisException: ignored

- Replace t to 1, and f to 0 respectively

In [0]:
from pyspark.sql import functions as fn
from pyspark.sql import types as t

bool_dict = {'t': 1, 'f': 0}
def bool_map(x):
  if x in bool_dict.keys():
    return bool_dict[x]
  return x

bool_encode_udf = fn.udf(bool_map, t.IntegerType())

for col_name in bool_columns:
  df = df.withColumn(col_name, bool_encode_udf(fn.col(col_name)))

df.select([fn.col(col_name) for col_name in bool_columns]).limit(5).toPandas().T


AnalysisException: ignored

### Convert values in columns from formatted string to number

- Select price formatted columns

In [0]:
price_columns = ['extra_people', 
                 'price']
df_price = df.select(price_columns)
df_price.limit(5).toPandas().T

Unnamed: 0,0,1,2,3,4
extra_people,$25.00,$5.00,$10.00,$15.00,$15.00
price,$296.00,$48.00,$90.00,$62.00,$99.00


 - Reformat the currency formattet to number

In [0]:
for col_name in price_columns:
  df = df.withColumn(col_name, fn.regexp_replace(fn.col(col_name), "\$|," , '' ))

df.select([fn.col(col_name) for col_name in price_columns]).limit(5).toPandas().T

Unnamed: 0,0,1,2,3,4
extra_people,25.0,5.0,10.0,15.0,15.0
price,296.0,48.0,90.0,62.0,99.0


In [0]:
#Cth's code 将price 从string type 转变为 numeric type
df = df.withColumn('price', df.price.cast('float'))

In [0]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- experiences_offered: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: timestamp (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: double (nullable = true)
 |-- host_is_superhost: integer (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_listings_count: integer (nullable = true)
 |-- host_total_listings_count: integer (nullable = true)
 |-- host_verifications: string (nullable = true)
 |-- host_identity_verified: integer (nullable = true)
 |-- street: string (nullable = tr

## Dummy Variable

## Zero Variance or low Variance Variables Removal

In [0]:
#CTH's code
drop_list_low_variance = []
def low_variance(col,df):
  var = df.agg({col : 'variance'})
  if var.collect()[0][0] < 0.005:
    drop_list_low_variance.append(col)
    return var.show()
for i in bool_columns:
    low_variance(i,df)
drop_list_low_variance

In [0]:
#var = df.agg({'is_business_travel_ready': 'variance'}).show()
#df.drop('is_business_travel_ready')

#df.agg({'host_has_profile_pic' : 'variance'}).show()
#df.drop('host_has_profile_pic')

## Missing value exploratory

- Identify the number of missing value at each feature. 

In [0]:
def calc_missing_ratio(data_frame: DataFrame):
  record_cnt = data_frame.count()
  df_columns = data_frame.columns
  
  df_result = data_frame.select([fn.col(c).cast(t.StringType()) for c in data_frame.columns]) \
    .select([fn.sum(fn.when(fn.isnull(c), 1).otherwise(0)).alias(c) for c in data_frame.columns]) \
    .select([(col(c)/record_cnt).alias(c) for c in data_frame.columns]) \
    .toPandas().T

  df_result = df_result.loc[(df_result != 0).all(axis=1), :]
  df_result.columns = ['Missing Value Ratio']
  df_result.sort_values(by=['Missing Value Ratio'], ascending=False, inplace=True)

  return df_result

missing_ratio = calc_missing_ratio(df)

print(missing_ratio)

- Drop columns that doesn't have value at all

In [0]:
def shape(data_frame: DataFrame):
  return (data_frame.count(), len(data_frame.columns))

empty_column = list(missing_ratio.loc[(missing_ratio == 1).all(axis=1), :].index)

df = df.drop(*empty_column)
shape(df)

- Visualize the ratio of missing values using bar plot.<br>
We will focus on the columns that has more than 2% of missing values.

In [0]:
import math
from matplotlib import gridspec
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import matplotlib.ticker as mtick


def plot_missing_freq(df, xlab, ylab, title):
  plt.clf()
  plt.figure(figsize=(20, 5))
  df = pd.Series(df.iloc[:, 0].values, index=list(df.index))

  ax = sns.barplot(x=df.values * 100, y=df.index, orient='h')
  
  ax.set_xticklabels(df.values * 100, fontsize=15)
  ax.set_yticklabels(df.index, fontsize=12)
  ax.xaxis.set_major_formatter(mtick.PercentFormatter())

  ax.set_xlabel(xlab, fontsize=15)
  ax.set_ylabel(ylab, fontsize=35)

  plt.title(title, fontsize=20)
  # for bar in ax.patches:
  #   bar.set_height(30)

  display()

missing_ratio_2 = calc_missing_ratio(df)
missing_ratio_2 = missing_ratio_2.loc[(missing_ratio_2 > 0.02).all(axis=1), :]
plot_missing_freq(missing_ratio_2, 'Feature', 'Missing Value Ratio', 'Airbnb Data Set Missing Value')

In [0]:
#Mark
pred_list = ['id','host_is_superhost','neighbourhood_group_cleansed', 'property_type', 
             'room_type', 'latitude', 'longitude', 'guests_included', 'bathrooms', 
             'bedrooms', 'beds', 'bed_type', 'amenities', 'price', 'cleaning_fee', 
             'instant_bookable', 'cancellation_policy']

In [0]:
df.select(pred_list).show()

In [0]:
pred_df = df.select(pred_list)
pred_df  = pred_df.na.drop()
pred_df.toPandas()

In [0]:
#Dummy Variable
import pyspark.sql.functions as F 

dummy_list = ['neighbourhood_group_cleansed',
       'property_type', 'room_type', 'bed_type','cancellation_policy']

def dummy_convert(dummy_list,df):
    for i in dummy_list:
        categ = df.select(i).distinct().rdd.flatMap(lambda x:x).collect()
        exprs = [F.when(F.col(i) == j,1).otherwise(0).alias(str(j)) for j in categ]
        df = df.select(exprs+df.columns)
    return df

dummy_convert(dummy_list,pred_df).toPandas()
    

In [0]:
#CTH's code 将一些字符串形式的数字中的N/A 用平均数代替 
df = df.withColumn('host_response_rate', regexp_replace('host_response_rate', '%', ''))
df = df.withColumn('host_response_rate', df['host_response_rate'].cast("double"))
mean = df.agg({'host_response_rate': 'mean'}).collect()[0][0]
df = df.fillna({'host_response_rate': mean})

In [0]:
# CTH's code
# drop_list1是一些所有行或者大部分行都缺失的列和一些无用的列
drop_list1 = ['license', 'scrape_id','last_scraped',
                ,'host_acceptance_rate','thumbnail_url','medium_url','xl_picture_url'
                ,'square_feet','weekly_price','monthly_price'
                ,'host_neighbourhood','zipcode','security_deposit'
                , 'cleaning_fee','summary',]

# drop_list2中的列有大量缺失值,都是文字性信息
drop_list2 = ['notes','access','transit','space','neighborhood_overview','interaction','house_rules','host_about']

# drop_list_low_variance = ['host_has_profile_pic','has_availability','requires_license','is_business_travel_ready']
# low variance 关于boolean的四个要删除的columns

df = df.drop(*drop_list1)
df = df.drop(*drop_list2)
df = df.drop(*drop_list_low_variance)
print(df.count(),len(df.columns))

In [0]:
# CTH's code
#查看drop部分columns以后NA的情况
i = 0
j = 10
for k in range(9):
    df.select([count(when(isnull(c), c)).alias(c) for c in df.columns[i:j]]).show()
    i +=10
    j +=10

In [0]:
# CTH's code
# subset中的列缺失值大于600个， drop掉这12列中缺失值数量大于等于7个的列（也就是超过一半都是NA的行）
df = df.dropna(subset=["first_review","last_review","review_scores_rating","review_scores_accuracy"
                    ,"review_scores_cleanliness","review_scores_checkin","review_scores_communication"
                   ,"review_scores_location","review_scores_value"
                    ],thresh=5)
print(df.count(),len(df.columns))

In [0]:
# CTH's code
#查看第二次drop后NA的情况
i = 0
j = 10
for k in range(8):
    df.select([count(when(isnull(c), c)).alias(c) for c in df.columns[i:j]]).show()
    i +=10
    j +=10

In [0]:
# CTH's code
# 第三次drop 将一些没有很少量缺失的值删除，得到一个干净的数据集
df = df.dropna()

In [0]:
# CTH's code
#查看第三次drop后NA的情况
i = 0
j = 10
for k in range(8):
    df.select([count(when(isnull(c), c)).alias(c) for c in df.columns[i:j]]).show()
    i +=10
    j +=10
print(df.count(),len(df.columns))

In [0]:
# CTH's code  将清理后的输入以csv格式存储在指定的地址中
df_pd = df.toPandas()

In [0]:
#将清洗后的数据保存到指定路径中
df_pd.to_csv("/content/drive/Shared drives/718_project/dataset/listing_clean.csv")

我觉得dummy variable 的建立用010101创建很多列效率不如StringIndex高
建议为有需要用到的categorical cols 创建 String Indexer

In [0]:
#CTH's code
categorical_cols = ['neighbourhood_group_cleansed',
       'property_type', 'room_type', 'bed_type','cancellation_policy','calendar_updated']

indexers = [StringIndexer(inputCol=col, outputCol = col + "_IDX")\
            .setHandleInvalid("keep") for col in categorical_cols]
indexer_pipeline = Pipeline(stages=indexers)
#df_categorical = df.select(categorical_cols)
df_c_transformed = indexer_pipeline.fit(df).transform(df)
df_c_transformed.show(1)

+---+----+--------------------+--------------------+--------------------+-------------------+--------------------+-------+--------------------+---------+-------------------+--------------------+------------------+------------------+-----------------+--------------------+--------------------+-------------------+-------------------------+--------------------+----------------------+--------------------+-------------+----------------------+----------------------------+-------+-----+-------+--------------+------------+-------------+--------+----------+-----------------+-------------+---------------+------------+---------+--------+----+--------+--------------------+------+---------------+------------+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+---------------+---------------+---------------+----------------+---------------------+-----------------+------

在通过ont hot encoding 将这个categorical variables 整合成一个vector 作为一个feature


In [0]:
##CTH's code
encoded = [OneHotEncoder(inputCol = col + "_IDX", outputCol = col + 'Vec') for col in categorical_cols]

In [0]:
#CTH's code
encoded_pipeline = Pipeline(stages = encoded)
df_encoded = encoded_pipeline.fit(df_c_transformed).transform(df_c_transformed)
df_encoded.show(1)

+---+----+--------------------+--------------------+--------------------+-------------------+--------------------+-------+--------------------+---------+-------------------+--------------------+------------------+------------------+-----------------+--------------------+--------------------+-------------------+-------------------------+--------------------+----------------------+--------------------+-------------+----------------------+----------------------------+-------+-----+-------+--------------+------------+-------------+--------+----------+-----------------+-------------+---------------+------------+---------+--------+----+--------+--------------------+------+---------------+------------+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+---------------+---------------+---------------+----------------+---------------------+-----------------+------

In [0]:
#文字性列的处理 CTH's code
from pyspark.sql.functions import array

df =df.withColumn('host_verifications', array(df['host_verifications']))
df =df.withColumn('amenities', array(df['amenities']))

In [0]:
#文字性列的处理 CTH's code
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF

cv1 = CountVectorizer()\
    .setInputCol('host_verifications')\
    .setOutputCol('tf1')

cv2 = CountVectorizer()\
    .setInputCol('amenities')\
    .setOutputCol('tf2')

idf1 = IDF().\
    setInputCol("tf1").\
    setOutputCol("tfidf1")

idf2 = IDF().\
    setInputCol("tf2").\
    setOutputCol("tfidf2")

In [0]:
#CTH'code
#创建随机森林回国模型
from pyspark.ml.regression import RandomForestRegressor
regression = RandomForestRegressor(labelCol='price')
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(labelCol='price')

In [0]:
#cth's code
#feature的集合
from pyspark.ml.feature import VectorAssembler
list_numeric = ['host_is_superhost','host_listings_count','tfidf1','tfidf2'
                ,'host_identity_verified','accommodates','bathrooms','bedrooms'
                ,'beds','guests_included','number_of_reviews','number_of_reviews_ltm'
                ,'review_scores_rating','review_scores_accuracy','review_scores_cleanliness'
                ,'review_scores_checkin','review_scores_communication','review_scores_location'
                ,'review_scores_value','calculated_host_listings_count','calculated_host_listings_count_entire_homes'
                ,'calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms','reviews_per_month'
                ,'neighbourhood_group_cleansedVec','property_typeVec', 'room_typeVec', 'bed_typeVec'
                ,'cancellation_policyVec','calendar_updatedVec']
assemble = VectorAssembler(inputCols=list_numeric,outputCol='features')

In [0]:
transfomer1 = Pipeline(stages=indexers + encoded + [cv1,idf1,cv2,idf2,assemble,regression])
transfomer2=  Pipeline(stages=indexers + encoded + [cv1,idf1,cv2,idf2,assemble,gbt])

In [0]:
#Cth's code
#训练数据集，并预测价格
df_train,df_test = df.randomSplit([0.8,0.2],seed=13)

In [0]:
model_forest = transfomer1.fit(df_train).transform(df_test)
model_gbt = transfomer2.fit(df_train).transform(df_test)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Select columns to compute test error
evaluator = RegressionEvaluator(labelCol='price')
# Dictionary of model predictions to loop over
models = {'Gradient Boosted Trees': model_gbt, 'Random Forest Regression': model_forest}
for key, preds in models.items():
  # Create evaluation metrics
  rmse = evaluator.evaluate(preds, {evaluator.metricName: 'rmse'})
  r2 = evaluator.evaluate(preds, {evaluator.metricName: 'r2'})
  
  # Print Model Metrics
  print(key + ' RMSE: ' + str(rmse))
  print(key + ' R^2: ' + str(r2))

Gradient Boosted Trees RMSE: 78.41470390500861
Gradient Boosted Trees R^2: 0.7633095588808593
Random Forest Regression RMSE: 86.41012604363834
Random Forest Regression R^2: 0.7125813252926572


## Statistic description
## According to the host_response_rate and host_response_time columns, They have 1906 NA, we need to delete them, and draw a histgram and barplot to see the distribution of rate, we find most of hosts reply after receive messages and the time of reply is less than one hour.The mean_response_rate is 98.09%.



## Outlier Visualization

## Map

In [0]:
# import folium
# from folium import plugins

# data = Data().X.toPandas()

# incidents = folium.map.FeatureGroup()
# for lat, lng in zip(data.latitude, data.longitude):
#     incidents.add_child(
#       folium.CircleMarker(
#       [lat,lng],
#       radius=3,
#       color='yellow',
#       fill= True,
#       fill_color='red',
#       fill_opacity=0.4
#       )
# )

seattle_map = folium.Map(location=[data.latitude.mean(), data.longitude.mean()], zoom_start=12)
# incidents = plugins.MarkerCluster().add_to(seattle_map)
# for lat, lng, label in zip(data.latitude, data.longitude, data.price):
#     folium.Marker(
#         location=[lat, lng],
#         icon=None,
#         popup=label).add_to(incidents)

# seattle_map.add_child(incidents)
# seattle_map


# BarPlot

In [0]:
#CTH's code
host_response_time = df_pd['host_response_time']
print(host_response_time.value_counts())
col = ['#FF8247','#CD919E','#7EC0EE','#8B0000','#EEEE00',]
x = ['within an hour', 'within a few hours','within a day', 'a few days or more','N/A']
y = [5283,630,237,34,1444]  
plt.figure(figsize=(10,8),dpi = 80)
plt.xlabel('Time',fontsize = 18)
plt.ylabel('Frequency',fontsize = 18)
plt.title('Time of Response',fontsize = 20)
plt.bar(range(len(x)),y,width=0.5,color = col)
plt.xticks(range(len(x)),x)
plt.tick_params(labelsize=13)
plt.show()

In [0]:
#CTH's code
host_is_superhost = df_pd['host_is_superhost']
print(host_is_superhost.value_counts())
x = ['Superhost','Not Superhost']
y = [3745,5274]
plt.figure(figsize=(10,8),dpi = 80)
plt.ylabel('Frequency',fontsize = 18)
plt.title('Whether the host is superhost',fontsize = 20)
plt.bar(range(len(x)),y,width=0.2,color = 'orange')
plt.xticks(range(len(x)),x)
plt.tick_params(labelsize=15)
plt.show()

In [0]:
#CTH's code
property_type = df_pd['property_type'].dropna(axis=0,how='any')
property_type = property_type.value_counts()
property_type.plot(kind='bar',figsize=(10,8), color='orange',width = 0.8)
plt.xlabel('Property Type',fontsize = 18)
plt.ylabel('Frequency',fontsize = 18)
plt.title('Distribution of Property Type', fontsize = 20)
plt.tick_params(labelsize=15)
plt.show()

In [0]:
#CTH's code
room_type = df_pd['room_type'].dropna(axis=0,how='any')
room_type = room_type.value_counts()
room_type.plot(kind='bar',figsize=(10,8), color='orange', width = 0.8)
plt.xlabel('Room Type',fontsize = 18)
plt.ylabel('Frequency',fontsize = 18)
plt.title('Distribution of Room Type', fontsize = 20)
plt.tick_params(labelsize=15)
plt.xticks(rotation = 0)
plt.show()

### Histogram

In [0]:
#CTH's code
host_response_rate = df_pd['host_response_rate']
plt.figure(figsize=(10,8),dpi = 80)
plt.hist(host_response_rate,10,color='orange')
plt.xlabel('host_response_rate(%)',fontsize = 18)
plt.ylabel('frequency',fontsize = 18)
_xtick_labels = [i for i in range(0,101)]
plt.xticks(_xtick_labels[::10])
plt.title('Response rate of host',fontsize = 20)
plt.tick_params(labelsize=15)
plt.show()

In [0]:
#get list of neighbourhoods
neighbourhoods = df['neighbourhood_group_cleansed'].unique()

#get prices by month and neighbourhood
price_by_month_neighbourhood = df.groupby(['month','neighbourhood_group_cleansed']).mean().reset_index()

#plot prices for each neighbourhood
fig = plt.figure(figsize=(20,10))
ax = plt.subplot(111)

for neighbourhood in neighbourhoods:
    ax.plot(price_by_month_neighbourhood[price_by_month_neighbourhood['neighbourhood_group_cleansed'] == neighbourhood]['month'],
             price_by_month_neighbourhood[price_by_month_neighbourhood['neighbourhood_group_cleansed'] == neighbourhood]['price'],
             label = neighbourhood)
    
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

plt.ylabel('Average price, $')
plt.xlabel('Month')
plt.title('Average price for neighbourhood, $')

plt.savefig('average price for neighbourhood')

plt.show()


### Boxplot

In [0]:
# Mark
data = Data().raw_data.toPandas()

In [0]:
data.price  = data.price.dropna(axis=0,how='any')
data.price = data.price.str.replace(',','')
data.price = data.price.str.replace('$','')
data.price = pd.to_numeric(data.price)
sns.set(style="whitegrid")
sns.boxplot(x=data.price )

In [0]:
data.security_deposit  = data.security_deposit.dropna(axis=0,how='any')
data.security_deposit = data.security_deposit.str.replace(',','')
data.security_deposit = data.security_deposit.str.replace('$','')
data.security_deposit = pd.to_numeric(data.security_deposit)
sns.set(style="whitegrid")
sns.boxplot(x=data.security_deposit )

In [0]:
data.cleaning_fee  = data.cleaning_fee.dropna(axis=0,how='any')
data.cleaning_fee = data.cleaning_fee.str.replace(',','')
data.cleaning_fee = data.cleaning_fee.str.replace('$','')
data.cleaning_fee = pd.to_numeric(data.cleaning_fee)
sns.set(style="whitegrid")
sns.boxplot(x=data.cleaning_fee )

In [0]:
data = Data().raw_data.toPandas()

In [0]:
price_list = ['price', 'security_deposit', 'cleaning_fee']
for i in price_list:
  i  = data[i].dropna(axis=0,how='any')
  i = i.str.replace(',','')
  i = i.str.replace('$','')
  i = pd.to_numeric(i)
  sns.set(style="whitegrid")
  sns.boxplot(x=i)


In [0]:
list = ['host_listings_count', 
 'host_total_listings_count', 
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'guests_included',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'number_of_reviews',
 'number_of_reviews_ltm',
 ]


In [0]:
for i in list:
  title = i
  i = data[i].dropna(axis=0,how='any')
  sns.set(style="whitegrid")
  sns.boxplot(x=i)
  plt.show()

### PairPlot

## Correlation Visualization

In [0]:
data = Data().raw_data.toPandas()

In [0]:
#col_list = []
sns.heatmap()
cols = ['accommodates','bathrooms','bedrooms','beds','host_since_year',
        'host_listings_count', 'extra_people_fee',
        'review_scores_rating', 'price']

#Find out correlation between columns and plot
corrs = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1)
sns.set(rc={'figure.figsize':(7,7)})
hm=sns.heatmap(corrs, cbar = True, annot=True, square = True, fmt = '.2f',
              yticklabels = cols, xticklabels = cols).set_title('Correlations heatmap')

fig = hm.get_figure()

## Domain knowledge, knowledge research

# Data Wrangling

Constructing analyzing pipeline

In [0]:
from pyspark.ml import Pipeline, Transformer
from pyspark.sql import DataFrame


data = Data()
train_X = data.train_X
train_Y = data.test_Y
print(train_Y)


自定义管道，可用作验证时管道的复用

In [0]:
# 这是个自定义管道的例子
class CustomTransformer(Transformer):
  def _transform(self, df: DataFrame) -> DataFrame:
    return df


总体分析步骤，你们挑感兴趣的步骤写

In [0]:
class LowVarianceFilter(Transformer):
  def _transform(self, df: DataFrame) -> DataFrame:
    return df

空值分析

In [0]:
class MissingValueTransformer(Transformer):
  def _transform(self, df: DataFrame) -> DataFrame:
    return df

字段筛选

In [0]:
class DropColumnTransformer(Transformer):
  def _transform(self, df: DataFrame) -> DataFrame:
    return df

One-Hot Encoding

In [0]:
# 这个其实官方库已经有了，只需要找到网上的例子搬过来
class OneHotTransformer(Transformer):
  def _transform(self, df: DataFrame) -> DataFrame:
    return df

Outlier Handling
1. Drop

In [0]:
class OutlierDropTransformer(Transformer):
  def _transform(self, df: DataFrame) -> DataFrame:
    return df

2. Replace with mean

In [0]:
class OutlierReplaceMeanTransformer(Transformer):
  def _transform(self, df: DataFrame) -> DataFrame:
    return df

3. Replace with median

In [0]:
class OutlierReplaceMedianTransformer(Transformer):
  def _transform(self, df: DataFrame) -> DataFrame:
    return df

4. Randomly replae with new value(using normal distribution)

5. Logarithm encoding

In [0]:
class OutlierReplaceRandomNormalDistTransformer(Transformer):
  def _transform(self, df: DataFrame) -> DataFrame:
    return df

Scaling
1. Min-max Scaling

In [0]:
from pyspark.ml.feature import MinMaxScaler

#scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

#scalerModel = scaler.fit(dataFrame)

2. Standardize Scaling

In [0]:
from pyspark.ml.feature import StandardScaler

#scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
 #                       withStd=True, withMean=False)

Build actual pipeline

In [0]:
trans_pipe = Pipeline(stages=[
    LowVarianceFilter(),
    # MissingValueTransformer(),
    # feature.VectorAssembler(inputCols=['col1', 'col2', 'col3', 'col4'], outputCol='features'),
    # regression.LinearRegression(featuresCol='features', labelCol='abc')
])

display(trans_pipe.fit(train_X).transform(train_X).count())


In [0]:
train_X.show()

# Performance Evaluation

## Lasso

# Concolusion

\begin{equation} L\theta^{\lambda}(p(X),Y) = -\left( \sum_i Y_i \log p\theta(Xi) + (1-Y_i)\log(1-p\theta(Xi)) \right) + \lambda \sum{j>0} \left| \theta_j \right| \end{equation}