<a href="https://colab.research.google.com/github/jianfeiZhao/BI_projs/blob/master/pyspark_hotel_rec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
!pip install pyspark



In [29]:
# 使用pyspark计算文档的TFIDF
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF
#from pyspark.ml.feature import NGram

# 创建SparkSession，2.0版本之后只需要创建一个SparkSession即可
spark=SparkSession \
        .builder \
        .appName('tfidf_app') \
        .getOrCreate()

# 加载数据
df0 = spark.read.csv("Seattle_Hotels.csv", header=True, inferSchema=True)
df0.show(6)
print('数据集中的酒店个数：',df0.count())

+---+--------------------+--------------------+--------------------+
| id|                name|             address|                desc|
+---+--------------------+--------------------+--------------------+
|  1|Hilton Garden Sea...|1821 Boren Avenue...|Located on the so...|
|  2|Sheraton Grand Se...|1400 6th Avenue, ...|Located in the ci...|
|  3|Crowne Plaza Seat...|1113 6th Ave, Sea...|Located in the he...|
|  4|Kimpton Hotel Mon...|1101 4th Ave, Sea...|What?s near our h...|
|  5|  The Westin Seattle|1900 5th Avenue,�...|Situated amid inc...|
|  6|The Paramount Hot...|724 Pine Street, ...|More than just a ...|
+---+--------------------+--------------------+--------------------+
only showing top 6 rows

数据集中的酒店个数： 152


In [30]:
# 将desc分词
tokenizer = Tokenizer(inputCol='desc', outputCol='desc_words')
df = tokenizer.transform(df0)
#df.show()
#df.select('desc_words').show(6)

# 去停用词
stopwords = ['the', 'of', 'in', 'a', 'an', 'at', 'as', 'on', 'for',\
       'it', 'we', 'you', 'want', 'up', 'to', 'if', 'are', 'is',\
        'and', 'our', 'with', 'from', '-', 'your', 'so']
stopwords_remover = StopWordsRemover(inputCol='desc_words', outputCol='desc_remv').setStopWords(stopwords)
df = stopwords_remover.transform(df)

# 计算TF-IDF
hashingTF = HashingTF(inputCol='desc_remv', outputCol='desc_words_tf')
tf = hashingTF.transform(df).cache()
idf = IDF(inputCol='desc_words_tf', outputCol='desc_words_tfidf').fit(tf)
tfidf = idf.transform(tf).cache()
#print('每个酒店的tfidf', tfidf.select('desc_words_tfidf').show(6,truncate=False))

# 数据规范化
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="desc_words_tfidf", outputCol="norm")
tfidf = normalizer.transform(tfidf)
#tfidf.select("id", "norm").show(6)

# 计算各个酒店之间的相似度
import pyspark.sql.functions as psf 
from pyspark.sql.types import DoubleType
dot_udf = psf.udf(lambda x,y: float(x.dot(y)), DoubleType()) # 自定义点乘函数
tfidf = tfidf.alias("a1").join(tfidf.alias("a2"), psf.col("a1.id") < psf.col("a2.id"))\
        .select(
            psf.col("a1.name"),
            psf.col("a1.id").alias("id1"), 
            psf.col("a2.id").alias("id2"), 
            dot_udf("a1.norm", "a2.norm").alias("similarity"))\
        .sort("id1", "id2")
tfidf.show(10)

+--------------------+---+---+--------------------+
|                name|id1|id2|          similarity|
+--------------------+---+---+--------------------+
|Hilton Garden Sea...|  1|  2|0.027762951889516864|
|Hilton Garden Sea...|  1|  3| 0.03293087162593186|
|Hilton Garden Sea...|  1|  4|0.016296905517573637|
|Hilton Garden Sea...|  1|  5|0.061953533294186125|
|Hilton Garden Sea...|  1|  6|0.026031641494299346|
|Hilton Garden Sea...|  1|  7| 0.06286237883702435|
|Hilton Garden Sea...|  1|  8|0.030473799292964114|
|Hilton Garden Sea...|  1|  9|0.014074389630803305|
|Hilton Garden Sea...|  1| 10| 0.04236071927221931|
|Hilton Garden Sea...|  1| 11| 0.03481589805978849|
+--------------------+---+---+--------------------+
only showing top 10 rows



In [41]:
# 基于相似度和指定的酒店name，推荐TOP10酒店
def recommendations(name):
    temp = tfidf.where('name="'+name+'"').sort('similarity', ascending=False).limit(10)
    temp = temp.drop('name')
    rec = df0.join(temp, df0.id==temp.id2).sort('similarity', ascending=False)
    return rec.select('id','name','address','desc','similarity').show()

rec = recommendations('Hilton Seattle Airport & Conference Center')
rec = recommendations('The Bacon Mansion Bed and Breakfast')

+---+--------------------+--------------------+--------------------+-------------------+
| id|                name|             address|                desc|         similarity|
+---+--------------------+--------------------+--------------------+-------------------+
| 63|Embassy Suites by...|15920 W Valley Hw...|The Embassy Suite...| 0.1292847349555283|
| 53|DoubleTree by Hil...|18740 Internation...|Welcome to Double...|0.11942642456944962|
|104|Four Points by Sh...|601 Roy St, Seatt...|Where the Action ...|0.09185925317552458|
| 58|Red Lion Hotel Se...|18220 Internation...|Welcome to Red Li...|0.09126698760167487|
| 72|Econo Lodge SeaTa...|13910 Tukwila Int...|Our Econo Lodge�S...|0.07586577219261216|
| 52|Radisson Hotel Se...|18118 Internation...|Located across th...|0.07292031513026236|
|147|Hampton Inn Seatt...|7200 S 156th St, ...|Stay in comfort a...|0.07108287270117035|
| 57|Best Western Seat...|20717 Internation...|Enjoy convenient ...| 0.0698914928810006|
| 64|Home2 Suites by 