In [1]:
import org.apache.spark.rdd.RDD
import org.apache.spark.ml.feature.{Word2Vec, Word2VecModel}

Initializing Scala interpreter ...

Spark Web UI available at http://DESKTOP-2NBBC4T:4040
SparkContext available as 'sc' (version = 2.4.3, master = local[*], app id = local-1562965581914)
SparkSession available as 'spark'


import org.apache.spark.rdd.RDD
import org.apache.spark.ml.feature.{Word2Vec, Word2VecModel}


In [2]:
// Word2Vec hyper-parameters, read by getModel() below.
val inputCol: String = "text"
val maxIter: Int = 1
val minCount: Int = 10
val numPartitions: Int = 1
val seed: Int = 42
val vectorSize: Int = 15

// When true, the dataset is down-sampled before training.
val debug: Boolean = false

inputCol: String = text
maxIter: Int = 1
minCount: Int = 10
numPartitions: Int = 1
seed: Int = 42
vectorSize: Int = 15
debug: Boolean = false


In [3]:
/** Prints the elements of an RDD on the driver, one element per line.
  *
  * @param rdd the RDD whose elements are printed
  * @param n   number of elements to fetch with `take`; when 0 (the default)
  *            the whole RDD is collected to the driver instead
  * @tparam T  element type of the RDD
  */
def printRDD[T](rdd: RDD[T], n: Int = 0): Unit = {
  // n == 0 is the "print everything" sentinel; any other value goes through take(n).
  val rows = if (n == 0) rdd.collect() else rdd.take(n)
  rows.foreach(println)
}

printRDD: [T](rdd: org.apache.spark.rdd.RDD[T], n: Int)Unit


In [4]:
// Raw CSV lines; declared as a var because later cells reassign it (header removal, debug sampling).
var data: RDD[String] = sc.textFile("data/item_metadata.csv")

data: org.apache.spark.rdd.RDD[String] = data/item_metadata.csv MapPartitionsRDD[1] at textFile at <console>:27


In [5]:
// Preview the first ten raw lines (header included).
printRDD(data, n = 10)

item_id,properties
5101,Satellite TV|Golf Course|Airport Shuttle|Cosmetic Mirror|Safe (Hotel)|Telephone|Hotel|Sitting Area (Rooms)|Reception (24/7)|Air Conditioning|Hypoallergenic Rooms|Cable TV|Hotel Bar|Pool Table|Bathtub|Satisfactory Rating|Room Service|Luxury Hotel|Terrace (Hotel)|Television|Minigolf|Business Hotel|Shower|Cot|Gym|Hairdryer|Hypoallergenic Bedding|Accessible Parking|From 3 Stars|Good Rating|Radio|4 Star|From 4 Stars|Family Friendly|Desk|Tennis Court (Indoor)|Balcony|WiFi (Public Areas)|Openable Windows|Express Check-In / Check-Out|Restaurant|Laundry Service|Ironing Board|Tennis Court|From 2 Stars|Business Centre|Bowling|Conference Rooms|Electric Kettle|Accessible Hotel|Porter|Bike Rental|Non-Smoking Rooms|Car Park|Safe (Rooms)|Fitness|Fan|Flatscreen TV|Computer with Internet|WiFi (Rooms)|Lift|Central Heating
5416,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Telephone|Hotel|Sitting Area (Rooms)|Reception (24/7)|Wheelchair Accessible|Hypoallergenic Rooms|Hotel Bar|Bathtub

In [6]:
// The first line of the file is the CSV header; keep only the lines that differ from it.
val header: String = data.first()
data = data.filter(_ != header)

header: String = item_id,properties
data: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[2] at filter at <console>:29


In [7]:
// In debug mode, continue with a 0.1% sample (without replacement, fixed seed) of the rows.
if (debug) {
  data = data.sample(withReplacement = false, fraction = 0.001, seed = 42)
}

In [8]:
// Number of rows left after the header was filtered out (and after sampling when debug is on).
data.count()

res2: Long = 927142


In [9]:
// Keep only the second comma-separated field of every row (the pipe-delimited properties string).
val truncated_data = data.map { row =>
  val fields = row.split(',')
  fields(1)
}

truncated_data: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[3] at map at <console>:28


In [10]:
// Split each properties string on '|' into an array of tokens and expose it as a single-column DataFrame.
// The column is named after `inputCol` (instead of a hard-coded "text") so it always matches the
// column the Word2Vec estimator built by getModel() is configured to read.
val preprocessed_Data = truncated_data.map(properties => Tuple1(properties.split('|'))).toDF(inputCol)

preprocessed_Data: org.apache.spark.sql.DataFrame = [text: array<string>]


In [11]:
// Every distinct property string that occurs anywhere in the dataset.
val words = truncated_data.flatMap(properties => properties.split('|')).distinct()

words: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[9] at distinct at <console>:28


In [12]:
// Show ten of the distinct property strings, then report how many there are in total.
printRDD(words, n = 10)
words.count()

Direct beach access
Guest House
Ironing Board
Bowling
Towels
Horse Riding
Reception (24/7)
Cot
2 Star
Cosmetic Mirror


res3: Long = 157


In [13]:
// Fetch the first three rows (the result is not bound to a name), then count all rows of the DataFrame.
preprocessed_Data.head(3)
preprocessed_Data.count()

res4: Long = 927142


In [14]:
/** Builds an unfitted Word2Vec estimator configured from the notebook-level settings
  * (`inputCol`, `maxIter`, `minCount`, `numPartitions`, `seed`, `vectorSize`).
  *
  * Despite the name, this returns the estimator rather than a trained model;
  * call `fit` on the result to obtain a `Word2VecModel`.
  *
  * @return a new, configured `Word2Vec` instance
  */
def getModel(): Word2Vec = {
  // Each setter returns the estimator itself, so the configuration can be chained
  // without a mutable local or an explicit `return`.
  new Word2Vec()
    .setInputCol(inputCol)
    .setMaxIter(maxIter)
    .setMinCount(minCount)
    .setNumPartitions(numPartitions)
    .setSeed(seed)
    .setVectorSize(vectorSize)
}

getModel: ()org.apache.spark.ml.feature.Word2Vec


In [15]:
// Train Word2Vec on the tokenized property lists.
val model: Word2VecModel = getModel().fit(preprocessed_Data)

model: org.apache.spark.ml.feature.Word2VecModel = w2v_f6bf20b6f3e3


In [16]:
/** Collects the learned embeddings of the dataset's property strings into a local map.
  *
  * The previous implementation always returned an empty map: `map ++ x -> ...` only built
  * and discarded a tuple, the loop ran as an RDD `foreach` (on executors), and
  * `model.getVectors(x)` is a DataFrame column lookup rather than a word lookup.
  *
  * @param model a fitted Word2Vec model
  * @return property string -> embedding, for every entry of `words` that the model has a
  *         vector for (the model may omit words, e.g. those below its minimum count)
  */
def getVectors(model: Word2VecModel): Map[String, Array[Float]] = {
  // `words` holds the distinct property strings; bring them to the driver once for lookups.
  val knownWords = words.collect().toSet
  // model.getVectors is a DataFrame with a "word" column and a "vector" column.
  model.getVectors
    .collect()
    .iterator
    .map { row =>
      val word = row.getAs[String]("word")
      val vector = row.getAs[org.apache.spark.ml.linalg.Vector]("vector")
      word -> vector.toArray.map(_.toFloat)
    }
    .filter { case (word, _) => knownWords(word) }
    .toMap
}

getVectors: (model: org.apache.spark.ml.feature.Word2VecModel)Map[String,Array[Float]]


In [17]:
// The ten properties whose embeddings are closest to "Gym", with their similarity scores.
model.findSynonyms("Gym", 10).collect()

res5: Array[org.apache.spark.sql.Row] = Array([Swimming Pool (Outdoor),0.9618108868598938], [Spa Hotel,0.94759202003479], [Ski Resort,0.945956826210022], [Health Retreat,0.9367016553878784], [Nightclub,0.9317147731781006], [Cot,0.930616557598114], [Casino (Hotel),0.922082245349884], [Szep Kartya,0.8853012919425964], [Beach,0.8825615644454956], [Hypoallergenic Bedding,0.8758768439292908])


In [18]:
// Persist the fitted model, replacing any previously saved copy at this path.
model.write.overwrite().save("word2vec-model-highdimensional")

In [19]:
// Dump the word/vector table as text files.
// saveAsTextFile refuses to write into an existing directory, so remove the output of a
// previous run first; this makes the cell re-runnable, matching the overwrite used for the model.
{
  val outputDir = "vectors-highdimensional"
  val outputPath = new org.apache.hadoop.fs.Path(outputDir)
  val fileSystem = outputPath.getFileSystem(sc.hadoopConfiguration)
  if (fileSystem.exists(outputPath)) {
    fileSystem.delete(outputPath, true) // recursive: the output is a directory of part files
  }
  model.getVectors.rdd.saveAsTextFile(outputDir)
}