In [6]:
val PATH = "file:///Users/lzz/work/SparkML/"

# 从MovieLens 数据集提取特征

In [7]:
val movies = sc.textFile( PATH+"data/ml-100k/u.item")
println( movies.first )

1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0


In [8]:
val genres = sc.textFile( PATH + "data/ml-100k/u.genre")
genres.take(5).foreach(println)

unknown|0
Action|1
Adventure|2
Animation|3
Children's|4


In [16]:
val genreMap = genres.filter( !_.isEmpty ).map( line => line.split("\\|")).
map(array => (array(1),array(0))).collectAsMap
println(genreMap)

Map(2 -> Adventure, 5 -> Comedy, 12 -> Musical, 15 -> Sci-Fi, 8 -> Drama, 18 -> Western, 7 -> Documentary, 17 -> War, 1 -> Action, 4 -> Children's, 11 -> Horror, 14 -> Romance, 6 -> Crime, 0 -> unknown, 9 -> Fantasy, 16 -> Thriller, 3 -> Animation, 10 -> Film-Noir, 13 -> Mystery)


In [17]:
val titlesAndGenres = movies.map(_.split("\\|")).map{ array =>
    val genres = array.toSeq.slice(5, array.size)
    val genresAssigned = genres.zipWithIndex.filter{ 
        case (g, idx) =>
            g == "1"
    }.map{
        case (g, idx) => genreMap(idx.toString)
    }
    (array(0).toInt, (array(1), genresAssigned))
}
println(titlesAndGenres.first)

(1,(Toy Story (1995),ArrayBuffer(Animation, Children's, Comedy)))


In [20]:
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
val rawData = sc.textFile( PATH + "data/ml-100k/u.data" )
val rawRatings = rawData.map( _.split("\t").take(3) )
val ratings = rawRatings.map{ case Array(user, movie, rating) =>
    Rating(user.toInt, movie.toInt, rating.toDouble)
}
ratings.cache
val alsModel = ALS.train(ratings, 50, 10, 0.1)

In [21]:
import org.apache.spark.mllib.linalg.Vectors
val movieFactors = alsModel.productFeatures.map{ 
    case(id, factor) => (id, Vectors.dense(factor) )
}
val movieVectors = movieFactors.map( _._2 )
val userFactors = alsModel.userFeatures.map {
    case( id, factor ) => (id, Vectors.dense(factor) )
}
val userVectors = userFactors.map( _._2 )

In [23]:
import org.apache.spark.mllib.linalg.distributed.RowMatrix
val movieMatrix = new RowMatrix( movieVectors )
val movieMatrixSummary = movieMatrix.computeColumnSummaryStatistics()
val userMatrix = new RowMatrix(userVectors)
val userMatrixSummary = userMatrix.computeColumnSummaryStatistics()
println("Movie factors mean: " + movieMatrixSummary.mean )
println( "Movie factors variance: " + movieMatrixSummary.variance )
println( "User factors mean: " + userMatrixSummary.mean )
println( "User factors variance: " + userMatrixSummary.variance )

Movie factors mean: [-0.010956087400287078,-0.3503618170650178,0.06974321778051942,-0.1316383504373148,-0.3175018092607278,-0.1322647106693895,0.2537778433328245,-0.2029801309890879,-0.12961386425209565,-0.07082069156194899,0.06205439440201192,0.29001069000651375,-0.18597369739464903,-0.23937735319862308,-0.14342491847545685,0.18101149542654485,-0.13804024072643173,0.2850367666048985,0.12990755598051892,-0.2940282667527131,0.10936497810207288,-0.056700820289192294,0.008779148148335511,-0.09952976911271119,-0.12124526842150213,-0.1047346330659357,-0.01800057373980534,0.20805219400947975,-0.025770092278983504,-0.16276854660068052,0.14352873442556296,-0.04190886803719547,0.4564002303815088,-0.13991199983838468,0.24892268343949558,0.03683881854085137,-0.12161360237334293,0.05101734977006077,-0.1716588720612823,-0.25528384724181186,-0.24128689111316304,0.3161922110050902,-0.37518932943231786,-0.07739395342787778,-0.033145745157435264,-0.3182618436521061,0.31724803911912525,-0.29054483322329

## 训练聚类模型

In [24]:
import org.apache.spark.mllib.clustering.KMeans
val numClusters = 5
val numiterations = 10
val numRuns = 3
val movieClusterModel = KMeans.train( movieVectors, numClusters, numiterations, numRuns)

In [25]:
val movieClusterModelConverged = KMeans.train( movieVectors, numClusters, 100)

In [26]:
val userClusterModel = KMeans.train( userVectors, numClusters, numiterations, numRuns)

## 聚类模型预测

In [27]:
val movie1 = movieVectors.first
val movieCluster = movieClusterModel.predict(movie1)
println( movieCluster )

3


In [28]:
val predictions = movieClusterModel.predict( movieVectors )
println( predictions.take(10).mkString(",") )

3,1,0,1,0,1,1,0,0,1


In [30]:
import breeze.linalg._
import breeze.numerics.pow
def computeDistance( v1: DenseVector[Double], v2: DenseVector[Double] ) = pow( v1 - v2, 2 ).sum

In [31]:
val titlesWithFactors = titlesAndGenres.join(movieFactors)
val moviesAssigned = titlesWithFactors.map{
    case ( id, ((title, genres), vector)) =>
        val pred = movieClusterModel.predict( vector )
        val clusterCentre = movieClusterModel.clusterCenters( pred )
        val dist = computeDistance( DenseVector( clusterCentre.toArray), DenseVector(vector.toArray) )
            (id, title, genres.mkString(" "), pred, dist)
}
val clusterAssignments = moviesAssigned.groupBy{
    case (id, title, genres, cluster, dist) => cluster
}.collectAsMap

In [33]:
for( (k, v) <- clusterAssignments.toSeq.sortBy(_._1) ){
    println( s"Cluster $k:" )
    val m = v.toSeq.sortBy( _._5 )
    println( m.take(20).map{ 
        case( _, title, genres, _, d) => (title, genres, d)
    }.mkString("\n"))
    println("---------\n")
}

Cluster 0:
(King of the Hill (1993),Drama,0.16150193866972645)
(Witness (1985),Drama Romance Thriller,0.2274559199018781)
(All Over Me (1997),Drama,0.2629440718963916)
(Scream of Stone (Schrei aus Stein) (1991),Drama,0.26750415112992426)
(Ed's Next Move (1996),Comedy,0.33834019947892024)
(I Can't Sleep (J'ai pas sommeil) (1994),Drama Thriller,0.34065424343733147)
(Wings of Courage (1995),Adventure Romance,0.3574027323494375)
(Silence of the Palace, The (Saimt el Qusur) (1994),Drama,0.3667946097617337)
(Land and Freedom (Tierra y libertad) (1995),War,0.3667946097617337)
(Normal Life (1996),Crime Drama,0.3667946097617337)
(Eighth Day, The (1996),Drama,0.3667946097617337)
(Two Friends (1986) ,Drama,0.3667946097617337)
(Dadetown (1995),Documentary,0.3667946097617337)
(Girls Town (1996),Drama,0.3667946097617337)
(Big One, The (1997),Comedy Documentary,0.3667946097617337)
(Hana-bi (1997),Comedy Crime Drama,0.3667946097617337)
(� k�ldum klaka (Cold Fever) (1994),Comedy Drama,0.366794609761733

 评估模型的性能

In [34]:
val movieCost = movieClusterModel.computeCost( movieVectors )
val userCost = userClusterModel.computeCost( userVectors )
println( "WCSS for movies: " + movieCost )
println( "WCSS for movies: " + userCost )

WCSS for movies: 2282.7384462397395
WCSS for movies: 1476.204833283481


In [36]:
val trainTestSplitMovies = movieVectors.randomSplit( Array(0.6, 0.4), 123 )
val trainMovies = trainTestSplitMovies(0)
val testMovies = trainTestSplitMovies(1)
val costsMovies = Seq(2 ,3 ,4 ,5 ,10 ,20).map{
    k => (k, KMeans.train(trainMovies, numiterations, k, numRuns).computeCost(testMovies) )
}
println( "Movie clustering cross-validation:")
costsMovies.foreach{ case( k, cost) => println(f"WCSS for K=$k id $cost%2.2f") }

Movie clustering cross-validation:
WCSS for K=2 id 884.89
WCSS for K=3 id 867.80
WCSS for K=4 id 872.01
WCSS for K=5 id 879.41
WCSS for K=10 id 863.81
WCSS for K=20 id 860.85


In [38]:
val trainTestSplitUsers = userVectors.randomSplit( Array(0.6, 0.4), 123)
val trainUsers = trainTestSplitUsers(0)
val testUsers = trainTestSplitUsers(1)
val costsUsers = Seq( 2, 3, 4, 5 , 10, 20).map{
    k => (k, KMeans.train(trainUsers, numiterations, k, numRuns).computeCost(testUsers))
}
println( "User clustering cross-validation:")
costsUsers.foreach{
    case(k, cost) => println( f"WCSS for K=$k id $cost%2.2f")
}

User clustering cross-validation:
WCSS for K=2 id 591.78
WCSS for K=3 id 594.46
WCSS for K=4 id 598.01
WCSS for K=5 id 594.43
WCSS for K=10 id 596.87
WCSS for K=20 id 594.06
