In [ ]:
// Create the Spark session for this notebook, or reuse one that is already running.
val sparkSession = {
  val builder = SparkSession.builder().appName("similarite_produits")
  builder.getOrCreate()
}

In [ ]:
// importer des librairies

import org.apache.spark._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.commons.lang3.StringUtils.stripAccents
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ListBuffer
import org.apache.spark.storage.StorageLevel
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Vectors

import java.util.Properties
import com.typesafe.config._
import scala.collection.JavaConverters._

import sparkSession.implicits._

In [ ]:
// Site identifier written into every recommendation row.
val idsite = 1
// Domain name written into every recommendation row.
val url_domain = "www.scandi-vie.com"
// Number of most-similar products kept for each product.
val nbReco = 5

# Lire les données dans HDFS

In [ ]:
// Load the product catalogue from HDFS as CSV: the first line holds the column names
// and column types are inferred from the data. (A "mode" -> "DROPMALFORMED" option was
// tried in the past and is intentionally not set.)
val df = {
  val csvOptions = Map("header" -> "true", "inferSchema" -> "true")
  sparkSession.read
    .format("com.databricks.spark.csv")
    .options(csvOptions)
    .load("hdfs://ecoles.node1.pro.hupi.loc:8020/user/ecoles/scandivie/products.csv")
}

// Make the catalogue queryable from Spark SQL under the view name "df".
df.createOrReplaceTempView("df")

In [ ]:
// Print the schema of the loaded catalogue
df.printSchema()

In [ ]:
// Keep only the columns the model needs, as (name, product_id, description) string tuples.
// Rows without a product_id are dropped because they cannot be keyed, and a missing name or
// description becomes "" so the later concatenation of the two never meets a null.
val data = df.select("name", "product_id", "description")
            .na.drop(Seq("product_id"))
            .map { row =>
              // Render a possibly-null cell as text, using "" when the cell is empty.
              def text(i: Int): String = Option(row.get(i)).fold("")(_.toString)
              (text(0), row.get(1).toString, text(2))
            }

In [ ]:
// Number of rows in the selected dataset
data.count()

# Création des fonctions intermédiaires

In [ ]:
/**
 * Normalises a raw product text before it is split into words.
 *
 * HTML tags, line breaks and tabs are turned into a space (so the words on either side
 * stay separate), the text is lower-cased, ':' and apostrophes become a space, the
 * remaining punctuation is deleted and accents are stripped.
 *
 * @param input raw text, possibly containing HTML markup; may be null
 * @return the cleaned text, or "" when `input` is null
 */
def format (input : String) : String = {
  // Characters that separate words: each one is replaced by a space.
  val separators = ":'\n\r\t"
  // Punctuation that is simply removed.
  val dropped = ".;,!/§?*-&~#(|`_\\^@)][=}{\"+<>"

  Option(input).fold("") { raw =>
    // A run of consecutive tags collapses into a single space.
    val withoutTags = raw.replaceAll("(?s)<[^>]*>(\\s*<[^>]*>)*", " ")
    val cleaned = withoutTags.toLowerCase()
      .map(c => if (separators.indexOf(c) >= 0) ' ' else c)
      .filter(c => dropped.indexOf(c) < 0)
    org.apache.commons.lang3.StringUtils.stripAccents(cleaned)
  }
}

// Load the stop-word list from HDFS and bring it to the driver (one entry per line of the file).
val stopWords = sc.textFile(s"hdfs://ecoles.node1.pro.hupi.loc:8020/user/ecoles/scandivie/StopWordsClean.txt").collect()

// Set view of the stop words: membership is a hash lookup instead of a scan of the whole array.
val stopWordSet: Set[String] = stopWords.toSet

/**
 * Removes the stop words from a tokenised text.
 *
 * @param input the words of one document
 * @return the words of `input` that are not stop words, in their original order
 */
def getGoodWords (input: Array[String]) : Array[String] =
  input.filterNot(word => stopWordSet.contains(word))

# Transformer les données pour TF-IDF

In [ ]:
// Pair every product id with its cleaned text: the name and the description joined by one space.
val atelier = data.map { case (name, productId, description) =>
  (productId, format(name.concat(" ").concat(description)))
}.rdd

In [ ]:
// Number of (product id, cleaned text) pairs
atelier.count()

In [ ]:
// Split every cleaned text on single spaces and discard the empty tokens.
val baseAtelier = atelier.map { case (_, text) => text.split(" ").filter(_.nonEmpty) }

// Remove the stop words, strip the accents of the remaining words,
// then hash each document into a term-frequency vector.
val documents = baseAtelier.map { tokens =>
  getGoodWords(tokens).map(word => stripAccents(word)).toSeq
}
val hashingTF = new HashingTF()
val tf = hashingTF.transform(documents)

In [ ]:
// Fit the IDF model on the term frequencies and rescale them into TF-IDF vectors.
val idf = new IDF().fit(tf)
val tfidf = idf.transform(tf)

// Product ids as Long values.
val index = atelier.map { case (productId, _) => productId.toLong }
// Every ordered pair of two different product ids.
val list = index.cartesian(index).filter { case (left, right) => left != right }
// Driver-side map from a product id to its position in `index`.
val idToIdx = index.zipWithIndex
  .map { case (productId, position) => (productId, position.toInt) }
  .collect()
  .toMap

In [ ]:
// Collect the tokenised documents to the driver as a local array
val docuM = documents.collect()
// Collect the TF-IDF vectors to the driver as a local array
val tfIdf = tfidf.collect()

In [ ]:
/**
 * Builds the sparse matrix entries describing one product.
 *
 * For every distinct word of the product's document an entry is produced whose row is the
 * hashed index of the word, whose column is the product id and whose value is the TF-IDF
 * weight of that word in the document.
 *
 * @param a product id; it must be a key of `idToIdx`
 * @return one `MatrixEntry` per distinct word of the product's document
 */
def createMatrixEntry (a : Long) : Array[MatrixEntry] = {
  val docIdx = idToIdx(a)
  docuM(docIdx).distinct.map { word =>
    val bucket = hashingTF.indexOf(word)
    MatrixEntry(bucket, a, tfIdf(docIdx)(bucket))
  }.toArray
}

# Créer la matrice pour faire le modèle de similarité

In [ ]:
// Matrix entries of every product, with exact duplicates removed.
val dataWithWeights = index.flatMap(productId => createMatrixEntry(productId)).distinct()

// Assemble the entries into a distributed matrix and convert it to indexed rows.
val mat = {
  val coordinates = new CoordinateMatrix(dataWithWeights)
  coordinates.toIndexedRowMatrix()
}

In [ ]:
// Column-to-column similarities of the matrix, emitted in both directions so that each
// column of a pair can be looked up as the key. The ids stay Long from end to end:
// narrowing them to Short would silently wrap any product id above 32767.
val simCos = mat.columnSimilarities().entries.flatMap { case MatrixEntry(i, j, v) =>
  Seq((i, j, v), (j, i, v))
}

// Same similarities keyed by the first id: (id, (other id, similarity)).
val k = simCos.map { case (productId, otherId, score) => (productId, (otherId, score)) }

In [ ]:
// Preview the first 10 (id, (other id, similarity)) records
k.take(10)

# Préparer la sortie pour l'enregistrement dans MongoDB

In [ ]:
// For every product keep its `nbReco` highest-scoring neighbours and emit one
// (product id, recommended id, idsite, url_domain) row per recommendation.
// Out-of-stock recommendations are meant to be filtered out; that step is currently disabled:
//   .map(l => (l._1, l._2.toList.filter(s => status_prod(s._1.toInt) == 1)))
val finalList = k.groupByKey().flatMap { case (productId, neighbours) =>
  val best = neighbours.toList.sortBy(_._2).reverse.take(nbReco)
  best.map { case (recommendedId, _) =>
    (productId.toLong, recommendedId.toLong, idsite, url_domain)
  }
}

In [ ]:
// Driver-side lookup from a product id to its name.
val findName = data
  .map { case (name, productId, _) => (productId.toLong, name) }
  .rdd
  .distinct()
  .collectAsMap()

In [ ]:
// Preview the first 100 recommendations with product names in place of ids.
finalList.map { case (productId, recommendedId, site, domain) =>
  (findName(productId), findName(recommendedId), site, domain)
}.take(100)

# Enregistrement en MongoDB

In [ ]:
// Turn the recommendation tuples into a DataFrame with named columns.
val finalRdd = finalList.toDF("id", "idR", "idsite", "url_domain")

// Write the recommendations to MongoDB, replacing the existing content of the collection.
finalRdd.write
  .format("com.mongodb.spark.sql")
  .mode("overwrite")
  .option("uri", "mongodb://ecoles.mongo.pro.hupi.loc:27017/ecoles.similarite_produits_scandivie")
  .save()