# Lancer la spark session

In [69]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master('local[*]') \
    .config("spark.driver.memory", "5g") \
    .appName('app') \
    .getOrCreate()

In [68]:
spark.stop()

### On installe nltk pour les stop words

In [2]:
!pip install nltk



### On importe les bibliotheques que l'on va utiliser

In [70]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import *

from pyspark.ml.classification import *
from pyspark.ml import Pipeline
from pyspark.ml.tuning import *
from pyspark.ml.feature import  StringIndexer
import string


In [71]:
from pyspark.ml.evaluation import *

In [72]:
import nltk.corpus 
from nltk.corpus import stopwords

In [73]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### On convertit le csv en spark df

In [74]:
path = "DBPEDIA_train.csv"
df_train = spark.read.format('csv').options(header=True,delimiter=',').load(path)


In [13]:
df_train.schema

StructType(List(StructField(text,StringType,true),StructField(l1,StringType,true),StructField(l2,StringType,true),StructField(l3,StringType,true)))

### On observe nos données

In [14]:
df_train.count()

                                                                                

240942

## On remarque que nos données contiennent des lignes qui n'ont pas de labels donc il faut les degager

In [77]:
df_train=df_train.dropna()
df_train=df_train.filter((df_train.l1 == "Agent") | (df_train.l1=='Event') | (df_train.l1=='Species') | (df_train.l1=='Place')| (df_train.l1=='UnitOfWork')| (df_train.l1=='SportsSeason')| (df_train.l1=='TopicalConcept'))

In [16]:
df_train.select('l1').distinct().show()

[Stage 5:>                                                          (0 + 8) / 8]

+--------------+
|            l1|
+--------------+
|       Species|
|  SportsSeason|
|         Place|
|    UnitOfWork|
|         Event|
|TopicalConcept|
|         Agent|
+--------------+



                                                                                

In [15]:
df_train.select('l1').count()

                                                                                

192235

 On a finalement 7 labels et on a perdu environ 60k lignes sur 260k

In [78]:
add_stopwords= stopwords.words('english')

##### On definit la liste des stop words que l'on veut enlever

In [79]:
l = ["http","https","amp","rt","t","c","the"]
for i in l:
    add_stopwords.append(i)

In [80]:
s=list(string.punctuation)

In [81]:
for i in s:
    add_stopwords.append(i)

In [82]:
print('let s go')

let s go


## Pre-processing

Pour cette etape on a voulu tester plusieurs types de pre-pro

In [103]:
def apply_pipeline(pipeline,df):
    pipelineFit=pipeline.fit(df)
    return pipelineFit

###### Count vectorizer

In [104]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
# stop words
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

label_stringIdx = StringIndexer(inputCol = "l3", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors,label_stringIdx])

In [23]:
dataset_l1_p1=apply_pipeline(pipeline,df_train)

                                                                                

In [24]:
datal1p1 = dataset_l1_p1.transform(df_train)

In [18]:
dataset_l2_p1=apply_pipeline(pipeline,df_train)

                                                                                

In [16]:
from pyspark.ml import PipelineModel

pipe = PipelineModel.load("model_pipel2p1")

                                                                                

In [17]:
datal2p1 = pipe.transform(df_train)

In [105]:
dataset_l3_p1=apply_pipeline(pipeline,df_train)

                                                                                

In [106]:
datal3p1 = dataset_l3_p1.transform(df_train)

In [107]:
dataset_l3_p1.write().overwrite().save("model_pipel3p1")

                                                                                

In [20]:

dataset_l2_p1.write().overwrite().save("model_pipel2p1")

                                                                                

In [None]:

dataset_l3_p1.write().overwrite().save("model_pipel3p1")

Nos données sont pretes à etre utilisés dans des modeles de Ml

###### TF/IDF

In [108]:
ngram=NGram(inputCol=stopwordsRemover.getOutputCol(),outputCol="Ngrams")

In [109]:
hashingTF=HashingTF(inputCol=ngram.getOutputCol(),outputCol="features",numFeatures=10000)

In [112]:
idf=IDF(minDocFreq=3,inputCol=hashingTF.getOutputCol(),outputCol="final")

In [113]:
label_stringIdx = StringIndexer(inputCol = "l3", outputCol = "label")

In [114]:
pipeline_prepro=Pipeline(stages=[regexTokenizer,stopwordsRemover,ngram,hashingTF,idf,label_stringIdx])

In [22]:
dataset_l1_p2=apply_pipeline(pipeline_prepro,df_train)

                                                                                

In [28]:
datal1p2 = dataset_l1_p2.transform(df_train)

In [29]:
datal1p2.printSchema()

root
 |-- text: string (nullable = true)
 |-- l1: string (nullable = true)
 |-- l2: string (nullable = true)
 |-- l3: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Ngrams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- final: vector (nullable = true)
 |-- label: double (nullable = false)



In [67]:
spark.stop()

In [96]:
dataset_l2_p2=apply_pipeline(pipeline_prepro,df_train)

                                                                                

In [23]:
from pyspark.ml import PipelineModel



In [97]:
datal2p2 = dataset_l2_p2.transform(df_train)

In [115]:
dataset_l3_p2=apply_pipeline(pipeline_prepro,df_train)

                                                                                

In [116]:
datal3p2 = dataset_l3_p2.transform(df_train)

In [98]:
dataset_l2_p2.write().overwrite().save("model_pipel2p2")

In [28]:
dataset_l2_p2.write().overwrite().save("model_pipel2p2")

                                                                                

In [117]:
dataset_l3_p2.write().overwrite().save("model_pipel3p2")

Word2Vec prend trop de temps du coup on ne va pas le faire 

## Classification

##### definition de modele

In [118]:
lr = LogisticRegression(featuresCol='features',labelCol='label')

###### level 1

In [119]:
def fit_model(model,dataset):
    (trainingData, testData) = dataset.randomSplit([0.9, 0.1], seed = 100)
    lrModel = lr.fit(trainingData)
    return lrModel  

In [36]:
fitl1_1=fit_model(lr,datal1p1)

                                                                                

In [30]:
fitl1_2=fit_model(lr,datal1p2)

                                                                                

In [22]:
fitl2_1=fit_model(lr,datal2p1)

                                                                                

In [101]:
fitl2_2=fit_model(lr,datal2p2)

                                                                                

In [None]:
fitl3_1=fit_model(lr,datal3p1)

                                                                                

In [None]:
fitl3_2=fit_model(lr,datal3p2)

In [102]:
fitl2_2.write().overwrite().save("model_l2_p2")

                                                                                

In [32]:
spark.stop()

In [34]:

fitl3_1.write().overwrite().save("model_l3_p1")

ConnectionRefusedError: [Errno 111] Connection refused

In [None]:

fitl3_1.write().overwrite().save("model_l3_p1")
fitl3_2.write().overwrite().save("model_l3_p2")

In [60]:
df=risque.select(col("label"),col("text_class")).distinct()

In [73]:
df.collect()[0][0]

2.0

In [76]:
df.collect()[0][1]

'Species'

In [62]:
n=df.collect()

                                                                                

In [71]:
type(n[0])
dico={}
for row in range(0,len(n)):
    
    
    
    
    
    

In [100]:
test.show()


[Stage 156:>                                                        (0 + 1) / 1]

+--------------------+--------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|                text|            l1|               words|            filtered|            features|label|       rawPrediction|         probability|prediction|
+--------------------+--------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|"""Echourouk TV\"...|         Agent|[echourouk, tv, a...|[echourouk, tv, a...|(10000,[1,2,3,4,5...|  0.0|[2.99682356653826...|[0.71343850416859...|       0.0|
|"3757 Anagolay, p...|         Place|[3757, anagolay, ...|[3757, anagolay, ...|(10000,[0,1,2,3,4...|  1.0|[0.45869983148295...|[0.00144256628818...|       1.0|
|"Abies hidalgensi...|       Species|[abies, hidalgens...|[abies, hidalgens...|(10000,[0,1,2,3,4...|  2.0|[-0.2417903089158...|[0.00138855432011...|       2.0|
|"American Communi...|    UnitOfWork|[am

                                                                                

In [28]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="label",metricName='f1')

###### Score avec Count Vectorizer

In [32]:
evaluator.evaluate(test1)

NameError: name 'test1' is not defined

###### Score avec TF/IDF

In [46]:
evaluator.evaluate(test2)

                                                                                

0.916509190912641

###### Score avec Word2Vec

In [47]:
evaluator.evaluate(test3)

                                                                                

0.8505022768303262

### On passe à l'optimisation d'hyper parametres, tentative 1:

In [3]:
path = "DBPEDIA_val.csv/DBPEDIA_val.csv"
df_val = spark.read.format('csv').options(header=True,delimiter=',').load(path)

In [None]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="label",metricName='f1')
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
lr = LogisticRegression(featuresCol='features',labelCol='label')
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
    parallelism=2)
cvModel = cv.fit(data)

[Stage 116:>                                                        (0 + 8) / 8]

In [None]:
cvModel

In [34]:
cvm = cvModel.transform(data)

In [35]:
cvm.show()

+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|                text|        l1|                  l2|                  l3|               words|            filtered|            features|label|       rawPrediction|         probability|prediction|
+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|William Alexander...|     Agent|          Politician|             Senator|[william, alexand...|[william, alexand...|(10000,[0,8,17,19...|  0.0|[5.02644512438456...|[0.97139377838186...|       0.0|
|Pirqa (Aymara and...|     Place|        NaturalPlace|            Mountain|[pirqa, aymara, a...|[pirqa, aymara, q...|(10000,[22,37,61,...|  1.0|[1.27078119837361...|[0.08807323626568...|       1.0|
|Sistrurus

### Tentative de streaming