### 1. Création d'une session Spark

In [1]:
from pyspark.sql import SparkSession

# The master URL selects where Spark runs: "local" runs locally,
# "local[4]" runs locally with 4 cores, and "spark://master:7077"
# connects to a Spark standalone cluster.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("demoRDD")
    .getOrCreate()  # reuse the existing SparkSession if there is one, otherwise create it
)

### 2. Création d'un RDD à partir d'un fichier texte

In [2]:
# Load the text file as an RDD; the cell output below shows the RDD's
# description, and later cells show that each element is one line of the file.
chemin_fichier = '../data/test.txt'
rdd01 = spark.sparkContext.textFile(chemin_fichier)
rdd01

../data/test.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

### 3.a Affichage du RDD

In [6]:
# Printing an RDD shows only its description (source path, RDD type, call site),
# not the lines it contains.
print(rdd01)

../data/test.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0


### 3.b Affichage des données du RDD => opération de récupération des données

In [13]:
# collect() returns all the elements of the RDD as a Python list.
# Despite its name, words_list holds whole lines at this point (see the output).
words_list = rdd01.collect()
words_list[:6]  # display only the first six entries

['Project Gutenberg’s',
 'Alice’s Adventures in Wonderland',
 'by Lewis Carroll',
 'This eBook is for the use',
 'of anyone anywhere',
 'at no cost and with']

In [14]:
import re
from collections import Counter

# Tokenise every line on runs of non-word characters. Splitting on `\W+`
# (rather than a single `\W`) and dropping empty strings prevents blank lines,
# consecutive punctuation, or leading/trailing punctuation from producing a
# spurious '' "word" in the counts.
words_list = rdd01.flatMap(lambda line: re.split(r'\W+', line))\
                  .filter(lambda token: token != '')\
                  .collect()

# Normalise case so that "Alice" and "alice" are counted together.
words_lowercase_list = [w.lower() for w in words_list]

# Counter tallies the words in a single pass; calling list.count() once per
# word would rescan the whole list each time (quadratic in the number of words).
# dict() keeps the plain-dict type and the first-occurrence key order.
words_count_dict = dict(Counter(words_lowercase_list))
words_count_dict

{'project': 9,
 'gutenberg': 9,
 's': 27,
 'alice': 18,
 'adventures': 18,
 'in': 18,
 'wonderland': 18,
 'by': 18,
 'lewis': 18,
 'carroll': 18,
 'this': 27,
 'ebook': 27,
 'is': 27,
 'for': 27,
 'the': 27,
 'use': 27,
 'of': 27,
 'anyone': 27,
 'anywhere': 27,
 'at': 27,
 'no': 27,
 'cost': 27,
 'and': 27,
 'with': 27,
 'poulet': 1}

### 4. Afficher le nombre de lignes dans le fichier

In [15]:
# count() returns the number of elements in the RDD, i.e. the number of
# lines read from the text file.
rdd01.count()

127

### 5. Afficher les 10 premières lignes du fichier

In [16]:
# take(10) returns the first 10 elements of the RDD as a Python list.
rdd01.take(10)

['Project Gutenberg’s',
 'Alice’s Adventures in Wonderland',
 'by Lewis Carroll',
 'This eBook is for the use',
 'of anyone anywhere',
 'at no cost and with',
 'Alice’s Adventures in Wonderland',
 'by Lewis Carroll',
 'This eBook is for the use',
 'of anyone anywhere']

### 6. Filtrer les lignes qui contiennent le mot "Lewis"

In [37]:
def mentionne_lewis(line):
    """Tell whether `line` contains "lewis", compared case-insensitively."""
    return 'lewis'.casefold() in line.casefold()


# filter() keeps the matching lines; collect() then returns them as a Python
# list, so despite its "rdd_" prefix this variable is a list, not an RDD.
rdd_lignes_contenant_lewis = rdd01.filter(mentionne_lewis).collect()
rdd_lignes_contenant_lewis

['by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll',
 'by Lewis Carroll']

### 7. Transformer en majuscules les lignes qui contiennent "lewis"
### cas 1 : à partir du début

In [43]:
# Upper-case every line that mentions "lewis" (case-insensitive match);
# all other lines are passed through unchanged.
# take(20) returns the same first 20 elements as collect()[:20], but without
# bringing the whole RDD back only to throw most of it away.
rdd01.map(lambda line: line.upper() if 'lewis'.casefold() in line.casefold() else line)\
     .take(20)

['Project Gutenberg’s',
 'Alice’s Adventures in Wonderland',
 'BY LEWIS CARROLL',
 'This eBook is for the use',
 'of anyone anywhere',
 'at no cost and with',
 'Alice’s Adventures in Wonderland',
 'BY LEWIS CARROLL',
 'This eBook is for the use',
 'of anyone anywhere',
 'at no cost and with',
 'This eBook is for the use',
 'of anyone anywhere',
 'at no cost and with',
 'Project Gutenberg’s',
 'Alice’s Adventures in Wonderland',
 'BY LEWIS CARROLL',
 'This eBook is for the use',
 'of anyone anywhere',
 'at no cost and with']

### cas 2 : en réutilisant la liste des lignes collectée précédemment

In [44]:
# rdd_lignes_contenant_lewis is a collected Python list that contains the same
# line many times. Converting it to a set removes the duplicates and makes the
# membership test a constant-time lookup instead of a scan of the list for
# every line of the RDD.
lignes_lewis = set(rdd_lignes_contenant_lewis)

# Upper-case the lines that belong to the set; leave the others unchanged.
# take(20) returns the same first 20 elements as collect()[:20] without
# fetching the whole RDD.
rdd01.map(lambda line: line.upper() if line in lignes_lewis else line)\
     .take(20)

['Project Gutenberg’s',
 'Alice’s Adventures in Wonderland',
 'BY LEWIS CARROLL',
 'This eBook is for the use',
 'of anyone anywhere',
 'at no cost and with',
 'Alice’s Adventures in Wonderland',
 'BY LEWIS CARROLL',
 'This eBook is for the use',
 'of anyone anywhere',
 'at no cost and with',
 'This eBook is for the use',
 'of anyone anywhere',
 'at no cost and with',
 'Project Gutenberg’s',
 'Alice’s Adventures in Wonderland',
 'BY LEWIS CARROLL',
 'This eBook is for the use',
 'of anyone anywhere',
 'at no cost and with']

### 8. Compter le nombre d'occurrences du mot "Wonderland" dans chaque ligne

In [71]:
# Count the (case-sensitive) occurrences of 'Wonderland' in each line, giving
# one integer per line.
# The list is converted with the built-in repr() instead of calling the
# __repr__ dunder directly; presumably the string form is used to keep the
# notebook output on a single line.
repr(rdd01.map(lambda line: line.count('Wonderland')).collect())

'[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]'

In [70]:
# Pair each line with the number of times 'Wonderland' occurs in it.
# The result only contains (line, count) tuples, so no line index is computed:
# a single map() produces the pairs directly.
rdd01.map(lambda line: (line, line.count('Wonderland')))\
     .collect()

[('Project Gutenberg’s', 0),
 ('Alice’s Adventures in Wonderland', 1),
 ('by Lewis Carroll', 0),
 ('This eBook is for the use', 0),
 ('of anyone anywhere', 0),
 ('at no cost and with', 0),
 ('Alice’s Adventures in Wonderland', 1),
 ('by Lewis Carroll', 0),
 ('This eBook is for the use', 0),
 ('of anyone anywhere', 0),
 ('at no cost and with', 0),
 ('This eBook is for the use', 0),
 ('of anyone anywhere', 0),
 ('at no cost and with', 0),
 ('Project Gutenberg’s', 0),
 ('Alice’s Adventures in Wonderland', 1),
 ('by Lewis Carroll', 0),
 ('This eBook is for the use', 0),
 ('of anyone anywhere', 0),
 ('at no cost and with', 0),
 ('Alice’s Adventures in Wonderland', 1),
 ('by Lewis Carroll', 0),
 ('This eBook is for the use', 0),
 ('of anyone anywhere', 0),
 ('at no cost and with', 0),
 ('This eBook is for the use', 0),
 ('of anyone anywhere', 0),
 ('at no cost and with', 0),
 ('Project Gutenberg’s', 0),
 ('Alice’s Adventures in Wonderland', 1),
 ('by Lewis Carroll', 0),
 ('This eBook is for 