# SparkContext

## méthode 1

In [1]:
from pyspark import SparkContext, SparkConf

# getOrCreate() returns the already-running context when this cell is executed
# again; calling the SparkContext constructor a second time in the same kernel
# raises "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName('Demo RDD'))

In [2]:
# Display the SparkContext object created in the previous cell.
sc

## méthode 2

In [3]:
from pyspark.sql import SparkSession

# Build the session (or reuse the existing one) step by step.
spark = (
    SparkSession.builder
    .appName('Demo RDD')
    .getOrCreate()
)

# The session exposes its SparkContext publicly; spark._sc is the private alias.
sc = spark.sparkContext
sc

# Création de RDD

## Parallélisation de collections

In [4]:
# Distribute the integers 0..29 as an RDD.
rdd = sc.parallelize(list(range(30)))

In [5]:
# Display the RDD object itself (its repr), not its contents.
rdd

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

## À partir de fichiers
```python
sc.textFile(chemin)
```
**chemin** : peut être le nom d'un fichier ou d'un dossier, avec la possibilité d'utiliser des caractères génériques.

*Exemples* : 'data.csv', 'datasets/*.csv', 'datasets' 

**Autres emplacements** : le chemin est local par rapport au nœud. Il est possible d'accéder à HDFS, Cassandra, HBase, Amazon S3, ... avec l'URI correspondante : hdfs://, s3a://, ...

### Lecture de lignes séparées

In [6]:
# Create an RDD from the text file zipcodes.csv, then display the RDD object.
rddText = sc.textFile('zipcodes.csv')
rddText

zipcodes.csv MapPartitionsRDD[2] at textFile at NativeMethodAccessorImpl.java:0

In [7]:
# Count the elements of rddText.
rddText.count()

42050

In [8]:
# Fetch the first element as a one-item list; here it is the CSV header line.
rddText.take(1)

['zip_code,latitude,longitude,city,state,county']

In [9]:
! mkdir files
! echo "exemple de texte" > files/f1.txt
! echo 'un autre exemple' > files/f2.txt

### Lecture du contenu entier
Création de PairRDD : Les éléments sont des tuples de la forme (chemin, contenu)

In [10]:
# Load every .txt file under files/ with wholeTextFiles, then display the RDD object.
rddFullText = sc.wholeTextFiles('files/*.txt')
rddFullText

org.apache.spark.api.java.JavaPairRDD@3c636475

In [11]:
# Count the elements of rddFullText.
rddFullText.count()

2

In [12]:
# Bring all (path, content) tuples back to the driver as a Python list.
rddFullText.collect()

[('file:/opt/spark-apps/files/f1.txt', 'exemple de texte\n'),
 ('file:/opt/spark-apps/files/f2.txt', 'un autre exemple\n')]

# Actions

In [13]:
# collect(): return every element of the RDD as a Python list.
rdd.collect()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29]

In [14]:
# count(): number of elements in the RDD.
rdd.count()

30

In [15]:
# take(3): return the first three elements as a list.
rdd.take(3)

[0, 1, 2]

In [16]:
# Draw 10 elements at random with replacement, using a fixed seed (10).
rdd.takeSample(
    withReplacement=True,
    num=10,
    seed=10,
)

[8, 13, 26, 13, 27, 16, 3, 28, 3, 20]

In [17]:
# Fold the RDD with pairwise addition to obtain the sum of all elements.
rdd.reduce(lambda left, right: left + right)

435

In [18]:
import shutil

# saveAsTextFile refuses to write into an existing directory, so remove the
# output of any previous run first; this keeps the cell re-runnable.
# NOTE(review): assumes 'rddOut' resolves to the local filesystem, as the
# `!cat rddOut/*` cell that follows implies -- confirm if the default FS changes.
shutil.rmtree('rddOut', ignore_errors=True)
rdd.saveAsTextFile('rddOut')

In [19]:
# Concatenate every file in rddOut/ to show what was saved.
!cat rddOut/*

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


# Transformations

In [20]:
# map with an inline lambda: square every element.
rdd.map(lambda value: value ** 2).collect()

[0,
 1,
 4,
 9,
 16,
 25,
 36,
 49,
 64,
 81,
 100,
 121,
 144,
 169,
 196,
 225,
 256,
 289,
 324,
 361,
 400,
 441,
 484,
 529,
 576,
 625,
 676,
 729,
 784,
 841]

In [21]:
def square(i):
    """Return ``i`` raised to the power 2."""
    return pow(i, 2)
# Same transformation as the lambda version, passing the named function instead.
rdd.map(square).collect()

[0,
 1,
 4,
 9,
 16,
 25,
 36,
 49,
 64,
 81,
 100,
 121,
 144,
 169,
 196,
 225,
 256,
 289,
 324,
 361,
 400,
 441,
 484,
 529,
 576,
 625,
 676,
 729,
 784,
 841]

In [22]:
from math import sqrt
# A library function can be handed to map directly, here math.sqrt.
rdd.map(sqrt).collect()

[0.0,
 1.0,
 1.4142135623730951,
 1.7320508075688772,
 2.0,
 2.23606797749979,
 2.449489742783178,
 2.6457513110645907,
 2.8284271247461903,
 3.0,
 3.1622776601683795,
 3.3166247903554,
 3.4641016151377544,
 3.605551275463989,
 3.7416573867739413,
 3.872983346207417,
 4.0,
 4.123105625617661,
 4.242640687119285,
 4.358898943540674,
 4.47213595499958,
 4.58257569495584,
 4.69041575982343,
 4.795831523312719,
 4.898979485566356,
 5.0,
 5.0990195135927845,
 5.196152422706632,
 5.291502622129181,
 5.385164807134504]

In [23]:
# Keep only the even numbers.
rdd.filter(lambda value: value % 2 == 0).collect()

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28]

In [24]:
# Each number yields the list of its digit characters; flatMap flattens
# those lists into a single RDD of characters.
rdd.flatMap(lambda number: list(str(number))).collect()

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '1',
 '0',
 '1',
 '1',
 '1',
 '2',
 '1',
 '3',
 '1',
 '4',
 '1',
 '5',
 '1',
 '6',
 '1',
 '7',
 '1',
 '8',
 '1',
 '9',
 '2',
 '0',
 '2',
 '1',
 '2',
 '2',
 '2',
 '3',
 '2',
 '4',
 '2',
 '5',
 '2',
 '6',
 '2',
 '7',
 '2',
 '8',
 '2',
 '9']

In [25]:
# Multiples of 3 and multiples of 5, as two separate RDDs.
mult3 = rdd.filter(lambda value: value % 3 == 0)
mult5 = rdd.filter(lambda value: value % 5 == 0)

# Elements present in both RDDs, i.e. the multiples of 15.
mult3.intersection(mult5).collect()

[0, 15]

In [26]:
# Sort in descending order, using each element as its own sort key.
rdd.sortBy(lambda value: value, ascending=False).collect()

[29,
 28,
 27,
 26,
 25,
 24,
 23,
 22,
 21,
 20,
 19,
 18,
 17,
 16,
 15,
 14,
 13,
 12,
 11,
 10,
 9,
 8,
 7,
 6,
 5,
 4,
 3,
 2,
 1,
 0]

# PairRDD

In [27]:
# Build a PairRDD of (digitSum, number) tuples: the digit sum is the key
def somme(i):
    """Return the sum of the decimal digits of the integer ``i``.

    The sign is ignored, so ``somme(-12)`` equals ``somme(12)``. Without the
    ``abs`` call a negative input raises ``ValueError``, because the leading
    ``'-'`` of its string form is not a digit.
    """
    return sum(int(chiffre) for chiffre in str(abs(i)))
prdd = rdd.map(lambda number: (somme(number), number))  # key = digit sum, value = the number

In [28]:
# Show every tuple of the PairRDD.
prdd.collect()

[(0, 0),
 (1, 1),
 (2, 2),
 (3, 3),
 (4, 4),
 (5, 5),
 (6, 6),
 (7, 7),
 (8, 8),
 (9, 9),
 (1, 10),
 (2, 11),
 (3, 12),
 (4, 13),
 (5, 14),
 (6, 15),
 (7, 16),
 (8, 17),
 (9, 18),
 (10, 19),
 (2, 20),
 (3, 21),
 (4, 22),
 (5, 23),
 (6, 24),
 (7, 25),
 (8, 26),
 (9, 27),
 (10, 28),
 (11, 29)]

In [29]:
# Group by key: numbers sharing the same digit sum end up together, e.g. [3, 12, 21, ...]
res1 = prdd.groupByKey().collect()

# Order the keys and the members of each group for an ordered display.
sorted((key, sorted(values)) for key, values in res1)

[(0, [0]),
 (1, [1, 10]),
 (2, [2, 11, 20]),
 (3, [3, 12, 21]),
 (4, [4, 13, 22]),
 (5, [5, 14, 23]),
 (6, [6, 15, 24]),
 (7, [7, 16, 25]),
 (8, [8, 17, 26]),
 (9, [9, 18, 27]),
 (10, [19, 28]),
 (11, [29])]

In [30]:
# The same result can be obtained with groupBy, which derives the key from each element.
res = rdd.groupBy(somme).collect()

# Order the keys and the members of each group for an ordered display.
sorted((key, sorted(values)) for key, values in res)

[(0, [0]),
 (1, [1, 10]),
 (2, [2, 11, 20]),
 (3, [3, 12, 21]),
 (4, [4, 13, 22]),
 (5, [5, 14, 23]),
 (6, [6, 15, 24]),
 (7, [7, 16, 25]),
 (8, [8, 17, 26]),
 (9, [9, 18, 27]),
 (10, [19, 28]),
 (11, [29])]

In [31]:
# Count how many elements share each key (here: how many numbers per digit sum).
prdd.countByKey()

defaultdict(int,
            {0: 1,
             1: 2,
             2: 3,
             3: 3,
             4: 3,
             5: 3,
             6: 3,
             7: 3,
             8: 3,
             9: 3,
             10: 2,
             11: 1})