In [2]:
import findspark
findspark.init()
import pyspark
from operator import add
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

# Build (or reuse, via getOrCreate) the SparkSession used throughout the notebook;
# the application name is "Dataframe".
spark = SparkSession.builder.appName("Dataframe").getOrCreate()

In [3]:
# Read the sample people.json into a DataFrame (the path is relative to the working directory).
df = spark.read.json("data/src/main/resources/people.json")

In [4]:
# Print the DataFrame's rows as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [5]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [6]:
# Project only the `name` column and display it.
df.select("name").show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [7]:
# Show only the rows whose age is greater than 20.
# The row with a null age does not appear in the result either.
df.filter(df.age > 20).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [8]:
# Count the rows for each distinct age; null ends up as its own group in the output.
df.groupBy('age').count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [9]:
from pyspark.sql import Row

# Take the SparkContext from the session; the RDD cells (textFile, parallelize) go through it.
sc=spark.sparkContext

In [10]:
# Load people.txt as an RDD of text lines (the path is relative to the working directory).
lines = sc.textFile('data/src/main/resources/people.txt')

In [11]:
# Number of lines read from the file.
lines.count()

3

In [12]:
# Materialize all lines as a Python list to inspect the raw "name, age" text.
lines.collect()

['Michael, 29', 'Andy, 30', 'Justin, 19']

In [13]:
# Split each line on the comma into [name, age-text].
# The age text keeps its leading space because only ',' is the separator.
parts = lines.map(lambda line: line.split(','))
parts.collect()

[['Michael', ' 29'], ['Andy', ' 30'], ['Justin', ' 19']]

In [14]:
# Turn each [name, age-text] pair into a Row.
# int() accepts the leading space left in the age text by the split.
people = parts.map(
    lambda fields: Row(name=fields[0], age=int(fields[1]))
)

In [15]:
# Inspect the Row objects built from the text file.
people.collect()

[Row(age=29, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin')]

In [16]:
# Build a DataFrame from the RDD of Rows; no explicit schema is passed.
schemaPeople = spark.createDataFrame(people)

In [17]:
# Display the DataFrame created from the Rows.
schemaPeople.show()

+---+-------+
|age|   name|
+---+-------+
| 29|Michael|
| 30|   Andy|
| 19| Justin|
+---+-------+



In [18]:
# Show the split [name, age-text] fields again.
parts.collect()

[['Michael', ' 29'], ['Andy', ' 30'], ['Justin', ' 19']]

In [19]:
# Rebinds `people` (previously an RDD of Rows) to an RDD of (name, tokens) tuples.
# NOTE(review): str.split() returns a list, so every value is a one-element list such as ['29'];
# if a plain string was intended, strip() would be the call to use — confirm intent.
people=parts.map(lambda x:(x[0],x[1].split()))

In [20]:
# Inspect the (name, tokens) tuples.
people.collect()

[('Michael', ['29']), ('Andy', ['30']), ('Justin', ['19'])]

In [21]:
# Load users.parquet through the generic reader (no explicit format is passed).
# This rebinds `df`, so the DataFrame read from people.json is no longer reachable under that name.
df = spark.read.load("data/src/main/resources/users.parquet")

In [23]:
# Save only the name and favorite_color columns.
# Overwrite mode keeps this cell re-runnable: with the writer's default mode the save
# raises an error as soon as 'nameAndColor.parquet' exists from a previous run.
df.select('name', 'favorite_color').write.mode('overwrite').save('nameAndColor.parquet')

In [24]:
# Display the rows loaded from users.parquet.
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [25]:
# A one-element list holding a JSON document with a nested `address` object.
jsonStrings = ['{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}']
# Distribute the strings as an RDD and let the JSON reader parse them into a DataFrame.
otherPeopleRDD = sc.parallelize(jsonStrings)
otherPeople = spark.read.json(otherPeopleRDD)

In [26]:
# Display the parsed record; the nested address object is shown as a single column.
otherPeople.show()

+---------------+----+
|        address|name|
+---------------+----+
|[Columbus,Ohio]| Yin|
+---------------+----+



In [30]:
# Word count: pair every word with 1, then sum the 1s per word.
# `add` is already imported from `operator` in the notebook's first cell, so it is not
# re-imported here; the lambda parameter is named `word` so it no longer shadows the RDD `x`.
x = sc.parallelize(['sparck','rdd','example','rdd'])
y = x.map(lambda word: (word, 1)).reduceByKey(add)
y.collect()

[('sparck', 1), ('example', 1), ('rdd', 2)]

In [32]:
# Pair every word with its length and sum the lengths per distinct word.
# A word that occurs twice contributes its length twice.
y = (
    x.map(lambda word: (word, len(word)))
     .reduceByKey(add)
)
y.collect()

[('sparck', 6), ('example', 7), ('rdd', 6)]

In [35]:
# Keep the even numbers from 1..10 and count them.
x = sc.parallelize(list(range(1, 11)))
y = x.filter(lambda n: n % 2 == 0)
y.count()

5

In [58]:
# Pair each name with a flag telling whether it contains the letter 'a'.
x = sc.parallelize(['marti','alicia','vivian'])
y = x.map(lambda name: (name, 'a' in name))
y.collect()

[('marti', True), ('alicia', True), ('vivian', True)]

In [52]:
# Add up the integers 1..10 with a pairwise reduce.
x = sc.parallelize(list(range(1, 11)))
suma = x.reduce(lambda total, n: total + n)
suma

55

In [59]:
# Nine names distributed as an RDD; the second argument (3) is the number of partitions requested.
x = sc.parallelize(["Joseph", "Jimmy", "Tina","Thomas", "James", "Cory","Christine", "Jackeline", "Juan"], 3)

In [63]:
# Group the names by their first letter.
# Each group is converted to a plain list so that it prints readably.
y = x.groupBy(lambda name: name[0])
print([(initial, list(members)) for initial, members in y.collect()])

[('J', ['Joseph', 'Jimmy', 'James', 'Jackeline', 'Juan']), ('C', ['Cory', 'Christine']), ('T', ['Tina', 'Thomas'])]
