# createDataFrame

In [18]:
# One sample record: (drinker, beer, score).
a = [('Chris', 'Berliner', 5)]

# Build a DataFrame through sqlContext, naming the three columns, and pull the
# rows back to the driver as a list of Row objects.
sqlContext.createDataFrame(
    a,
    schema=['drinker', 'beer', 'score'],
).collect()

[Row(drinker='Chris', beer='Berliner', score=5)]

In [19]:
# Same construction as with sqlContext, this time through the `spark` session;
# the collected rows are identical.
spark.createDataFrame(
    data=a,
    schema=['drinker', 'beer', 'score'],
).collect()

[Row(drinker='Chris', beer='Berliner', score=5)]

In [20]:
# Two small relations that share a 'Drinker' key: the beer each drinker likes
# and the bar each drinker frequents. 'Matt' appears only in `likes` and
# 'Oscar' only in `frequents`, which is what the join cells rely on.
likes = [('Chris', 'Bud'), ('Kia', 'Berliner'), ('Matt', 'ARJK')]
frequents = [('Chris', 'Bohene'), ('Kia', 'Little'), ('Oscar', 'Griff')]

likesName = ['Drinker', 'Beer']
frequentsName = ['Drinker', 'Bar']

# Create both frames through the SparkSession (`spark`) instead of the
# deprecated `sqlContext` entry point; `spark` is already used by other cells
# in this notebook, and the resulting DataFrames are the same.
likesDF = spark.createDataFrame(likes, likesName)
frequentsDF = spark.createDataFrame(frequents, frequentsName)

likesDF.show()
frequentsDF.show()

+-------+--------+
|Drinker|    Beer|
+-------+--------+
|  Chris|     Bud|
|    Kia|Berliner|
|   Matt|    ARJK|
+-------+--------+

+-------+------+
|Drinker|   Bar|
+-------+------+
|  Chris|Bohene|
|    Kia|Little|
|  Oscar| Griff|
+-------+------+



# right join

In [21]:
# Right join on Drinker: every frequentsDF row is kept, and the likesDF columns
# are NULL where no drinker matches (Oscar). Joining on an explicit column
# equality keeps both Drinker columns in the result.
likesDF.join(
    other=frequentsDF,
    on=likesDF.Drinker == frequentsDF.Drinker,
    how='right',
).show()

+-------+--------+-------+------+
|Drinker|    Beer|Drinker|   Bar|
+-------+--------+-------+------+
|  Chris|     Bud|  Chris|Bohene|
|    Kia|Berliner|    Kia|Little|
|   NULL|    NULL|  Oscar| Griff|
+-------+--------+-------+------+



# full join

In [22]:
# Full join on Drinker: rows from both sides are kept; whichever side has no
# match is filled with NULL (Matt has no bar, Oscar has no beer).
likesDF.join(
    other=frequentsDF,
    on=likesDF.Drinker == frequentsDF.Drinker,
    how='full',
).show()

+-------+--------+-------+------+
|Drinker|    Beer|Drinker|   Bar|
+-------+--------+-------+------+
|  Chris|     Bud|  Chris|Bohene|
|    Kia|Berliner|    Kia|Little|
|   Matt|    ARJK|   NULL|  NULL|
|   NULL|    NULL|  Oscar| Griff|
+-------+--------+-------+------+



# left_anti

In [23]:
# Left anti join: keep only the likesDF rows whose Drinker has NO match in
# frequentsDF. Only likesDF's own columns appear in the result.
likesDF.join(
    other=frequentsDF,
    on=likesDF.Drinker == frequentsDF.Drinker,
    how='left_anti',
).show()

+-------+----+
|Drinker|Beer|
+-------+----+
|   Matt|ARJK|
+-------+----+



In [24]:
# Number of rows in likesDF.
likesDF.count()

3

In [25]:
# Four (id, c) rows in which ('b', 1) occurs twice.
df = spark.createDataFrame(
    data=[('a', 1), ('b', 1), ('b', 1), ('a', 2)],
    schema=('id', 'c'),
)

# Show the frame as built, then again with exact duplicate rows collapsed.
df.show()
deduplicated = df.distinct()
deduplicated.show()

+---+---+
| id|  c|
+---+---+
|  a|  1|
|  b|  1|
|  b|  1|
|  a|  2|
+---+---+

+---+---+
| id|  c|
+---+---+
|  a|  1|
|  b|  1|
|  a|  2|
+---+---+



In [26]:
# Four-row (id, c) frame used to show the DataFrame -> RDD conversions below.
df = spark.createDataFrame(
    data=[('a', 1), ('b', 1), ('b', 1), ('a', 2)],
    schema=('id', 'c'),
)
df.show()

# df.rdd exposes the same rows as an RDD of Row objects.
rdd = df.rdd
print(rdd.collect())

# Mapping every Row through list / tuple yields plain Python containers.
for container in (list, tuple):
    print(df.rdd.map(container).collect())

+---+---+
| id|  c|
+---+---+
|  a|  1|
|  b|  1|
|  b|  1|
|  a|  2|
+---+---+

[Row(id='a', c=1), Row(id='b', c=1), Row(id='b', c=1), Row(id='a', c=2)]
[['a', 1], ['b', 1], ['b', 1], ['a', 2]]
[('a', 1), ('b', 1), ('b', 1), ('a', 2)]


# withColumn

In [27]:
from pyspark.sql.functions import lit

# Single-column frame of words; 'b' occurs twice.
df = spark.createDataFrame([['a'], ['b'], ['b'], ['c']], ['word'])
df.show()

# withColumn returns a new frame with an extra COUNT column holding the
# literal 1 on every row; `df` itself is left unchanged.
new_df = df.withColumn("COUNT", lit(1))
new_df.show()

+----+
|word|
+----+
|   a|
|   b|
|   b|
|   c|
+----+

+----+-----+
|word|COUNT|
+----+-----+
|   a|    1|
|   b|    1|
|   b|    1|
|   c|    1|
+----+-----+



# groupBy

In [28]:
from pyspark.sql import functions as func

# Group by word and total the COUNT column within each group.
count_per_word = new_df.groupBy("word").agg(func.sum("COUNT"))
count_per_word.show()


+----+----------+
|word|sum(COUNT)|
+----+----------+
|   a|         1|
|   b|         2|
|   c|         1|
+----+----------+



In [29]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

l = [('Alice', 25), ('Robert', 12), ('Chris', 45)]
# Use the SparkSession (`spark`) rather than the deprecated `sqlContext`
# entry point; other cells in this notebook already build frames with `spark`.
df = spark.createDataFrame(l, ['Name', 'Age'])
df.show()


def _maturity_label(age):
    """Classify an age as "Adult" (18 and over) or "Child" (under 18).

    Returns None for a null age so a missing value propagates as NULL instead
    of raising TypeError on the `None >= 18` comparison inside the UDF.
    """
    if age is None:
        return None
    return "Adult" if age >= 18 else "Child"


# String-typed UDF wrapping the classifier so it can be applied to a column.
maturity_udf = udf(_maturity_label, StringType())

# Derive the Maturity column from Age; `df` itself is left unchanged.
newdf = df.withColumn("Maturity", maturity_udf(df.Age))
newdf.show()

+------+---+
|  Name|Age|
+------+---+
| Alice| 25|
|Robert| 12|
| Chris| 45|
+------+---+

+------+---+--------+
|  Name|Age|Maturity|
+------+---+--------+
| Alice| 25|   Adult|
|Robert| 12|   Child|
| Chris| 45|   Adult|
+------+---+--------+



In [30]:
# Sort by Age from highest to lowest and keep only the first row,
# i.e. the row with the largest Age.
df.orderBy(df["Age"].desc()).limit(1).show()

+-----+---+
| Name|Age|
+-----+---+
|Chris| 45|
+-----+---+



In [31]:
import numpy as np

# Demo data: a 4x2 array of uniform random floats. No seed is set in this
# cell, so the values differ from run to run.
# (A deterministic alternative tried earlier: np.arange(100).reshape(10, -1).)
mat = np.random.rand(4, 2)

# Parallelizing the array makes each of its rows one RDD element.
rdd = sc.parallelize(mat)
print(rdd.collect())

# Fold the rows together with element-wise addition, giving the column sums.
rdd.reduce(lambda left, right: np.add(left, right))



[array([0.66307118, 0.46068421]), array([0.43310656, 0.22898337]), array([0.23904133, 0.22017266]), array([0.59881599, 0.91643094])]


array([1.93403506, 1.82627119])

In [32]:

from pyspark.ml.linalg import Vectors

# Length of every random feature vector.
size = 2

# Four (label, features) records with labels 0, 1, 1, 0; each features entry
# is a dense vector of `size` uniform random floats.
data = [
    (label, Vectors.dense(np.random.rand(size)))
    for label in (0, 1, 1, 0)
]

df = spark.createDataFrame(data, ["label", "features"])
df.show()

# Left disabled: summing the feature vectors through the underlying RDD.
# a = df.rdd.map(lambda x: x[1]).reduce(lambda x,y: x + y )
# print(a)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[0.27848261644784...|
|    1|[0.70812183098069...|
|    1|[0.33874125055466...|
|    0|[0.61040311135081...|
+-----+--------------------+



In [33]:
# Two dense vectors of `size` uniform random floats, rounded to 2 decimals.
a = Vectors.dense(np.random.rand(size).round(2))
b = Vectors.dense(np.random.rand(size).round(2))

for vec in (a, b):
    print(vec)

# np.add accepts the two vectors directly and returns their element-wise sum
# as a NumPy array.
np.add(a, b)

[0.72,0.9]
[0.66,0.15]


array([1.38, 1.05])

In [34]:
# The argument is the output *shape* (0 rows x 1 column), not a value range,
# so this returns an empty float64 array rather than a number between 0 and 1.
np.random.sample(size=(0, 1))

array([], shape=(0, 1), dtype=float64)