In [1]:
from pyspark import SparkContext
from pyspark.sql import Row
import pyspark

## Loading Spark Context

In [2]:
# Start a Spark context for this notebook, running locally.
sc = SparkContext(master="local", appName="App Name")

In [3]:
# Display the context object to confirm it was created.
sc

## Create RDD from Iris data

In [4]:
# Load the Iris data file (given as a relative path) as an RDD of text lines.
rdd = sc.textFile('iris.data.txt')

In [5]:
# Peek at the first record: each element is still one raw comma-separated string.
rdd.first()

'5.1,3.5,1.4,0.2,Iris-setosa'

In [6]:
# Split every CSV line into its list of string fields.
# Blank lines (data files frequently end with one) are dropped first:
# ''.split(',') yields [''], which would later make float() raise ValueError
# as soon as an action scans the whole dataset.
lines = rdd.filter(lambda line: line.strip()).map(lambda line: line.split(','))

In [7]:
# After the split, each record is a list of string fields (nothing is numeric yet).
lines.first()

['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']

In [8]:
# Convert each list of string fields into a typed Row.
# The keyword arguments are listed in the same (alphabetical) order as the fields
# of the schema that is later passed to createDataFrame. Spark < 3.0 sorts Row
# keyword fields alphabetically, so the order written here does not matter there;
# Spark >= 3.0 keeps the order as written and matches values to the schema by
# position, so any other order would silently swap the sepal and petal columns.
parsedLines = lines.map(lambda f: Row(petalLength=float(f[2]), petalWidth=float(f[3]),
                                      sepalLength=float(f[0]), sepalWidth=float(f[1]),
                                      species=f[4]))

In [9]:
# Inspect the first parsed record: a Row with float measurements and a string species.
parsedLines.first()

Row(petalLength=1.4, petalWidth=0.2, sepalLength=5.1, sepalWidth=3.5, species='Iris-setosa')

In [10]:
# Row fields can be looked up by name with [] indexing.
parsedLines.first()['species']

'Iris-setosa'

In [11]:
# Drop the intermediate names from the notebook namespace; only parsedLines
# is used from here on.
del rdd, lines

## Create an Unstructured RDD

In [12]:
# An RDD imposes no common structure: the two Rows below have different fields.
sc.parallelize([
    Row(test=1),
    Row(test1='a', test2=11),
]).collect()

[Row(test=1), Row(test1='a', test2=11)]

## Turn RDD into DataFrame

In [13]:
from pyspark.sql.types import StructField, StructType, FloatType, StringType

In [14]:
# Explicit DataFrame schema: four 32-bit float measurement columns followed by
# the species label as a string. Fields are nullable (the StructField default).
schema = StructType(
    [StructField(column, FloatType())
     for column in ('petalLength', 'petalWidth', 'sepalLength', 'sepalWidth')]
    + [StructField('species', StringType())]
)

In [15]:
# SQL entry point bound to the running SparkContext; used to build DataFrames.
# NOTE(review): newer Spark releases favour SparkSession over SQLContext - confirm
# against the Spark version in use.
sqlContext = pyspark.sql.SQLContext(sc)

In [16]:
# Build a DataFrame from the RDD of Rows, applying the explicit schema
# instead of letting Spark infer one.
df = sqlContext.createDataFrame(parsedLines, schema)

In [17]:
# First row of the DataFrame. The long decimals in the output (e.g. 1.399999976158142
# rather than 1.4) appear because the schema stores the values as FloatType
# (single precision).
df.first()

Row(petalLength=1.399999976158142, petalWidth=0.20000000298023224, sepalLength=5.099999904632568, sepalWidth=3.5, species='Iris-setosa')

In [18]:
# First four rows, returned to the driver as a list of Row objects.
df.head(4)

[Row(petalLength=1.399999976158142, petalWidth=0.20000000298023224, sepalLength=5.099999904632568, sepalWidth=3.5, species='Iris-setosa'),
 Row(petalLength=1.399999976158142, petalWidth=0.20000000298023224, sepalLength=4.900000095367432, sepalWidth=3.0, species='Iris-setosa'),
 Row(petalLength=1.2999999523162842, petalWidth=0.20000000298023224, sepalLength=4.699999809265137, sepalWidth=3.200000047683716, species='Iris-setosa'),
 Row(petalLength=1.5, petalWidth=0.20000000298023224, sepalLength=4.599999904632568, sepalWidth=3.0999999046325684, species='Iris-setosa')]

In [19]:
# Confirm the DataFrame carries the schema defined above; every field is nullable.
df.schema

StructType(List(StructField(petalLength,FloatType,true),StructField(petalWidth,FloatType,true),StructField(sepalLength,FloatType,true),StructField(sepalWidth,FloatType,true),StructField(species,StringType,true)))

## Read a MongoDB collection into a DataFrame

In [32]:
from pyspark.sql import SparkSession

# Obtain a SparkSession; getOrCreate() returns an already-active session if one
# exists, otherwise it builds one with the master and app name given here.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .getOrCreate()
)


In [41]:
# Read the packt.testCollection collection from the local MongoDB server.
# The recorded run of this cell failed with ClassNotFoundException: Spark could
# not find the com.mongodb.spark.sql.DefaultSource data source class.
# NOTE(review): presumably the MongoDB Spark connector jar is not on the classpath;
# it has to be supplied before the JVM starts (e.g. --packages or
# spark.jars.packages) - confirm the artifact version for this Spark/Scala build.
df = (
    spark.read
    .format('com.mongodb.spark.sql.DefaultSource')
    .option('uri', 'mongodb://127.0.0.1/packt.testCollection')
    .load()
)

Py4JJavaError: An error occurred while calling o368.load.
: java.lang.ClassNotFoundException: Failed to find data source: com.mongodb.spark.sql.DefaultSource. Please find packages at http://spark.apache.org/third-party-projects.html
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:635)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:190)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:164)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.mongodb.spark.sql.DefaultSource.DefaultSource
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$23$$anonfun$apply$15.apply(DataSource.scala:618)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$23$$anonfun$apply$15.apply(DataSource.scala:618)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$23.apply(DataSource.scala:618)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$23.apply(DataSource.scala:618)
	at scala.util.Try.orElse(Try.scala:84)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:618)
	... 13 more
