## Check the Spark session 
to ensure it is running and see how many nodes are running in the cluster.

In [1]:
# The session is started outside the notebook with the command
# 'pyspark --master local[N]', where N is the number of local worker threads.
# Evaluating `spark` displays the session that launch created.
spark  # the underlying context is also reachable through the "sc" object

## Run the "Hello World" code

In [2]:
# Smoke test: push a trivial one-row query through the session and print the result.
hello_query = '''SELECT 'spark' as hello '''
df = spark.sql(hello_query)
df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



## Basic file loading and usage

In [3]:
# Load the Spark README as a DataFrame of text lines (one `value` column),
# then count how many rows were read.
readme_path = "C:\\Spark\\README.md"
textFile = spark.read.text(readme_path)
textFile.count()

105

In [4]:
# Peek at the first row; the line's text is exposed through the `value` field.
textFile.first()

Row(value='# Apache Spark')

In [5]:
# Build the predicate first, then keep only the lines that mention "Spark" and count them.
mentions_spark = textFile.value.contains("Spark")
lineswithSpark = textFile.filter(mentions_spark)
lineswithSpark.count()

20

## Introductory code based on Pluralsight course examples

## Basic Operations in the SQL Context wrapper
Some manipulations are handled differently between RDDs and DataFrames.

In [6]:
# Wrap the existing SparkContext (`sc`) in a SQLContext; the following cells use
# this wrapper to create DataFrames.
# NOTE(review): SQLContext is the legacy entry point and is deprecated in
# Spark 3.x in favour of the SparkSession (`spark`) — confirm against the Spark
# version in use before modernising.
sqlContext = SQLContext(sc)
# range(5) produces a single-column DataFrame named `id` holding 0 through 4.
dfrange = sqlContext.range(5)
dfrange.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [7]:
# Build a DataFrame directly from a list of plain tuples; the column names are
# supplied separately because the tuples themselves carry no field names.
simple_data2 = [
    (1, "Alice", 50),
    (2, "Bob", 80),
    (3, "Charlee", 75),
]
column_labels = ["ID", "Name", "Score"]
simple_df = sqlContext.createDataFrame(simple_data2, column_labels)
simple_df.show()

+---+-------+-----+
| ID|   Name|Score|
+---+-------+-----+
|  1|  Alice|   50|
|  2|    Bob|   80|
|  3|Charlee|   75|
+---+-------+-----+



In [8]:
from pyspark.sql.types import Row

# A Row built only from field names is later called with values to produce
# named rows (see the collected output below).
col_names = Row('id', 'name', 'score')

# Start from an RDD of unnamed rows ...
data1 = sc.parallelize([
    Row(1, "Alice", 50),
    Row(2, "Bob", 80),
    Row(3, "Charlee", 75),
])

# ... and attach the field names to every record by unpacking it into `col_names`.
students = data1.map(lambda record: col_names(*record))
students.collect()

[Row(id=1, name='Alice', score=50),
 Row(id=2, name='Bob', score=80),
 Row(id=3, name='Charlee', score=75)]

In [9]:
# Turn the RDD of named rows into a DataFrame. No column list is passed here;
# the printed header shows the Row field names (id, name, score) as the columns.
students_df = sqlContext.createDataFrame(students)
students_df.show()

+---+-------+-----+
| id|   name|score|
+---+-------+-----+
|  1|  Alice|   50|
|  2|    Bob|   80|
|  3|Charlee|   75|
+---+-------+-----+



In [10]:
# Bring every row back to the driver as a list, then read the third field
# (score) of the second row.
collected_rows = students_df.collect()
collected_rows[1][2]

80

In [11]:
# Project just the `name` and `score` columns and print them.
students_df.select('name', 'score').show()

+-------+-----+
|   name|score|
+-------+-----+
|  Alice|   50|
|    Bob|   80|
|Charlee|   75|
+-------+-----+



In [12]:
# Derived column expression: each score with 5.5 added.
bumped_score = students_df.score + 5.5
# Keep only `score`, then append the derived values as 'final score'.
students_df.select('score').withColumn('final score', bumped_score).show()

+-----+-----------+
|score|final score|
+-----+-----------+
|   50|       55.5|
|   80|       85.5|
|   75|       80.5|
+-----+-----------+



In [13]:
# Show the DataFrame with `score` renamed to 'old score'; the other columns are
# carried over unchanged.
students_df.withColumnRenamed("score", "old score").show()

+---+-------+---------+
| id|   name|old score|
+---+-------+---------+
|  1|  Alice|       50|
|  2|    Bob|       80|
|  3|Charlee|       75|
+---+-------+---------+



In [14]:
# Alternative rename: alias the column expression and select it, which yields
# only that one column under the new name.
renamed_score = students_df.score.alias('old score')
students_df.select(renamed_score).show()

+---------+
|old score|
+---------+
|       50|
|       80|
|       75|
+---------+



In [15]:
# NOTE(review): the `pandas` name itself is not referenced in this cell.
import pandas
# Convert the Spark DataFrame into a pandas DataFrame and display it.
students_pandas = students_df.toPandas()
students_pandas

Unnamed: 0,id,name,score
0,1,Alice,50
1,2,Bob,80
2,3,Charlee,75


In [16]:
# Round-trip: build a Spark DataFrame back from the pandas DataFrame.
# The DataFrame is assigned first and shown in a separate statement because
# `.show()` only prints the table and returns None; chaining it onto the
# assignment would bind `df_spark` to None instead of the DataFrame.
df_spark = sqlContext.createDataFrame(students_pandas)
df_spark.show()

+---+-------+-----+
| id|   name|score|
+---+-------+-----+
|  1|  Alice|   50|
|  2|    Bob|   80|
|  3|Charlee|   75|
+---+-------+-----+



In [17]:
# The original DataFrame is never changed: every transformation above returned
# a new DataFrame, so `students_df` still has its original columns and values.
students_df.show()

+---+-------+-----+
| id|   name|score|
+---+-------+-----+
|  1|  Alice|   50|
|  2|    Bob|   80|
|  3|Charlee|   75|
+---+-------+-----+



## Loading a CSV File

In [21]:
# Location of the London crime CSV.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
filePath = "C:\\Users\\joshs\\source\\PycharmProjects\\DataScience\\Courses&InputData\\spark-2-getting-started\\02\\demos\\datasets\\london_crime_by_lsoa.csv"

# Treat the first line as the header. No schema is supplied, and the sample rows
# below show every column coming back as a string (e.g. value='0', year='2016').
csv_reader = spark.read.format('csv').option('header', 'true')
data = csv_reader.load(filePath)

# Peek at the first ten rows.
data.take(10)

[Row(lsoa_code='E01001116', borough='Croydon', major_category='Burglary', minor_category='Burglary in Other Buildings', value='0', year='2016', month='11'),
 Row(lsoa_code='E01001646', borough='Greenwich', major_category='Violence Against the Person', minor_category='Other violence', value='0', year='2016', month='11'),
 Row(lsoa_code='E01000677', borough='Bromley', major_category='Violence Against the Person', minor_category='Other violence', value='0', year='2015', month='5'),
 Row(lsoa_code='E01003774', borough='Redbridge', major_category='Burglary', minor_category='Burglary in Other Buildings', value='0', year='2016', month='3'),
 Row(lsoa_code='E01004563', borough='Wandsworth', major_category='Robbery', minor_category='Personal Property', value='0', year='2008', month='6'),
 Row(lsoa_code='E01001320', borough='Ealing', major_category='Theft and Handling', minor_category='Other Theft', value='0', year='2012', month='5'),
 Row(lsoa_code='E01001342', borough='Ealing', major_category=

In [22]:
# Total number of rows loaded from the CSV.
data.count()

13490604

In [None]:
# data.collect()  # pulls every row back to the driver and displays it; don't do this with large datasets