In [6]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
spark = SparkSession.builder\
                    .master("local")\
                    .appName('abc')\
                    .getOrCreate()

In [2]:
# do something to prove it works
rdd = spark.parallelize(range(1000))
rdd.takeSample(False, 5)

AttributeError: 'SparkSession' object has no attribute 'parallelize'

## Read a file schema

In [8]:
df = spark.read.format("json")\
        .load("flight-data/2015-summary.json")

In [10]:
spark.read.format("json")\
        .load("flight-data/2015-summary.json").schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

In [13]:
from pyspark.sql.types import StructField, StructType, StringType, LongType
myManualSchema = StructType([
                            StructField("DEST_COUNTRY_NAME", StringType(), True),
                            StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
                            StructField("count", LongType(), False)
                            ])
df = spark.read.format("json")\
                .schema(myManualSchema)\
                .load("flight-data/2015-summary.json")

## Invoking columns

In [19]:
from pyspark.sql.functions import expr, col

In [17]:
df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

## Invoking Rows

In [20]:
df.first()

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)

## Creating DataFrames

In [22]:
df.createGlobalTempView("dfTable")

In [30]:
df.show(10)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
|    United States|          Singapore|    1|
|    United States|            Grenada|   62|
|       Costa Rica|      United States|  588|
|          Senegal|      United States|   40|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 10 rows



In [31]:
df.select("DEST_COUNTRY_NAME").show(10)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
|    United States|
|            Egypt|
|    United States|
|    United States|
|    United States|
|       Costa Rica|
|          Senegal|
|          Moldova|
+-----------------+
only showing top 10 rows



In [32]:
df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [35]:
df.selectExpr( "DEST_COUNTRY_NAME as newColumnName","DEST_COUNTRY_NAME")\
            .show(2)

+-------------+-----------------+
|newColumnName|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows



In [36]:
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(10)

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



### Spark literals

In [40]:
from pyspark.sql.functions import lit


df.select(
        expr("*"),
        lit(1).alias("something")
        ).show(2)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|something|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows



## Adding Columns

In [41]:
df.withColumn(
            "withinCountry",
            expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME")
            ).show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [43]:
#renaming columns

df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns

['dest', 'ORIGIN_COUNTRY_NAME', 'count']

In [44]:
# remove columns
df.drop("ORIGIN_COUNTRY_NAME").columns

['DEST_COUNTRY_NAME', 'count']

In [47]:
# multiple columns
df.drop("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME")

DataFrame[count: bigint]

## Filtering Rows

In [49]:
colCondition = df.filter(col("count") < 2).take(2)

In [54]:
conditional = df.where("count < 2").where(col("ORIGIN_COUNTRY_NAME") != "Croatia").take(200)

In [55]:
conditional

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='Malta', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Gibraltar', count=1),
 Row(DEST_COUNTRY_NAME='Saint Vincent and the Grenadines', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='Suriname', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Cyprus', count=1),
 Row(DEST_COUNTRY_NAME='Burkina Faso', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='Djibouti', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Estonia', count=1),
 Row(DEST_COUNTRY_NAME='Zambia', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='Cyprus', ORIGIN_COUNTRY_NAME='United States', count=1

## Getting unique rows

In [56]:
df.select("ORIGIN_COUNTRY_NAME").distinct().count()

125

In [58]:
seed = 5
withReplacement = False
fraction = 0.5
df.sample(withReplacement, fraction, seed).show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|              Guyana|      United States|   64|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeria|      United States|    4|
|Turks and Caicos ...|      United States|  230|
|Saint Vincent and...|      United States|    1|
|       United States|             Russia|  161|
|            Pakistan|      United States|   12|
|    Marshall Islands|      United States|   42|
|            Honduras|      United States|  362|
|       United States|            Senegal|   42|
|           Hong Kong|      United States|  332|
| Trinidad and Tobag

## Random Splits

In [59]:
dataFrames = df.randomSplit([0.25, 0.75], seed)
dataFrames[0].count() > dataFrames[1].count()

False

In [61]:
dataFrames[0].sample(False,0.05).show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|British Virgin Is...|      United States|  107|
|       United States|             Angola|   13|
|       United States|         Azerbaijan|   21|
+--------------------+-------------------+-----+



## Concatenating and Appending DataFrames

In [67]:
from pyspark.sql import Row

schema = df.schema
newRows = [Row("New Country", "Other Country", 5),Row("New Country 2", "Other Country 3", 1)]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

In [68]:
newDF.show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|      New Country|      Other Country|    5|
|    New Country 2|    Other Country 3|    1|
+-----------------+-------------------+-----+



In [72]:
df.union(newDF).filter(col("count")==1).filter(col("DEST_COUNTRY_NAME")!="United States").show()
#     .where("count = 1")\
#     .where(col("ORIGIN_COUNTRY_NAME") != "United States")\
#     .show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|             Moldova|      United States|    1|
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|            Suriname|      United States|    1|
|        Burkina Faso|      United States|    1|
|            Djibouti|      United States|    1|
|              Zambia|      United States|    1|
|              Cyprus|      United States|    1|
|       Cote d'Ivoire|      United States|    1|
|              Kosovo|      United States|    1|
|                Iraq|      United States|    1|
|           Indonesia|      United States|    1|
|       New Caledonia|      United States|    1|
|       New Country 2|    Other Country 3|    1|
+--------------------+-------------------+-----+



## Sorting

In [73]:
df.sort("count").show(5)
df.orderBy("count", "DEST_COUNTRY_NAME").show(5)
df.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--

In [76]:
df.sortWithinPartitions("count").show(100)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Croatia|    1|
|       United States|          Singapore|    1|
|             Moldova|      United States|    1|
|               Malta|      United States|    1|
|       United States|          Gibraltar|    1|
|Saint Vincent and...|      United States|    1|
|            Suriname|      United States|    1|
|       United States|             Cyprus|    1|
|        Burkina Faso|      United States|    1|
|            Djibouti|      United States|    1|
|       United States|            Estonia|    1|
|              Zambia|      United States|    1|
|              Cyprus|      United States|    1|
|       United States|          Lithuania|    1|
|       United States|           Bulgaria|    1|
|       United States|            Georgia|    1|
|       United States|            Bahrain|    1|
|       Cote d'Ivoir

In [78]:
## working with only 5 rows

df.limit(5).show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+

