In [1]:
import pyspark

# Obtain the notebook-wide SparkSession: getOrCreate() hands back the session
# that is already active, or builds a new one if none exists yet.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

In [2]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global generator so the random group labels below are
# the same on every run.
np.random.seed(456)

# 20 rows: a running counter `n` (0..19) and a `group` label drawn at random
# from the letters a, b and c.
pandas_dataframe = pd.DataFrame(
    {
        "n": np.arange(20),
        "group": np.random.choice(list("abc"), 20),
    }
)
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [3]:
# Convert the local pandas frame into a Spark DataFrame; the bare `df` on the
# last line displays its column names and types rather than its rows.
df = spark.createDataFrame(data=pandas_dataframe)
df

DataFrame[n: bigint, group: string]

In [4]:
# Print only the first five rows as a text table.
df.show(n=5)

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
+---+-----+
only showing top 5 rows



In [5]:
# describe() returns another DataFrame (a `summary` label column plus one
# string column per input column), so this bare expression displays only that
# frame's schema, not the statistics themselves.
df.describe()

DataFrame[summary: string, n: string, group: string]

In [6]:
# Calling show() on the describe() result prints the actual summary rows
# (count, mean, stddev, min, max) for every column.
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [7]:
# Arithmetic on a column yields a new Column expression, not data; it is
# evaluated only once it is selected from the DataFrame and shown.
dfp1 = df['n'] + 1
df.select(dfp1).show(n=5)

+-------+
|(n + 1)|
+-------+
|      1|
|      2|
|      3|
|      4|
|      5|
+-------+
only showing top 5 rows



In [8]:
# Keep only the statistic label and the numeric column from the summary table.
df.describe().select('summary', 'n').show()

+-------+-----------------+
|summary|                n|
+-------+-----------------+
|  count|               20|
|   mean|              9.5|
| stddev|5.916079783099616|
|    min|                0|
|    max|               19|
+-------+-----------------+



In [10]:
from pydataset import data

# Fetch pydataset's "mpg" example dataset and turn it into a Spark DataFrame,
# then preview the first 15 rows.
mpg = spark.createDataFrame(data("mpg"))
mpg.show(n=15)

+------------+----------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|     model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+----------+-----+----+---+----------+---+---+---+---+-------+
|        audi|        a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|        a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|        a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|        a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|        a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|        a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|        a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|compact|
|        audi|a4 quattro|  2.0|2008|  4|manual(m6)|  4| 20| 28|  p|compact|
|        aud

In [17]:
# Project three columns, exposing `manufacturer` under the shorter name
# `manuf` in the result.
mpg.select(
    'model',
    mpg['manufacturer'].alias('manuf'),
    'hwy',
).show()

+------------------+---------+---+
|             model|    manuf|hwy|
+------------------+---------+---+
|                a4|     audi| 29|
|                a4|     audi| 29|
|                a4|     audi| 31|
|                a4|     audi| 30|
|                a4|     audi| 26|
|                a4|     audi| 26|
|                a4|     audi| 27|
|        a4 quattro|     audi| 26|
|        a4 quattro|     audi| 25|
|        a4 quattro|     audi| 28|
|        a4 quattro|     audi| 27|
|        a4 quattro|     audi| 25|
|        a4 quattro|     audi| 25|
|        a4 quattro|     audi| 25|
|        a4 quattro|     audi| 25|
|        a6 quattro|     audi| 24|
|        a6 quattro|     audi| 25|
|        a6 quattro|     audi| 23|
|c1500 suburban 2wd|chevrolet| 20|
|c1500 suburban 2wd|chevrolet| 15|
+------------------+---------+---+
only showing top 20 rows



In [35]:
# Column expression for the mean of city and highway mileage, labelled
# `avg_mpg`. A Column is only an expression and cannot be shown on its own;
# it has to be selected from `mpg` before any values can be displayed.
avg_mpg = ((mpg['cty'] + mpg['hwy']) / 2).alias('avg_mpg')


In [36]:
# '*' keeps every existing column of `mpg`; the `avg_mpg` expression is
# appended as one extra column at the end of the result.
mpg.select('*', avg_mpg).show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|avg_mpg|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|   23.5|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|   25.0|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|   25.5|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|   25.5|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|   21.0|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|   22.0|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|   22.5|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compa