In [None]:
from datetime import datetime, timedelta

# pandas and plotting libraries for visualizations
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# module containing functions for manipulation pyspark dataframes
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql.window import Window
from pyspark.sql import DataFrame

# class which will let us create spark objects
from pyspark.sql import SparkSession

# helper functions for the class
from helpers import display, read_df, write_df, check_split

## Create a Spark Session

In [None]:
spark = (
    SparkSession
    .builder
    .appName('debugging')
    .master('local[2]')
    .getOrCreate()
)

## Read in data

In [None]:
df = read_df(spark, '../taxi_2016')

## Memory Fun

In [None]:
df.toPandas() 

In [None]:
help(display)

## Stacktraces

In [None]:
display(df.groupby('company').count().orderby('count'))

----

In [None]:
df.select('column that does not exist')

----

In [None]:
census_block_window = Window().partitionBy('dropoff_census_tract')
next_step = (
    df
    .groupby('taxi_id')
    .agg(
        (
            f.sum(f.col('payment_type') == 'Credit Card')
            / f.sum(f.col('payment_type') == 'Cash')
        ).alias('payment_type_ratio')
    )
)

----

In [None]:
display(df.agg(max('trip_miles')))

## Unexpected Results

In [None]:
cash = (
    df
    .where(f.col('payment_type') == 'Cash')
    .select('taxi_id', f.lit('cash').alias('type'), f.lit(3).alias('tips'))
)

In [None]:
credit = (
    df
    .where(f.col('payment_type') == 'Credit Card')
    .select('taxi_id', 'tips', f.lit('credit').alias('type'))
)

In [None]:
filled_in_data = cash.union(credit)

In [None]:
display(filled_in_data.agg(f.min('tips')))

In [None]:
display(filled_in_data.where('type=="cash"').agg(f.min('tips')))

In [None]:
display(filled_in_data.where('type=="credit"').agg(f.min('tips')))

----

In [None]:
unif = f.pandas_udf(lambda x: pd.Series(np.random.uniform(size=x.shape)), returnType=t.FloatType())

In [None]:
df_with_rnd = df.withColumn('rnd', unif('trip_id'))

In [None]:
display(df_with_rnd)

In [None]:
train = df_with_rnd.where('rnd <= 0.8')
test = df_with_rnd.where('rnd > 0.8')

In [None]:
check_split(train, test, df) # checks that the count of train + count of test equals the count of df

In [None]:
train.explain()

In [None]:
train, test = df.randomSplit([0.3, 0.7])

In [None]:
check_split(train, test, df)

In [None]:
train.explain()

In [None]:
test.explain()

In [None]:
spark.stop()