In [1]:
! pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /opt/conda/lib/python3.6/site-packages
[33mYou are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
! pip install lightning-python

[33mYou are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import matplotlib.pyplot as plt 
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.functions import when
from pyspark import SparkContext as sc
from pyspark.sql.functions import col, split, ltrim, substring
import pyspark.sql as SQL
from pyspark.sql.functions import *
import datetime
import calendar
import pandas as pd
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.feature import *
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [4]:
conf = SparkConf().setAppName("Jan-01").setMaster("local[*]")
sc = SparkContext(conf=conf)

In [5]:
spark = SparkSession.builder.appName('Jan-01').getOrCreate()

In [49]:
# Download and decompress data into your Jupyter environment; abreviated jan 2017 data
jan_2017 = spark.read.format("csv").load('yellow_tripdata_half.csv', header = True)
#jan_2017.count()

In [50]:
#need to get two dataframes to merge on, or else get cartesian product error
taxi_zone = spark.read.format("csv").load('taxi+_zone_lookup.csv', header = True)

In [51]:
#merging to get destination information
jan_2017 = jan_2017.join(taxi_zone, jan_2017.PULocationID == taxi_zone.LocationID, "left_outer"). \
                withColumnRenamed("Borough", "PUBorough").withColumnRenamed("Zone", "PUZone").withColumnRenamed("service_zone", "PUServiceZone").\
                withColumnRenamed("neighborhood", "PUneighbor").cache()
    

In [52]:
#make unique ID
jan_2017 = jan_2017.withColumn("uniqueIdColumn", monotonically_increasing_id())

In [53]:
jan_2017 = jan_2017.drop("LocationID")

In [54]:
#encoding if pickup is an aiport
jan_2017 = jan_2017.withColumn("AirportPU", \
                               F.when((jan_2017["PULocationID"] == '138' ) | \
                                      (jan_2017["PULocationID"] == '132') |\
                                      (jan_2017["PULocationID"] == '1'),1).otherwise(0))

In [55]:
#cleaning data
jan_2017 = jan_2017.where((jan_2017['PUBorough'] != 'Unknown'))

In [56]:
#splitting up time and date
split_pickup_col = split(jan_2017['tpep_pickup_datetime'], ' ')
jan_2017 = jan_2017.withColumn("PUDate", split_pickup_col.getItem(0).cast(DateType()))
jan_2017 = jan_2017.withColumn("PUTime", split_pickup_col.getItem(1))

In [57]:
#splitting time into hour and minute; will round minute to nearest 5 minutes
split_PUTime = split(jan_2017['PUTime'], ':')
jan_2017 = jan_2017.withColumn("PUHour", split_PUTime.getItem(0).cast(IntegerType()))
jan_2017 = jan_2017.withColumn("PUMinute", split_PUTime.getItem(1).cast(IntegerType()))

In [58]:
#rush hour
jan_2017 = jan_2017.withColumn("MorningRushHour", \
                               F.when((jan_2017["PUHour"] >= 6 ) & \
                                      (jan_2017["PUHour"] < 9),1).otherwise(0))

In [59]:
jan_2017 = jan_2017.withColumn("EveningRushHour", \
                               F.when((jan_2017["PUHour"] >= 17 ) & \
                                      (jan_2017["PUHour"] < 21),1).otherwise(0))

In [60]:
jan_2017 = jan_2017.withColumn("PUDay", dayofyear(jan_2017.PUDate))

In [61]:
#rounding down mintue to closest 5 minute mark (computationally easier)
#jan_2017 = jan_2017.withColumn("DOMinute", (jan_2017.DOMinute - jan_2017.DOMinute%5))
jan_2017 = jan_2017.withColumn("PUMinute", (jan_2017.PUMinute - jan_2017.PUMinute%5))

In [62]:
#DOW gives you 1 (Monday) - 7 (Sunday)
jan_2017 = jan_2017.withColumn("PU_DOW",  date_format(jan_2017.PUDate, 'u').cast(ShortType()))
#jan_2017 = jan_2017.withColumn("DO_DOW",  date_format(jan_2017.DODate, 'u').cast(ShortType()))

In [63]:
#encoding if destination is a weekend
jan_2017 = jan_2017.withColumn("Weekend", \
                               F.when((jan_2017["PU_DOW"] == 7) | \
                                      (jan_2017["PU_DOW"] == 6),1).otherwise(0))

In [64]:
jan_2017 = jan_2017.withColumn("WorkingHour", \
                               F.when((((jan_2017["PUHour"] >= 9 ) & (jan_2017["PUHour"] < 17))\
                                       & (jan_2017["Weekend"] == 0)) ,1).otherwise(0))

In [65]:
#casting data types to primitives

#1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.
jan_2017 = jan_2017.withColumn("VendorID", jan_2017["VendorID"].cast(ShortType()))

jan_2017 = jan_2017.withColumn("passenger_count", jan_2017["passenger_count"].cast(ShortType()))

#in miles
jan_2017 = jan_2017.withColumn("trip_distance", jan_2017["trip_distance"].cast(FloatType()))

#1= Credit card
#2= Cash
#3= No charge
#4= Dispute
#5= Unknown
#6= Voided trip
jan_2017 = jan_2017.withColumn("payment_type", jan_2017["payment_type"].cast(ShortType()))
jan_2017 = jan_2017.withColumn("fare_amount", jan_2017["fare_amount"].cast(FloatType()))

#0.50 and $1 rush hour and overnight charges.
jan_2017 = jan_2017.withColumn("extra", jan_2017["extra"].cast(FloatType()))
#.50, automatic MTA charge
jan_2017 = jan_2017.withColumn("mta_tax", jan_2017["mta_tax"].cast(FloatType()))


jan_2017 = jan_2017.withColumn("tip_amount", jan_2017["tip_amount"].cast(FloatType()))
jan_2017 = jan_2017.withColumn("tolls_amount", jan_2017["tolls_amount"].cast(FloatType()))
jan_2017 = jan_2017.withColumn("improvement_surcharge", jan_2017["improvement_surcharge"].cast(FloatType()))
jan_2017 = jan_2017.withColumn("total_amount", jan_2017["total_amount"].cast(FloatType()))


jan_2017 = jan_2017.withColumn("RateCodeID", jan_2017["RateCodeID"].cast(ShortType()))
#1= Standard rate
#2=JFK -> $52 flat fare
#3=Newark
#4=Nassau or Westchester
#5=Negotiated fare
#6=Group ride

In [66]:
#basic fare cleaning, ensure that all values are above zero
jan_2017 = jan_2017.filter(jan_2017.tip_amount >= 0)

In [67]:
#basic fare cleaning, ensure that all values are above zero
jan_2017 = jan_2017.filter(jan_2017.tolls_amount >= 0.0) 

In [68]:
#basic fare cleaning, ensure that all values are above zero
jan_2017 = jan_2017.filter(jan_2017.total_amount >= 3.30)

In [69]:
#basic fare cleaning, ensure that all values are above zero
jan_2017 = jan_2017.filter(jan_2017.extra >= 0.00)

In [70]:
#minimum fare amounts according to NYC Taxi data standards
jan_2017 = jan_2017.filter((jan_2017.fare_amount >= 2.50))

In [71]:
#minimum fare amounts according to NYC Taxi data standards
jan_2017 = jan_2017.filter(jan_2017.improvement_surcharge >= 0.3)

In [72]:
#minimum fare amounts according to NYC Taxi data standards
jan_2017 = jan_2017.filter(jan_2017.mta_tax >= 0.5)

In [73]:
jan_2017 = jan_2017.withColumn("PLocationID", jan_2017.PULocationID.cast(IntegerType())).drop("PULocationID")
jan_2017 = jan_2017.withColumn("DLocationID", jan_2017.DOLocationID.cast(IntegerType())).drop("DOLocationID")
#sampe.printSchema()


In [74]:
# jan_2017.printSchema()
jan_2017 = jan_2017.drop('tpep_pickup_datetime')
jan_2017 = jan_2017.drop('tpep_dropoff_datetime')

In [75]:
jan_2017 = jan_2017.drop('payment_type')
jan_2017 = jan_2017.drop('fare_amount')
jan_2017 = jan_2017.drop('extra')
jan_2017 = jan_2017.drop('mta_tax')
jan_2017 = jan_2017.drop('tip_amount')
jan_2017 = jan_2017.drop('tolls_amount')
jan_2017 = jan_2017.drop('improvement_surcharge')
jan_2017 = jan_2017.drop('total_amount')

In [76]:
jan_2017 = jan_2017.drop('trip_distance')
jan_2017 = jan_2017.drop('store_and_fwd_flag')

In [77]:
#load weather data for merging
weather_data = spark.read.load('weather.txt', format="text")

In [78]:
weather_data.createOrReplaceTempView('weather_data_sdf')

In [79]:
weather_data = spark.sql('SELECT CAST(split(value, ",")[0] as string) AS date, '\
                        'CAST(split(value, ",")[1] as string) as time, '\
                        'CAST(split(value, ",")[2] as float) as temp, '\
                        'CAST(split(value, ",")[3] as float) as windchill, '\
                        'CAST(split(value, ",")[4] as float) as dewpoint, '\
                        'CAST(split(value, ",")[5] as float) as humidity, '\
                        'CAST(split(value, ",")[6] as float) as pressure, '\
                        'CAST(split(value, ",")[7] as float) as visibility, '\
                        'CAST(split(value, ",")[8] as string) as windDir, '\
                        'CAST(split(value, ",")[9] as float) as windSpeed, '\
                        'CAST(split(value, ",")[10] as float) as gustSpeed, '\
                        'CAST(split(value, ",")[11] as float) as Precip, '\
                        'CAST(split(value, ",")[12] as string) as Events, '\
                        'CAST(split(value, ",")[13] as string) as Conditions '\
                         'FROM weather_data_sdf')

In [80]:
#cast date to date type
weather_data = weather_data.withColumn("date", weather_data.date.cast(DateType()))

In [81]:
def period(x):
    return split(split(x, ':')[1], " ")[1]

In [82]:
def toHour(x):
    first_split = split(x, ':')
    retval = first_split[0].cast(IntegerType()) % 12
    return retval 

In [83]:
#get am or pm
weather_data = weather_data.withColumn("period", period("time"))

In [84]:
#make hour military time
weather_data = weather_data.withColumn("hour", when(weather_data.period == 'PM', toHour("time") + 12).otherwise(toHour("time")))

In [85]:
# #fill any nulls
weather_data = weather_data.na.fill(0)

In [43]:
#make temporary views for joining
# weather_data.createOrReplaceTempView('weather_data_sdf')

# weather_data_pu = spark.sql('SELECT date AS PUTempdate, '\
#                             'time as PUTemptime, ' \
#                             'temp as PUtemp, '\
#                             'windchill as PUwindchill, '\
#                             'dewpoint as PUdewpoint, '\
#                             'pressure as PUpressure, '\
#                             'visibility as PUvisibility, '\
#                             'windDir as PUwindDir, '\
#                             'gustSpeed as PUgustSpeed, '\
#                             'Precip as PUPrecip, '\
#                             'Events as PUEvents, '\
#                             'Conditions as PUConditions, '\
#                             'period as PUperiod, '\
#                             'hour as PUTemphour '\
#                             'FROM weather_data_sdf')


In [86]:
weather_data.printSchema()
weather_data_pu.printSchema()
jan_2017.printSchema()

root
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- temp: float (nullable = false)
 |-- windchill: float (nullable = false)
 |-- dewpoint: float (nullable = false)
 |-- humidity: float (nullable = false)
 |-- pressure: float (nullable = false)
 |-- visibility: float (nullable = false)
 |-- windDir: string (nullable = true)
 |-- windSpeed: float (nullable = false)
 |-- gustSpeed: float (nullable = false)
 |-- Precip: float (nullable = false)
 |-- Events: string (nullable = true)
 |-- Conditions: string (nullable = true)
 |-- period: string (nullable = true)
 |-- hour: integer (nullable = true)

root
 |-- PUTempdate: date (nullable = true)
 |-- PUTemptime: string (nullable = true)
 |-- PUtemp: float (nullable = false)
 |-- PUwindchill: float (nullable = false)
 |-- PUdewpoint: float (nullable = false)
 |-- PUpressure: float (nullable = false)
 |-- PUvisibility: float (nullable = false)
 |-- PUwindDir: string (nullable = true)
 |-- PUgustSpeed: float (nullable 

In [87]:
jan_2017 = jan_2017.join(weather_data, (jan_2017.PUDate == weather_data.date) & \
                         (jan_2017.PUHour == weather_data.hour), "left_outer")

In [88]:
jan_2017 = jan_2017.dropDuplicates(['uniqueIdColumn'])

In [89]:
#extra, payment type, fare amount, mta_tax, tip_amount, tollsamount, total_amount, improvement surcharge

# Categorical Features
# RateCodeID
# store_and_fwd_flag
# PULocationID
# DOLocationID
# LocationID (1 to 256)
# PUBorough (comes from taxi+_lookup_zone)
# PUZone (Name for Location ID)
# PUServiceZone (Categorical)
# PUNeighbor (Demographics Neighborhood)
# PUDay (1-365)
# PU_DOW (Day of week)
# PUEvents
# PUConditions
# PUPeriod (AM or PM)

jan_2017 = jan_2017.drop('PUDate')
jan_2017 = jan_2017.drop('PUTime')
jan_2017 = jan_2017.drop('date')
jan_2017 = jan_2017.drop('time')

In [90]:
jan_2017 = jan_2017.drop('hour')


In [91]:
PUdemographics = spark.read.format("csv").load('demographics.csv', header = True).cache()


In [92]:
PUnames = PUdemographics.schema.names
i = 0
for name in PUnames:
    if (i != 0):
        PUdemographics = PUdemographics.withColumn("PU" + name, col(name).cast(FloatType())).drop(name)
    i += 1

In [93]:
#PUdemographics.printSchema()

In [94]:
# One hot encoding categorical variables


In [95]:
jan_2017 = jan_2017.join(PUdemographics, jan_2017.PUneighbor == PUdemographics.neighborhood, "left_outer")
jan_2017 = jan_2017.dropDuplicates(['uniqueIdColumn'])
jan_2017 = jan_2017.drop('neighborhood')

In [51]:
def one_hot(input_sdf, col_name):
    if (col_name == "PUZone"):
        return input_sdf
    else:
        i = 0
        col_vals = input_sdf.select(col_name).distinct().rdd.flatMap(lambda x: x).collect()
        for val in col_vals:
            i += 1
            input_sdf = input_sdf.withColumn("{0}_is_{1}".format(col_name, val), \
                                           F.when(input_sdf[col_name] == val, 1).otherwise(0))
        return input_sdf

In [96]:
jan_2017.printSchema()
# Categorical Features
# RateCodeID
# store_and_fwd_flag
# PULocationID
# DOLocationID
# LocationID (1 to 256)
# PUBorough (comes from taxi+_lookup_zone)
# PUZone (Name for Location ID)
# PUServiceZone (Categorical)
# PUNeighbor (Demographics Neighborhood)
# PUDay (1-365)
# PU_DOW (Day of week)
# PUEvents
# PUConditions
# PUPeriod (AM or PM)
#jan_2017.printSchema()

root
 |-- VendorID: short (nullable = true)
 |-- passenger_count: short (nullable = true)
 |-- RateCodeID: short (nullable = true)
 |-- PUBorough: string (nullable = true)
 |-- PUZone: string (nullable = true)
 |-- PUServiceZone: string (nullable = true)
 |-- PUneighbor: string (nullable = true)
 |-- uniqueIdColumn: long (nullable = false)
 |-- AirportPU: integer (nullable = false)
 |-- PUHour: integer (nullable = true)
 |-- PUMinute: integer (nullable = true)
 |-- MorningRushHour: integer (nullable = false)
 |-- EveningRushHour: integer (nullable = false)
 |-- PUDay: integer (nullable = true)
 |-- PU_DOW: short (nullable = true)
 |-- Weekend: integer (nullable = false)
 |-- WorkingHour: integer (nullable = false)
 |-- PLocationID: integer (nullable = true)
 |-- DLocationID: integer (nullable = true)
 |-- temp: float (nullable = true)
 |-- windchill: float (nullable = true)
 |-- dewpoint: float (nullable = true)
 |-- humidity: float (nullable = true)
 |-- pressure: float (nullable = tr

In [97]:
jan_2017.count()

4925213

In [98]:
#dropping only 0.004527722963% of the data
jan_2017 = jan_2017.na.drop().cache()

In [100]:
# jan_2017.show(10)

+--------+---------------+----------+---------+--------------------+-------------+--------------------+--------------+---------+------+--------+---------------+---------------+-----+------+-------+-----------+-----------+-----------+----+---------+--------+--------+--------+----------+--------+---------+---------+------+------+----------+------+------------+-----------------+-----------+----------+-------------+-----------+--------------+----------+----------------+-----------+---------------+---------------+------------------------+----------------+-----------------+---------------------+----------------------+-------------+------------+----------+-----------+-----------+----------+------------+--------------------+------------+----------+------------+--------------+---------------+---------+----------+----------+----------------+-------------+-------------------+-----------+--------------+---------+-----------------+-----------+---------+-----------+
|VendorID|passenger_count|RateCod

In [101]:
indexer = StringIndexer(inputCol="PUZone", outputCol="PUZoneIndex")
jan_2017 = indexer.fit(jan_2017).transform(jan_2017)
encoder = OneHotEncoder(inputCol='PUZoneIndex', outputCol="PUZoneVect")
jan_2017 = encoder.transform(jan_2017).drop('PUZoneIndex')

In [102]:
encoder = OneHotEncoder(inputCol='RateCodeID', outputCol="RateCodeIDVect")
jan_2017 = encoder.transform(jan_2017).drop('RateCodeID')

In [103]:
# encoder = OneHotEncoder(inputCol='store_and_fwd_flag', outputCol="store_and_fwd_flagVect")
# jan_2017 = encoder.transform(jan_2017).drop('store_and_fwd_flag')

In [104]:
encoder = OneHotEncoder(inputCol='PLocationID', outputCol="PLocationIDVect")
jan_2017 = encoder.transform(jan_2017).drop('PLocationID')

In [105]:
# jan_2017 = one_hot(jan_2017, 'DOLocationID')
# print('done encoding DOLocationID')

In [106]:
# jan_2017 = one_hot(jan_2017, 'LocationID')
# print('done encoding LocationID')

In [107]:
indexer = StringIndexer(inputCol="PUBorough", outputCol="PUBoroughIndex")
jan_2017 = indexer.fit(jan_2017).transform(jan_2017)
encoder = OneHotEncoder(inputCol='PUBoroughIndex', outputCol="PUBoroughVect")
jan_2017 = encoder.transform(jan_2017).drop('PUBoroughIndex')

In [108]:
indexer = StringIndexer(inputCol="PUServiceZone", outputCol="PUServiceZoneIndex")
jan_2017 = indexer.fit(jan_2017).transform(jan_2017)
encoder = OneHotEncoder(inputCol='PUServiceZoneIndex', outputCol="PUServiceZoneVect")
jan_2017 = encoder.transform(jan_2017).drop('PUServiceZoneIndex')

In [109]:
# indexer = StringIndexer(inputCol="PUNeighbor", outputCol="PUNeighborIndex")
# jan_2017 = indexer.fit(jan_2017).transform(jan_2017)
# encoder = OneHotEncoder(inputCol='PUNeighborIndex', outputCol="PUNeighborVect")
# jan_2017 = encoder.transform(jan_2017).drop('PUNeighborIndex')

In [110]:
encoder = OneHotEncoder(inputCol='PU_DOW', outputCol="PU_DOWVect")
jan_2017 = encoder.transform(jan_2017).drop('PU_DOW')

In [112]:
indexer = StringIndexer(inputCol="Events", outputCol="EventsIndex")
jan_2017 = indexer.fit(jan_2017).transform(jan_2017)
encoder = OneHotEncoder(inputCol='EventsIndex', outputCol="EventsVector")
jan_2017 = encoder.transform(jan_2017).drop('EventsIndex')

In [113]:
indexer = StringIndexer(inputCol="Conditions", outputCol="ConditionsIndex")
jan_2017 = indexer.fit(jan_2017).transform(jan_2017)

encoder = OneHotEncoder(inputCol='ConditionsIndex', outputCol="ConditionsVect")
jan_2017 = encoder.transform(jan_2017).drop('ConditionsIndex')

In [115]:
indexer = StringIndexer(inputCol="period", outputCol="periodIndex")
jan_2017 = indexer.fit(jan_2017).transform(jan_2017)
encoder = OneHotEncoder(inputCol='periodIndex', outputCol="periodVect")
jan_2017 = encoder.transform(jan_2017).drop('periodIndex')

In [116]:
encoder = OneHotEncoder(inputCol='PUDay', outputCol="PUDayVect")
jan_2017 = encoder.transform(jan_2017).drop('PUDay')

In [None]:
jan_2017.printSchema()

In [130]:
indexer = StringIndexer(inputCol="windDir", outputCol="windDirIndex")
jan_2017 = indexer.fit(jan_2017).transform(jan_2017)
encoder = OneHotEncoder(inputCol='windDirIndex', outputCol="windDirVect")
jan_2017 = encoder.transform(jan_2017).drop('windDirIndex')

In [132]:
# Drop any string columns except 'PUZone'
str_names = ['PUZone','tpep_pickup_datetime','tpep_dropoff_datetime','store_and_fwd_flag','PULocationID',\
             'LocationID','PUBorough','PUServiceZone','PUneighbor','PUTime','PUTemptime',\
             'windDir','Events','Conditions','period','neighborhood','PUDate', 'PUTempdate']
for col in str_names:
    jan_2017=jan_2017.drop(col)

In [None]:
# sc = spark.sparkContext

In [None]:
# def plot_points(X, labels):
#         plt.scatter(X[:,0],X[:,1])
#         for i, txt in enumerate(labels):
#                 plt.annotate(txt, (X[i,0],X[i,1]))
#         plt.show()

In [None]:
# ## PCA!!!
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA

# # Select Labels
# labels = jan_2017['PUZone']

# # Standardize everything
# standsac = StandardScaler()
# jan_2017_std= standsac.fit_transform(jan_2017)

# # Compute PCA with 2 Principal Components
# pca = PCA(n_components=2)
# pca.fit(jan_2017_std)
# jan_2017_PCA = pca.transform(jan_2017_std)


In [153]:
# sampe = jan_2017.sample(False,0.0001,0).cache()
sampe = jan_2017

In [134]:
print(len(sampe.schema.names))

74


In [119]:
sampe.printSchema()

root
 |-- VendorID: short (nullable = true)
 |-- passenger_count: short (nullable = true)
 |-- PUBorough: string (nullable = true)
 |-- PUZone: string (nullable = true)
 |-- PUServiceZone: string (nullable = true)
 |-- PUneighbor: string (nullable = true)
 |-- uniqueIdColumn: long (nullable = false)
 |-- AirportPU: integer (nullable = false)
 |-- PUHour: integer (nullable = true)
 |-- PUMinute: integer (nullable = true)
 |-- MorningRushHour: integer (nullable = false)
 |-- EveningRushHour: integer (nullable = false)
 |-- Weekend: integer (nullable = false)
 |-- WorkingHour: integer (nullable = false)
 |-- DLocationID: integer (nullable = true)
 |-- temp: float (nullable = true)
 |-- windchill: float (nullable = true)
 |-- dewpoint: float (nullable = true)
 |-- humidity: float (nullable = true)
 |-- pressure: float (nullable = true)
 |-- visibility: float (nullable = true)
 |-- windDir: string (nullable = true)
 |-- windSpeed: float (nullable = true)
 |-- gustSpeed: float (nullable = tr

In [120]:
sampe.show(1)

+--------+---------------+---------+--------------+-------------+---------------+--------------+---------+------+--------+---------------+---------------+-------+-----------+-----------+----+---------+--------+--------+--------+----------+--------+---------+---------+------+------+-------------+------+------------+-----------------+-----------+---------+-------------+-----------+--------------+----------+----------------+----------+---------------+---------------+------------------------+----------------+-----------------+---------------------+----------------------+-------------+------------+----------+-----------+----------+----------+------------+--------------------+------------+----------+------------+--------------+---------------+---------+---------+---------+----------------+-------------+-------------------+-----------+--------------+--------+-----------------+-----------+---------+-----------+----------------+--------------+-----------------+-------------+-----------------+--

In [None]:
# #colum = sampe.schema.names
# sampe = sampe.withColumn("PLocationID", sampe.PULocationID.cast(IntegerType())).drop("PULocationID")
# sampe = sampe.withColumn("DLocationID", sampe.DOLocationID.cast(IntegerType())).drop("DOLocationID")
# #sampe.printSchema()


In [None]:
# sampe = sampe.withColumn("DLocationID", sampe.DOLocationID.cast(IntegerType())).drop("DOLocationID")
# #sampe.printSchema()

In [135]:
# # Drop any string columns except 'PUZone'
# str_names = ['PUZone','tpep_pickup_datetime','tpep_dropoff_datetime','store_and_fwd_flag','PULocationID',\
#              'LocationID','PUBorough','PUServiceZone','PUneighbor','PUTime','PUTemptime',\
#              'PUwindDir','Events','Conditions','period','neighborhood','PUDate', 'PUTempdate']
# for col in str_names:
#     sampe=sampe.drop(col)

In [154]:
colum = sampe.schema.names

In [155]:
colum.remove('DLocationID')

In [138]:
sampe.printSchema()

root
 |-- VendorID: short (nullable = true)
 |-- passenger_count: short (nullable = true)
 |-- uniqueIdColumn: long (nullable = false)
 |-- AirportPU: integer (nullable = false)
 |-- PUHour: integer (nullable = true)
 |-- PUMinute: integer (nullable = true)
 |-- MorningRushHour: integer (nullable = false)
 |-- EveningRushHour: integer (nullable = false)
 |-- Weekend: integer (nullable = false)
 |-- WorkingHour: integer (nullable = false)
 |-- DLocationID: integer (nullable = true)
 |-- temp: float (nullable = true)
 |-- windchill: float (nullable = true)
 |-- dewpoint: float (nullable = true)
 |-- humidity: float (nullable = true)
 |-- pressure: float (nullable = true)
 |-- visibility: float (nullable = true)
 |-- windSpeed: float (nullable = true)
 |-- gustSpeed: float (nullable = true)
 |-- Precip: float (nullable = true)
 |-- PUalone_hhld: float (nullable = true)
 |-- PUbachelor_higher: float (nullable = true)
 |-- PUbornstate: float (nullable = true)
 |-- PUcarfree: float (nullable

In [156]:
assembler = VectorAssembler(
    inputCols=colum,
    outputCol="features")

In [157]:
# indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
# indexerModel = indexer.fit(train)

In [158]:

# # Create new column "indexed" with categorical values transformed to indices
# putput = indexerModel.transform(train)
# indexedData.show()


output = assembler.transform(sampe)

In [142]:
output.printSchema()

root
 |-- VendorID: short (nullable = true)
 |-- passenger_count: short (nullable = true)
 |-- uniqueIdColumn: long (nullable = false)
 |-- AirportPU: integer (nullable = false)
 |-- PUHour: integer (nullable = true)
 |-- PUMinute: integer (nullable = true)
 |-- MorningRushHour: integer (nullable = false)
 |-- EveningRushHour: integer (nullable = false)
 |-- Weekend: integer (nullable = false)
 |-- WorkingHour: integer (nullable = false)
 |-- DLocationID: integer (nullable = true)
 |-- temp: float (nullable = true)
 |-- windchill: float (nullable = true)
 |-- dewpoint: float (nullable = true)
 |-- humidity: float (nullable = true)
 |-- pressure: float (nullable = true)
 |-- visibility: float (nullable = true)
 |-- windSpeed: float (nullable = true)
 |-- gustSpeed: float (nullable = true)
 |-- Precip: float (nullable = true)
 |-- PUalone_hhld: float (nullable = true)
 |-- PUbachelor_higher: float (nullable = true)
 |-- PUbornstate: float (nullable = true)
 |-- PUcarfree: float (nullable

In [159]:
train = output.select(["DLocationID","features"])

In [160]:
train = train.withColumn("label", train["DLocationID"]).drop("DLocationID")

In [161]:
# train.select('features').limit(1).collect()

In [162]:
train = train.select(["label","features"]).cache()

In [163]:
train.show(1)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|   48|(743,[0,1,2,8,10,...|
+-----+--------------------+
only showing top 1 row



In [None]:
train.printSchema()

In [164]:

pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")

In [165]:
model = pca.fit(train)


In [166]:
result = model.transform(train).select("pcaFeatures")
result.show(truncate=False)

+---------------------------------------+
|pcaFeatures                            |
+---------------------------------------+
|[26.023686636723923,247726.59693693675]|
|[29.015324407825286,156548.5667324401] |
|[474.02368663693784,247726.59684549813]|
|[964.0297125817553,318165.0364166201]  |
|[1677.0303843385202,310322.3119270896] |
|[1697.0229290713992,240068.81552190712]|
|[1950.0160566075313,154904.56860120725]|
|[2040.0229290713637,240068.81549361214]|
|[2214.0170099954994,168783.5407976865] |
|[2250.0236866377577,247726.59678751233]|
|[2453.0160566074665,154904.568559938]  |
|[2509.0297126401133,318165.66026743833]|
|[2529.01554364589,154789.2029755792]   |
|[2927.021122521916,226354.8153441903]  |
|[3091.01605660743,154904.56850513894]  |
|[3506.0150348177845,151335.64207004395]|
|[3764.0160566073596,154904.5684494989] |
|[4894.023686639948,247726.59654864747] |
|[5385.0160566096565,154904.56829531043]|
|[5409.023686631496,247726.5615399471]  |
+---------------------------------

In [151]:
result.count()

505

In [152]:
train.count()

505

In [None]:
# rows = train.select('features').rdd

In [None]:
# mat = RowMatrix(rows)

In [None]:
# pc = mat.computePrincipalComponents(4)

In [None]:
# projected = mat.multiply(pc)


In [None]:
# splits = train.randomSplit([0.6, 0.4], 1234)
# train = splits[0]
# test = splits[1]

In [None]:

# layers = [4, 5, 4, 3]

# # create the trainer and set its parameters
# trainer = MultilayerPerceptronClassifier(maxIter=2, layers=layers, blockSize=128, seed=1234)

In [None]:
# # train the model
# model = trainer.fit(train)

# # compute accuracy on the test set
# result = model.transform(test)
# predictionAndLabels = result.select("prediction", "label")
# evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))