# ЗАДАНИЕ 4. Анализ поездок и Spark GraphFrames

In [1]:
import pyspark
from pyspark.sql import SparkSession, Row

In [2]:
packages = "graphframes:graphframes:0.6.0-spark2.3-s_2.11"

In [3]:
import json
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
from geopandas.tools import sjoin

In [4]:
import folium
from folium.plugins import HeatMap, HeatMapWithTime

In [5]:
conf = pyspark.SparkConf() \
        .set("spark.executor.memory", "1g") \
        .set("spark.executor.core", "2") \
        .set("spark.jars.packages", "graphframes:graphframes:0.6.0-spark2.3-s_2.11")\
        .setAppName("moviewRecomApp")

In [6]:
spark = SparkSession \
    .builder \
    .appName("tripApp") \
    .config(conf=conf) \
    .getOrCreate()

In [7]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType, TimestampType

In [8]:
import graphframes as gf

In [9]:
trip_schema = StructType([
    StructField(name="tripduration", dataType=IntegerType(), nullable=True),
    StructField("starttime", TimestampType(), True),
    StructField("stoptime", TimestampType(), True),
    StructField("start_station_id", IntegerType(), True),
    StructField("start_station_name", StringType(), True),
    StructField("start_station_latitude", DoubleType(), True),
    StructField("start_station_longitude", DoubleType(), True),
    StructField("end_station_id", IntegerType(), True),
    StructField("end_station_name", StringType(), True),
    StructField("end_station_latitude", DoubleType(), True),
    StructField("end_station_longitude", DoubleType(), True),
    StructField("bike_id", IntegerType(), True),
    StructField("usertype", StringType(), True),
    StructField("birth_year", IntegerType(), True),
    StructField("gender", IntegerType(), True)])

In [10]:
trips_data_path = "data/201902-citibike-tripdata.csv"
trips = spark.read.load(trips_data_path, format="csv", header="true", schema=trip_schema, inferSchema="false", sep=",")

print("Количество маршрутов:", trips.count())
trips.show(5)

Количество маршрутов: 943744
+------------+--------------------+--------------------+----------------+--------------------+----------------------+-----------------------+--------------+--------------------+--------------------+---------------------+-------+----------+----------+------+
|tripduration|           starttime|            stoptime|start_station_id|  start_station_name|start_station_latitude|start_station_longitude|end_station_id|    end_station_name|end_station_latitude|end_station_longitude|bike_id|  usertype|birth_year|gender|
+------------+--------------------+--------------------+----------------+--------------------+----------------------+-----------------------+--------------+--------------------+--------------------+---------------------+-------+----------+----------+------+
|         219|2019-02-01 00:00:...|2019-02-01 00:03:...|            3494|E 115 St & Lexing...|             40.797911|               -73.9423|          3501|E 118 St & Madiso...|          40.8014866

In [11]:
trips.printSchema()

root
 |-- tripduration: integer (nullable = true)
 |-- starttime: timestamp (nullable = true)
 |-- stoptime: timestamp (nullable = true)
 |-- start_station_id: integer (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_station_latitude: double (nullable = true)
 |-- start_station_longitude: double (nullable = true)
 |-- end_station_id: integer (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- end_station_latitude: double (nullable = true)
 |-- end_station_longitude: double (nullable = true)
 |-- bike_id: integer (nullable = true)
 |-- usertype: string (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- gender: integer (nullable = true)



## Задача 1

**1. определите для каждой станции количество начала поездок и количество завершения поездок**

In [12]:
from pyspark.sql.functions import desc,split,explode,count,avg,coalesce

stations = trips\
    .select(F.col('start_station_id').alias('id'), 
            F.col('start_station_name').alias('name'),
            F.col("start_station_latitude").alias('lat'), 
            F.col("start_station_longitude").alias('lng'))\
    .filter('lat is not null and lng is not null')\
    .distinct()
print('Количество станций: {}'.format(stations.count()))
print('Станции:')
stations.show()

print('Подсчет коиличеств поездок...')
stations = stations\
    .join(trips, stations.id == trips.start_station_id, how='left')\
    .groupBy('id', 'name', 'lat', 'lng') \
    .agg({'*': 'count'})\
    .withColumnRenamed('count(1)', 'start_cnt')\
    .join(trips, stations.id == trips.end_station_id, how='left')\
    .groupBy('id', 'name', 'lat', 'lng', 'start_cnt') \
    .agg({'*': 'count'})\
    .withColumnRenamed('count(1)', 'end_cnt')
stations.show()

Количество станций: 767
Станции:
+----+--------------------+-----------------+------------------+
|  id|                name|              lat|               lng|
+----+--------------------+-----------------+------------------+
| 312|Allen St & Stanto...|        40.722055|        -73.989111|
|3509|Lenox Ave & W 115 St|       40.8011939|       -73.9500739|
| 351|Front St & Maiden Ln|      40.70530954|      -74.00612572|
|3419| Douglass St & 4 Ave|       40.6792788|      -73.98154004|
| 259|South St & Whiteh...|      40.70122128|      -74.01234218|
|2017|     E 43 St & 2 Ave|      40.75022392|      -73.97121414|
| 504|     1 Ave & E 16 St|      40.73221853|      -73.98165557|
|3242|Schermerhorn St &...|40.69102925677968|-73.99183362722397|
|3491|    E 118 St & 1 Ave|         40.79747|         -73.93504|
|3664|North Moore St & ...|40.72019521437465| -74.0103006362915|
|3260|Mercer St & Bleec...|40.72706363348306|-73.99662137031554|
|3059|Pulaski St & Marc...|       40.6933982|        -73.

2. сопоставьте станции с кварталами города (zones) и определите суммы количества начала и завершения для кажого квартала выведите по убыванию и отобразите в виде картограмм (Choropleth)

In [13]:
from IPython.display import IFrame

def embed_map(m):
    m.save('index.html')
    return IFrame('index.html', width='100%', height='750px')

# выводим станции на карту
m = folium.Map()
for index, row in stations.toPandas().iterrows():
    folium.CircleMarker(location=(row["lat"], row["lng"]),
                        weight=1,
                        radius= 4,
                        color="#E3170A",
                        fill_color="#E3170A",
                        fill_opacity=0.3,
                        fill=True).add_to(m)
m.fit_bounds(m.get_bounds())
embed_map(m)

In [14]:
borough_data_path = "data/NYC Taxi Zones.geojson"

style_function = lambda x: {
    "color" : "#7EBDC3",
    "weight": 1
}

# выводим зоны на карту
folium.GeoJson(borough_data_path, name="geojson", style_function=style_function).add_to(m)
m.fit_bounds(m.get_bounds())
embed_map(m)

`Convert the zones GeoJson to GeoDataFrame:`

In [15]:
with open(borough_data_path) as f:
    zones_geojson = json.load(f)

column_name_list = [key for key, value in zones_geojson["features"][0]["properties"].items()]
column_name_list += ["geometry"]
column_name_list

['shape_area',
 'objectid',
 'shape_leng',
 'location_id',
 'zone',
 'borough',
 'geometry']

In [16]:
def get_pandas_rows(features):
    for item in features:
        row = list()
        for key, value in item["properties"].items():
            row.append(value)        
        polygons = list()
        for polygon in item["geometry"]["coordinates"]:
            polygons.append(Polygon(polygon[0]))
        row.append(MultiPolygon(polygons=polygons))
        yield row

In [17]:
zones = pd.DataFrame(get_pandas_rows(zones_geojson["features"]), columns=column_name_list)
zones.head(5)

Unnamed: 0,shape_area,objectid,shape_leng,location_id,zone,borough,geometry
0,0.0007823067885,1,0.116357453189,1,Newark Airport,EWR,(POLYGON ((-74.18445299999996 40.6949959999999...
1,0.00486634037837,2,0.43346966679,2,Jamaica Bay,Queens,(POLYGON ((-73.82337597260663 40.6389870471767...
2,0.000314414156821,3,0.0843411059012,3,Allerton/Pelham Gardens,Bronx,(POLYGON ((-73.84792614099985 40.8713422339999...
3,0.000111871946192,4,0.0435665270921,4,Alphabet City,Manhattan,(POLYGON ((-73.97177410965318 40.7258212813370...
4,0.000497957489363,5,0.0921464898574,5,Arden Heights,Staten Island,(POLYGON ((-74.17421738099989 40.5625680859999...


In [18]:
gdf_zones = gpd.GeoDataFrame(zones, geometry=zones["geometry"])
gdf_zones.head(5)

Unnamed: 0,shape_area,objectid,shape_leng,location_id,zone,borough,geometry
0,0.0007823067885,1,0.116357453189,1,Newark Airport,EWR,"MULTIPOLYGON (((-74.18445 40.69500, -74.18449 ..."
1,0.00486634037837,2,0.43346966679,2,Jamaica Bay,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,0.000314414156821,3,0.0843411059012,3,Allerton/Pelham Gardens,Bronx,"MULTIPOLYGON (((-73.84793 40.87134, -73.84725 ..."
3,0.000111871946192,4,0.0435665270921,4,Alphabet City,Manhattan,"MULTIPOLYGON (((-73.97177 40.72582, -73.97179 ..."
4,0.000497957489363,5,0.0921464898574,5,Arden Heights,Staten Island,"MULTIPOLYGON (((-74.17422 40.56257, -74.17349 ..."


In [19]:
# объединяем с таблицей зон
stations_p = stations.toPandas()
points = gpd.GeoDataFrame(stations_p, geometry=gpd.points_from_xy(stations_p['lng'], stations_p['lat']))
stations_joined = sjoin(points, gdf_zones, how="left")
stations_joined

Unnamed: 0,id,name,lat,lng,start_cnt,end_cnt,geometry,index_right,shape_area,objectid,shape_leng,location_id,zone,borough
0,471,Grand St & Havemeyer St,40.712868,-73.956981,870,873,POINT (-73.95698 40.71287),257,0.000172309184842,255,0.0623841997664,255,Williamsburg (North Side),Brooklyn
1,496,E 16 St & 5 Ave,40.737262,-73.992390,3275,3301,POINT (-73.99239 40.73726),234,0.0000731054382894,234,0.0360721994984,234,Union Sq,Manhattan
2,3175,W 70 St & Amsterdam Ave,40.777480,-73.982886,1786,1761,POINT (-73.98289 40.77748),146,0.00015109426901,143,0.0541798538849,143,Lincoln Square West,Manhattan
3,243,Fulton St & Rockwell Pl,40.688226,-73.979382,588,824,POINT (-73.97938 40.68823),97,0.000163303970435,97,0.0624760147423,97,Fort Greene,Brooklyn
4,392,Jay St & Tech Pl,40.695065,-73.987167,1034,1101,POINT (-73.98717 40.69506),64,0.000081803882541,65,0.0446070683658,65,Downtown Brooklyn/MetroTech,Brooklyn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,3053,Marcy Ave & Lafayette Ave,40.690081,-73.947915,180,158,POINT (-73.94791 40.69008),21,0.000322957654799,17,0.093522632948,17,Bedford,Brooklyn
763,3245,NYCBS DEPOT - DELANCEY,40.716444,-73.982331,2,20,POINT (-73.98233 40.71644),232,0.000216049973456,232,0.0614709085331,232,Two Bridges/Seward Park,Manhattan
764,3376,E 65 St & 2 Ave,40.764719,-73.962221,818,831,POINT (-73.96222 40.76472),142,0.0000766545579019,141,0.0415144638712,141,Lenox Hill West,Manhattan
765,3523,24 Ave & 29 St,40.772900,-73.916142,129,133,POINT (-73.91614 40.77290),225,0.00076436070058,223,0.166021925275,223,Steinway,Queens


In [20]:
# выводим суммы
count_for_zone = stations_joined\
    .groupby(["location_id", "zone"])\
    ['start_cnt', 'end_cnt'].sum()\
    .reset_index()
count_for_zone.sort_values(['start_cnt', 'end_cnt', 'location_id', 'zone'], ascending=False)

Unnamed: 0,location_id,zone,start_cnt,end_cnt
85,79,East Village,43333,43170
81,68,East Chelsea,38967,39395
27,170,Murray Hill,32572,32578
49,234,Union Sq,30527,31464
4,113,Greenwich Village North,26485,27054
...,...,...,...,...
78,62,Crown Heights South,676,712
42,226,Sunnyside,607,621
86,8,Astoria Park,400,398
43,228,Sunset Park West,397,484


Отображаем на карте сначала по количеству начала поездок, затем по количеству окончания поездок

In [21]:
m = folium.Map()

folium.Choropleth(
    geo_data=zones_geojson,
    data=count_for_zone,
    columns=["location_id", "start_cnt"],
    name="Количество посадок",
    legend_name="Количество посадок",
    key_on="feature.properties.location_id",
    highlight=True,
    nan_fill_color="grey",
    nan_fill_opacity=0.1,
    fill_color="YlOrRd",
    fill_opacity=0.7,
    line_opacity=0.2,
).add_to(m)
m.fit_bounds(m.get_bounds())
embed_map(m)

In [22]:
m = folium.Map()

folium.Choropleth(
    geo_data=zones_geojson,
    data=count_for_zone,
    columns=["location_id", "end_cnt"],
    name="Количества высадок",
    legend_name="Количества высадок",
    key_on="feature.properties.location_id",
    highlight=True,
    nan_fill_color="grey",
    nan_fill_opacity=0.1,
    fill_color="YlOrRd",
    fill_opacity=0.5,
    line_opacity=0.3,
).add_to(m)
m.fit_bounds(m.get_bounds())
embed_map(m)

## Задача 2
1. определите "важность" вершин графа поездок с использование PageRank

In [23]:
stations = trips\
    .select(F.col('start_station_id').alias('id'), 
            F.col('start_station_name').alias('name'),
            F.col("start_station_latitude").alias('lat'), 
            F.col("start_station_longitude").alias('lng'))\
    .filter('lat is not null and lng is not null')\
    .distinct()
stations.show()

edges = trips\
    .select(F.col("start_station_id").alias("src"), 
            F.col("end_station_id").alias("dst"), 
            "tripduration")\
    .dropna(subset=['src'])\
    .dropna(subset=['dst'])

print("Общее количесвто ребер:", edges.count())
edges.show()

+----+--------------------+-----------------+------------------+
|  id|                name|              lat|               lng|
+----+--------------------+-----------------+------------------+
| 312|Allen St & Stanto...|        40.722055|        -73.989111|
|3509|Lenox Ave & W 115 St|       40.8011939|       -73.9500739|
| 351|Front St & Maiden Ln|      40.70530954|      -74.00612572|
|3419| Douglass St & 4 Ave|       40.6792788|      -73.98154004|
| 259|South St & Whiteh...|      40.70122128|      -74.01234218|
|2017|     E 43 St & 2 Ave|      40.75022392|      -73.97121414|
| 504|     1 Ave & E 16 St|      40.73221853|      -73.98165557|
|3242|Schermerhorn St &...|40.69102925677968|-73.99183362722397|
|3491|    E 118 St & 1 Ave|         40.79747|         -73.93504|
|3664|North Moore St & ...|40.72019521437465| -74.0103006362915|
|3260|Mercer St & Bleec...|40.72706363348306|-73.99662137031554|
|3059|Pulaski St & Marc...|       40.6933982|        -73.939877|
| 423|     W 54 St & 9 Av

In [24]:
routes = gf.GraphFrame(stations, edges)
routes.triplets.show()

+--------------------+-----------------+--------------------+
|                 src|             edge|                 dst|
+--------------------+-----------------+--------------------+
|[471, Grand St & ...|   [471, 471, 66]|[471, Grand St & ...|
|[471, Grand St & ...|[471, 471, 13260]|[471, Grand St & ...|
|[471, Grand St & ...|  [471, 471, 582]|[471, Grand St & ...|
|[471, Grand St & ...| [471, 471, 2626]|[471, Grand St & ...|
|[471, Grand St & ...| [471, 471, 2649]|[471, Grand St & ...|
|[471, Grand St & ...|  [471, 471, 870]|[471, Grand St & ...|
|[471, Grand St & ...|  [471, 471, 784]|[471, Grand St & ...|
|[471, Grand St & ...|   [471, 471, 80]|[471, Grand St & ...|
|[471, Grand St & ...|  [471, 471, 997]|[471, Grand St & ...|
|[471, Grand St & ...|   [471, 471, 62]|[471, Grand St & ...|
|[471, Grand St & ...| [471, 471, 1127]|[471, Grand St & ...|
|[471, Grand St & ...|  [471, 471, 198]|[471, Grand St & ...|
|[471, Grand St & ...|  [471, 471, 347]|[471, Grand St & ...|
|[471, G

In [25]:
# Подсчитываю важность станций
# PageRank — это числовая величина, характеризующая «важность» узла. 
# Чем больше ребер соединено с ним, тем он «важнее», также учитывается вес ребер.
# Здесь PageRank - условно это число, которое характеризует как много нороду проезжает через эту станцию. 
routes_pr = routes.pageRank(resetProbability=0.1, maxIter=5)

2. Выводим по убыванию

In [26]:
routes_pr.vertices.orderBy(-F.col("pagerank")).show()

+----+--------------------+------------------+------------------+------------------+
|  id|                name|               lat|               lng|          pagerank|
+----+--------------------+------------------+------------------+------------------+
| 519|Pershing Square N...|         40.751873|        -73.977706| 4.406774001083143|
| 402|  Broadway & E 22 St|        40.7403432|      -73.98955109|  3.53228723171532|
| 435|     W 21 St & 6 Ave|       40.74173969|      -73.99415556| 3.269230720393528|
|3255|     8 Ave & W 31 St|  40.7505853470215| -73.9946848154068|3.2361769412840924|
| 497|  E 17 St & Broadway|       40.73704984|      -73.99009296|3.1645640088778486|
| 285|  Broadway & E 14 St|       40.73454567|      -73.99074142|3.0455207021823973|
|3429|Hanson Pl & Ashla...| 40.68506807308177|-73.97790759801863|2.8908249353666955|
| 359|  E 47 St & Park Ave|       40.75510267|      -73.97498696| 2.887482309374865|
| 477|     W 41 St & 8 Ave|       40.75640548|       -73.9900262|

3. Отображаем на тепловой карте

In [27]:
trips_matrix = routes_pr.vertices.toPandas()[["lat", "lng", "pagerank"]].values
m = folium.Map()
HeatMap(trips_matrix, radius=15).add_to(m)
m.fit_bounds(m.get_bounds())
embed_map(m)

## Задача 3

оцените дистанцию поездок (в метрах) на основе координат начальной и конечной станций

In [28]:
from pyspark.sql.functions import acos, cos, sin, lit, toRadians

def distance(start_lng, start_lat, end_lng, end_lat):
    return acos(
        sin(toRadians(start_lat)) * sin(toRadians(end_lat)) +
        cos(toRadians(start_lat)) * cos(toRadians(end_lat)) *
        cos(toRadians(start_lng) - toRadians(end_lng))
    ) * lit(6371000.0)

выведите максимальное, среднее значение, стандартное отклонение и медиан

In [29]:
trips_new = trips.dropna(subset=['start_station_id']).dropna(subset=['end_station_id'])

trips_routes = trips_new\
            .dropDuplicates(['start_station_id','end_station_id'])\
            .filter(F.col("start_station_id") != F.col("end_station_id"))\
            .withColumn("distance", distance("start_station_longitude",
                                           "start_station_latitude", 
                                           "end_station_longitude",
                                           "end_station_latitude"))\

print("Количество маршрутов: {}".format(trips_routes.count()))
trips_routes.show()

Количество маршрутов: 131259
+------------+--------------------+--------------------+----------------+--------------------+----------------------+-----------------------+--------------+--------------------+--------------------+---------------------+-------+----------+----------+------+------------------+
|tripduration|           starttime|            stoptime|start_station_id|  start_station_name|start_station_latitude|start_station_longitude|end_station_id|    end_station_name|end_station_latitude|end_station_longitude|bike_id|  usertype|birth_year|gender|          distance|
+------------+--------------------+--------------------+----------------+--------------------+----------------------+-----------------------+--------------+--------------------+--------------------+---------------------+-------+----------+----------+------+------------------+
|         686|2019-02-03 08:02:...|2019-02-03 08:13:...|              72|    W 52 St & 11 Ave|           40.76727216|           -73.99392888

In [30]:
trips_routes.describe("distance").show()

+-------+------------------+
|summary|          distance|
+-------+------------------+
|  count|            131259|
|   mean|2818.1040340510817|
| stddev| 1813.459185512828|
|    min| 36.73591246092364|
|    max|15326.431486453896|
+-------+------------------+



In [31]:
median = trips_routes.approxQuantile("distance", [0.5], 0)[0]
print('Медиана = {}'.format(median))

Медиана = 2435.3733872294574


## Задача 4
1. определите для каждой станции среднее количество начала поездок и количество завершения поездок:

а) в день

In [32]:
from pyspark.sql.functions  import date_format, unix_timestamp, hour
# для старта поездок. конвертируем в дни, конвертируем в дату, считаем число дней каждого дня недели
# итого в выводе получаем сопоставление станции и количества поездок в определнную дату
trips_routes_days_start = trips_new\
    .select(F.col('start_station_id').alias('id'), 
            F.col('start_station_name').alias('name'),
            F.col("starttime"))\
    .withColumn("weekday", date_format("starttime","E"))\
    .withColumn('starttime_unix', date_format('starttime', "yyyyMMdd"))\
    .groupBy("id", 'name', "weekday", "starttime_unix") \
    .agg(F.count("weekday").alias("count"))\
    .orderBy("count", ascending=False)
trips_routes_days_start.show()

# агрегируем полученные данные в нужном формате
print("Среднее количество начала поездок для каждого дня недели:")
trips_routes_days_start\
    .groupBy("id", "weekday", 'name')\
    .agg(F.avg("count").alias("avg"))\
    .orderBy("avg", ascending=False).show()

print("Среднее количество начала поездок в день:")
trips_routes_days_start\
    .groupBy("id", 'name')\
    .agg(F.avg("count").alias("avg"))\
    .orderBy("avg", ascending=False).show()

+----+--------------------+-------+--------------+-----+
|  id|                name|weekday|starttime_unix|count|
+----+--------------------+-------+--------------+-----+
| 519|Pershing Square N...|    Tue|      20190205|  579|
| 519|Pershing Square N...|    Mon|      20190204|  534|
| 519|Pershing Square N...|    Mon|      20190211|  501|
| 519|Pershing Square N...|    Fri|      20190215|  468|
| 519|Pershing Square N...|    Thu|      20190207|  459|
| 519|Pershing Square N...|    Mon|      20190225|  441|
| 519|Pershing Square N...|    Thu|      20190214|  433|
| 519|Pershing Square N...|    Tue|      20190219|  416|
| 519|Pershing Square N...|    Fri|      20190222|  414|
| 402|  Broadway & E 22 St|    Tue|      20190205|  397|
| 519|Pershing Square N...|    Thu|      20190228|  388|
|3255|     8 Ave & W 31 St|    Mon|      20190211|  376|
| 517|Pershing Square S...|    Thu|      20190221|  376|
| 519|Pershing Square N...|    Fri|      20190208|  375|
|3255|     8 Ave & W 31 St|    

In [33]:
trips_routes_days_end = trips_new\
    .select(F.col('end_station_id').alias('id'), 
            F.col('end_station_name').alias('name'),
            F.col("starttime"))\
    .withColumn("weekday", date_format("starttime","E"))\
    .withColumn('starttime_unix', date_format('starttime', "yyyyMMdd"))\
    .groupBy("id", 'name', "weekday", "starttime_unix") \
    .agg(F.count("weekday").alias("count"))\
    .orderBy("count", ascending=False)
trips_routes_days_end.show()

# агрегируем полученные данные в нужном формате
print("Среднее количество начала поездок для каждого дня недели:")
trips_routes_days_end\
    .groupBy("id", "weekday", 'name')\
    .agg(F.avg("count").alias("avg"))\
    .orderBy("avg", ascending=False).show()

print("Среднее количество начала поездок в день:")
trips_routes_days_end\
    .groupBy("id", 'name')\
    .agg(F.avg("count").alias("avg"))\
    .orderBy("avg", ascending=False).show()

+----+--------------------+-------+--------------+-----+
|  id|                name|weekday|starttime_unix|count|
+----+--------------------+-------+--------------+-----+
| 519|Pershing Square N...|    Tue|      20190205|  581|
| 519|Pershing Square N...|    Mon|      20190211|  501|
| 519|Pershing Square N...|    Mon|      20190204|  500|
| 519|Pershing Square N...|    Fri|      20190215|  470|
| 519|Pershing Square N...|    Thu|      20190207|  442|
| 519|Pershing Square N...|    Mon|      20190225|  440|
| 519|Pershing Square N...|    Thu|      20190214|  437|
| 519|Pershing Square N...|    Tue|      20190219|  416|
| 519|Pershing Square N...|    Wed|      20190206|  406|
| 402|  Broadway & E 22 St|    Tue|      20190205|  403|
| 519|Pershing Square N...|    Fri|      20190222|  397|
|3255|     8 Ave & W 31 St|    Tue|      20190205|  396|
| 519|Pershing Square N...|    Thu|      20190228|  392|
| 519|Pershing Square N...|    Wed|      20190213|  380|
| 402|  Broadway & E 22 St|    

б) утром (06:00-11:59), днем (12:00-17:59), вечером (18:00-23:59), ночью (00:00-05:59)

In [34]:
# выведем следнее количество по часам, вообще не привязываясь к дням недели
start_trips_hours_cnt = trips_new\
    .select(F.col('start_station_id').alias('id'), 
            F.col('start_station_name').alias('name'),
            F.col('start_station_latitude').alias('lat'),
            F.col('start_station_longitude').alias('lng'),
            F.col("starttime"), 
            hour(F.col('starttime')).alias('hour'))\
    .withColumn('starttime_unix', date_format('starttime', "yyyyMMdd"))\
    .groupBy('id','hour','name', 'lat', 'lng','starttime_unix')\
    .agg(F.count('hour').alias('count'))\
    .orderBy('count', ascending=False)
start_trips_hours_cnt.show()

print("Среднее число начала поездок в течение дня:")

print("Ночью:")
start_trips_night_cnt = start_trips_hours_cnt\
        .filter(F.col('hour') < 6)\
        .filter(F.col('hour') >= 0)\
        .groupBy('id','name', 'lat', 'lng')\
        .agg(F.avg('count').alias('avg'))
start_trips_night_cnt.orderBy('avg', ascending=False).show()

print("Утром:")
start_trips_morning_cnt = start_trips_hours_cnt\
    .filter(F.col('hour') < 12)\
    .filter(F.col('hour') >= 6)\
    .groupBy('id','name', 'lat', 'lng')\
    .agg(F.avg('count').alias('avg'))
start_trips_morning_cnt.orderBy('avg', ascending=False).show()

print("Днем:")
start_trips_afternoon_cnt = start_trips_hours_cnt\
    .filter(F.col('hour') < 18)\
    .filter(F.col('hour') >= 12)\
    .groupBy('id','name', 'lat', 'lng')\
    .agg(F.avg('count').alias('avg'))
start_trips_afternoon_cnt.orderBy('avg', ascending=False).show()

print("Вечером:")
start_trips_evening_cnt = start_trips_hours_cnt\
    .filter(F.col('hour') < 24)\
    .filter(F.col('hour') >= 18)\
    .groupBy('id','name', 'lat', 'lng')\
    .agg(F.avg('count').alias('avg'))
start_trips_evening_cnt.orderBy('id').show()

+----+----+--------------------+-----------------+------------------+--------------+-----+
|  id|hour|                name|              lat|               lng|starttime_unix|count|
+----+----+--------------------+-----------------+------------------+--------------+-----+
| 519|  17|Pershing Square N...|        40.751873|        -73.977706|      20190205|  125|
| 519|  18|Pershing Square N...|        40.751873|        -73.977706|      20190205|  117|
| 519|  17|Pershing Square N...|        40.751873|        -73.977706|      20190211|   95|
|3255|   8|     8 Ave & W 31 St| 40.7505853470215| -73.9946848154068|      20190211|   93|
| 519|  17|Pershing Square N...|        40.751873|        -73.977706|      20190208|   92|
| 519|  18|Pershing Square N...|        40.751873|        -73.977706|      20190219|   87|
| 519|  18|Pershing Square N...|        40.751873|        -73.977706|      20190204|   86|
| 359|  17|  E 47 St & Park Ave|      40.75510267|      -73.97498696|      20190204|   84|

+---+--------------------+------------------+------------------+------------------+
| id|                name|               lat|               lng|               avg|
+---+--------------------+------------------+------------------+------------------+
| 72|    W 52 St & 11 Ave|       40.76727216|      -73.99392888|3.4414414414414414|
| 79|Franklin St & W B...|       40.71911552|      -74.00666661|3.2916666666666665|
| 82|St James Pl & Pea...|       40.71117416|      -74.00016545| 1.941860465116279|
| 83|Atlantic Ave & Fo...|       40.68382604|      -73.97632328|2.3229166666666665|
|119|Park Ave & St Edw...|       40.69608941|      -73.97803415|1.2105263157894737|
|120|Lexington Ave & C...|       40.68676793|      -73.95928168|1.4736842105263157|
|127|Barrow St & Hudso...|       40.73172428|      -74.00674436|             5.312|
|128|MacDougal St & Pr...|       40.72710258|      -74.00297088|4.8311688311688314|
|143|Clinton St & Jora...|       40.69239502|      -73.99337909| 3.107142857

In [35]:
end_trips_hours_cnt = trips_new\
    .select(F.col('end_station_id').alias('id'), 
            F.col('end_station_name').alias('name'),
            F.col('end_station_latitude').alias('lat'),
            F.col('end_station_longitude').alias('lng'),
            F.col("starttime"), 
            hour(F.col('starttime')).alias('hour'))\
    .withColumn('starttime_unix', date_format('starttime', "yyyyMMdd"))\
    .groupBy('id','hour','name', 'lat', 'lng','starttime_unix')\
    .agg(F.count('hour').alias('count'))\
    .orderBy('count', ascending=False)
end_trips_hours_cnt.show()

print("Среднее число конца поездок в течение дня:")

print("Ночью:")
end_trips_night_cnt = start_trips_hours_cnt\
        .filter(F.col('hour') < 6)\
        .filter(F.col('hour') >= 0)\
        .groupBy('id','name', 'lat', 'lng')\
        .agg(F.avg('count').alias('avg'))
end_trips_night_cnt.orderBy('avg', ascending=False).show()

print("Утром:")
end_trips_morning_cnt = start_trips_hours_cnt\
    .filter(F.col('hour') < 12)\
    .filter(F.col('hour') >= 6)\
    .groupBy('id','name', 'lat', 'lng')\
    .agg(F.avg('count').alias('avg'))
end_trips_morning_cnt.orderBy('avg', ascending=False).show()

print("Днем:")
end_trips_afternoon_cnt = start_trips_hours_cnt\
    .filter(F.col('hour') < 18)\
    .filter(F.col('hour') >= 12)\
    .groupBy('id','name', 'lat', 'lng')\
    .agg(F.avg('count').alias('avg'))
end_trips_afternoon_cnt.orderBy('avg', ascending=False).show()

print("Вечером:")
end_trips_evening_cnt = start_trips_hours_cnt\
    .filter(F.col('hour') < 24)\
    .filter(F.col('hour') >= 18)\
    .groupBy('id','name', 'lat', 'lng')\
    .agg(F.avg('count').alias('avg'))
end_trips_evening_cnt.orderBy('id').show()

+----+----+--------------------+-----------------+------------------+--------------+-----+
|  id|hour|                name|              lat|               lng|starttime_unix|count|
+----+----+--------------------+-----------------+------------------+--------------+-----+
| 519|  17|Pershing Square N...|        40.751873|        -73.977706|      20190205|  122|
|3255|  17|     8 Ave & W 31 St| 40.7505853470215| -73.9946848154068|      20190205|  116|
| 519|  17|Pershing Square N...|        40.751873|        -73.977706|      20190225|  115|
| 519|  17|Pershing Square N...|        40.751873|        -73.977706|      20190211|  111|
|3255|  17|     8 Ave & W 31 St| 40.7505853470215| -73.9946848154068|      20190204|  108|
| 359|   8|  E 47 St & Park Ave|      40.75510267|      -73.97498696|      20190205|  104|
| 359|   8|  E 47 St & Park Ave|      40.75510267|      -73.97498696|      20190206|  102|
|3255|  17|     8 Ave & W 31 St| 40.7505853470215| -73.9946848154068|      20190221|   99|

+---+--------------------+------------------+------------------+------------------+
| id|                name|               lat|               lng|               avg|
+---+--------------------+------------------+------------------+------------------+
| 72|    W 52 St & 11 Ave|       40.76727216|      -73.99392888|3.4414414414414414|
| 79|Franklin St & W B...|       40.71911552|      -74.00666661|3.2916666666666665|
| 82|St James Pl & Pea...|       40.71117416|      -74.00016545| 1.941860465116279|
| 83|Atlantic Ave & Fo...|       40.68382604|      -73.97632328|2.3229166666666665|
|119|Park Ave & St Edw...|       40.69608941|      -73.97803415|1.2105263157894737|
|120|Lexington Ave & C...|       40.68676793|      -73.95928168|1.4736842105263157|
|127|Barrow St & Hudso...|       40.73172428|      -74.00674436|             5.312|
|128|MacDougal St & Pr...|       40.72710258|      -74.00297088|4.8311688311688314|
|143|Clinton St & Jora...|       40.69239502|      -73.99337909| 3.107142857

в) в среду и в воскресенье по временным диапазонам (см. выше)

In [36]:
print("Среднее число начала поездок в среду в течение дня:")

start_wed_trips_hours = trips_new\
            .withColumn('weekday', date_format(F.col('starttime'), 'E'))\
            .withColumn('starttime_unix', date_format('starttime', "yyyyMMdd"))\
            .filter((F.col('weekday') == 'Wed'))\
            .select(F.col('start_station_id').alias('id'), 
                    F.col('start_station_name').alias('name'),
                    F.col("starttime_unix"), 
                    hour(F.col('starttime')).alias('hour'))\
            .groupBy('id','name', 'hour','starttime_unix')\
            .agg(F.count('hour').alias('count'))\
            .orderBy('count', ascending=False)
start_wed_trips_hours.show()

print("Ночью:")
start_wed_trips_hours\
        .filter(F.col('hour') < 6)\
        .filter(F.col('hour') >= 0)\
        .groupBy('id','name')\
        .agg(F.avg('count').alias('avg'))\
        .orderBy('avg', ascending=False).show()

print("Утром:")
start_wed_trips_hours\
    .filter(F.col('hour') < 12)\
    .filter(F.col('hour') >= 6)\
    .groupBy('id','name')\
    .agg(F.avg('count').alias('avg'))\
    .orderBy('avg', ascending=False).show()

print("Днем:")
start_wed_trips_hours\
    .filter(F.col('hour') < 18)\
    .filter(F.col('hour') >= 12)\
    .groupBy('id','name')\
    .agg(F.avg('count').alias('avg'))\
    .orderBy('avg', ascending=False).show()

print("Вечером:")
start_wed_trips_hours\
    .filter(F.col('hour') < 24)\
    .filter(F.col('hour') >= 18)\
    .groupBy('id','name')\
    .agg(F.avg('count').alias('avg'))\
    .orderBy('id').show()

Среднее число начала поездок в среду в течение дня:
+----+--------------------+----+--------------+-----+
|  id|                name|hour|starttime_unix|count|
+----+--------------------+----+--------------+-----+
|3255|     8 Ave & W 31 St|   6|      20190206|   74|
|3255|     8 Ave & W 31 St|   7|      20190206|   72|
| 519|Pershing Square N...|  17|      20190213|   67|
| 519|Pershing Square N...|   8|      20190227|   67|
|3664|North Moore St & ...|  17|      20190206|   65|
|3255|     8 Ave & W 31 St|   8|      20190206|   64|
| 359|  E 47 St & Park Ave|  17|      20190206|   60|
|3255|     8 Ave & W 31 St|   7|      20190220|   59|
| 359|  E 47 St & Park Ave|  17|      20190213|   58|
| 519|Pershing Square N...|   8|      20190206|   57|
| 519|Pershing Square N...|  17|      20190206|   55|
| 359|  E 47 St & Park Ave|  17|      20190227|   55|
|3443|     W 52 St & 6 Ave|  17|      20190206|   53|
| 514|    12 Ave & W 40 St|   8|      20190206|   52|
|3255|     8 Ave & W 31 St|   

In [37]:
print("Среднее число начала поездок в воскресенье в течение дня:")

start_wed_trips_hours = trips_new\
            .withColumn('weekday', date_format(F.col('starttime'), 'E'))\
            .withColumn('starttime_unix', date_format('starttime', "yyyyMMdd"))\
            .filter((F.col('weekday') == 'Sun'))\
            .select(F.col('start_station_id').alias('id'), 
                    F.col('start_station_name').alias('name'),
                    F.col("starttime_unix"), 
                    hour(F.col('starttime')).alias('hour'))\
            .groupBy('id','name', 'hour','starttime_unix')\
            .agg(F.count('hour').alias('count'))\
            .orderBy('count', ascending=False)
start_wed_trips_hours.show()

print("Ночью:")
start_wed_trips_hours\
        .filter(F.col('hour') < 6)\
        .filter(F.col('hour') >= 0)\
        .groupBy('id','name')\
        .agg(F.avg('count').alias('avg'))\
        .orderBy('avg', ascending=False).show()

print("Утром:")
start_wed_trips_hours\
    .filter(F.col('hour') < 12)\
    .filter(F.col('hour') >= 6)\
    .groupBy('id','name')\
    .agg(F.avg('count').alias('avg'))\
    .orderBy('avg', ascending=False).show()

print("Днем:")
start_wed_trips_hours\
    .filter(F.col('hour') < 18)\
    .filter(F.col('hour') >= 12)\
    .groupBy('id','name')\
    .agg(F.avg('count').alias('avg'))\
    .orderBy('avg', ascending=False).show()

print("Вечером:")
start_wed_trips_hours\
    .filter(F.col('hour') < 24)\
    .filter(F.col('hour') >= 18)\
    .groupBy('id','name')\
    .agg(F.avg('count').alias('avg'))\
    .orderBy('id').show()

Среднее число начала поездок в воскресенье в течение дня:
+----+--------------------+----+--------------+-----+
|  id|                name|hour|starttime_unix|count|
+----+--------------------+----+--------------+-----+
| 285|  Broadway & E 14 St|  15|      20190203|   32|
| 285|  Broadway & E 14 St|  16|      20190203|   31|
| 499|  Broadway & W 60 St|  15|      20190203|   31|
| 293|Lafayette St & E ...|  14|      20190203|   30|
| 435|     W 21 St & 6 Ave|  15|      20190224|   30|
|2006|Central Park S & ...|  14|      20190203|   27|
| 435|     W 21 St & 6 Ave|  17|      20190203|   26|
| 285|  Broadway & E 14 St|  13|      20190224|   25|
| 497|  E 17 St & Broadway|  17|      20190224|   25|
| 435|     W 21 St & 6 Ave|  14|      20190203|   25|
| 285|  Broadway & E 14 St|  16|      20190224|   25|
| 459|    W 20 St & 11 Ave|  13|      20190203|   25|
| 285|  Broadway & E 14 St|  17|      20190224|   25|
| 497|  E 17 St & Broadway|  15|      20190203|   24|
|3435|Grand St & Elizab.

In [39]:
print("Среднее число конца поездок в среду в течение дня:")

start_wed_trips_hours = trips_new\
            .withColumn('weekday', date_format(F.col('starttime'), 'E'))\
            .withColumn('starttime_unix', date_format('starttime', "yyyyMMdd"))\
            .filter((F.col('weekday') == 'Wed'))\
            .select(F.col('end_station_id').alias('id'), 
                    F.col('end_station_name').alias('name'),
                    F.col("starttime_unix"),
                    F.col("starttime_unix"),
                    F.col("starttime_unix"), 
                    hour(F.col('starttime')).alias('hour'))\
            .groupBy('id','name', 'hour','starttime_unix')\
            .agg(F.count('hour').alias('count'))\
            .orderBy('count', ascending=False)
start_wed_trips_hours.show()

print("Ночью:")
start_wed_trips_hours\
        .filter(F.col('hour') < 6)\
        .filter(F.col('hour') >= 0)\
        .groupBy('id','name')\
        .agg(F.avg('count').alias('avg'))\
        .orderBy('avg', ascending=False).show()

print("Утром:")
start_wed_trips_hours\
    .filter(F.col('hour') < 12)\
    .filter(F.col('hour') >= 6)\
    .groupBy('id','name')\
    .agg(F.avg('count').alias('avg'))\
    .orderBy('avg', ascending=False).show()

print("Днем:")
start_wed_trips_hours\
    .filter(F.col('hour') < 18)\
    .filter(F.col('hour') >= 12)\
    .groupBy('id','name')\
    .agg(F.avg('count').alias('avg'))\
    .orderBy('avg', ascending=False).show()

print("Вечером:")
start_wed_trips_hours\
    .filter(F.col('hour') < 24)\
    .filter(F.col('hour') >= 18)\
    .groupBy('id','name')\
    .agg(F.avg('count').alias('avg'))\
    .orderBy('id').show()

Среднее число конца поездок в среду в течение дня:
+----+--------------------+----+--------------+-----+
|  id|                name|hour|starttime_unix|count|
+----+--------------------+----+--------------+-----+
| 359|  E 47 St & Park Ave|   8|      20190206|  102|
|3255|     8 Ave & W 31 St|  17|      20190206|   78|
| 517|Pershing Square S...|  17|      20190227|   75|
| 359|  E 47 St & Park Ave|   8|      20190227|   73|
| 402|  Broadway & E 22 St|   9|      20190206|   72|
|3255|     8 Ave & W 31 St|  17|      20190213|   69|
| 402|  Broadway & E 22 St|   9|      20190227|   68|
| 359|  E 47 St & Park Ave|   8|      20190220|   67|
|3443|     W 52 St & 6 Ave|   8|      20190206|   66|
| 517|Pershing Square S...|  17|      20190206|   64|
|3255|     8 Ave & W 31 St|  17|      20190227|   64|
|3443|     W 52 St & 6 Ave|   8|      20190213|   63|
| 402|  Broadway & E 22 St|   9|      20190213|   63|
| 519|Pershing Square N...|  17|      20190213|   60|
|3255|     8 Ave & W 31 St|  16

In [40]:
print("Среднее число конца поездок в воскресенье в течение дня:")

start_wed_trips_hours = trips_new\
            .withColumn('weekday', date_format(F.col('starttime'), 'E'))\
            .withColumn('starttime_unix', date_format('starttime', "yyyyMMdd"))\
            .filter((F.col('weekday') == 'Sun'))\
            .select(F.col('end_station_id').alias('id'), 
                    F.col('end_station_name').alias('name'),
                    F.col("starttime_unix"), 
                    hour(F.col('starttime')).alias('hour'))\
            .groupBy('id','name', 'hour','starttime_unix')\
            .agg(F.count('hour').alias('count'))\
            .orderBy('count', ascending=False)
start_wed_trips_hours.show()

print("Ночью:")
start_wed_trips_hours\
        .filter(F.col('hour') < 6)\
        .filter(F.col('hour') >= 0)\
        .groupBy('id','name')\
        .agg(F.avg('count').alias('avg'))\
        .orderBy('avg', ascending=False).show()

print("Утром:")
start_wed_trips_hours\
    .filter(F.col('hour') < 12)\
    .filter(F.col('hour') >= 6)\
    .groupBy('id','name')\
    .agg(F.avg('count').alias('avg'))\
    .orderBy('avg', ascending=False).show()

print("Днем:")
start_wed_trips_hours\
    .filter(F.col('hour') < 18)\
    .filter(F.col('hour') >= 12)\
    .groupBy('id','name')\
    .agg(F.avg('count').alias('avg'))\
    .orderBy('avg', ascending=False).show()

print("Вечером:")
start_wed_trips_hours\
    .filter(F.col('hour') < 24)\
    .filter(F.col('hour') >= 18)\
    .groupBy('id','name')\
    .agg(F.avg('count').alias('avg'))\
    .orderBy('id').show()

Среднее число конца поездок в воскресенье в течение дня:
+----+--------------------+----+--------------+-----+
|  id|                name|hour|starttime_unix|count|
+----+--------------------+----+--------------+-----+
| 499|  Broadway & W 60 St|  15|      20190203|   36|
| 285|  Broadway & E 14 St|  18|      20190224|   29|
| 285|  Broadway & E 14 St|  15|      20190203|   28|
| 350|Clinton St & Gran...|  17|      20190224|   28|
|2008|Little West St & ...|  14|      20190203|   27|
|3435|Grand St & Elizab...|  12|      20190217|   27|
| 499|  Broadway & W 60 St|  12|      20190217|   27|
| 285|  Broadway & E 14 St|  17|      20190224|   26|
| 435|     W 21 St & 6 Ave|  15|      20190203|   26|
|3016|   Kent Ave & N 7 St|  12|      20190203|   26|
|3256|Pier 40 - Hudson ...|  14|      20190203|   25|
| 285|  Broadway & E 14 St|  14|      20190203|   25|
| 334|     W 20 St & 7 Ave|  12|      20190203|   25|
|3435|Grand St & Elizab...|  13|      20190203|   24|
| 358|Christopher St & ..

2. отобразите полученные данные для второго случая в виде тепловой временной карты (HeatMapWithTime)

In [41]:
# сначала построим карту для среднего числа начала поездок
data_begin = []

data_begin.append(start_trips_night_cnt.select(F.col('lat'),F.col('lng'),F.col('avg')).toPandas().values.tolist())
data_begin.append(start_trips_morning_cnt.select(F.col('lat'),F.col('lng'),F.col('avg')).toPandas().values.tolist())
data_begin.append(start_trips_afternoon_cnt.select(F.col('lat'),F.col('lng'),F.col('avg')).toPandas().values.tolist())
data_begin.append(start_trips_evening_cnt.select(F.col('lat'),F.col('lng'),F.col('avg')).toPandas().values.tolist())

In [42]:
m = folium.Map()
HeatMapWithTime(data_begin).add_to(m)
m.fit_bounds(m.get_bounds())
embed_map(m)

In [43]:
# построим карту для среднего числа конца поездок
data_end = []

data_end.append(end_trips_night_cnt.select(F.col('lat'),F.col('lng'),F.col('avg')).toPandas().values.tolist())
data_end.append(end_trips_morning_cnt.select(F.col('lat'),F.col('lng'),F.col('avg')).toPandas().values.tolist())
data_end.append(end_trips_afternoon_cnt.select(F.col('lat'),F.col('lng'),F.col('avg')).toPandas().values.tolist())
data_end.append(end_trips_evening_cnt.select(F.col('lat'),F.col('lng'),F.col('avg')).toPandas().values.tolist())

In [44]:
m = folium.Map()
HeatMapWithTime(data_end, radius=15).add_to(m)
m.fit_bounds(m.get_bounds())
embed_map(m)