## Logistic Regression - Model Fitting
### University of Virginia
### DS 5110: Big Data Systems
### By: TeamBike

**Creating logistic-regression model for the expanded neighborhood set.**

Session is 16 cores, 128 GB RAM

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#### Create SparkSession (this configuration worked, but this file takes a really long time to run with all the data.

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder\
    .master("local") \
    .appName("models") \
    .config("spark.executor.memory", '60g') \
    .config('spark.executor.cores', '10') \
    .config('spark.cores.max', '12') \
    .config("spark.driver.memory",'40g') \
    .getOrCreate()



'''
spark = SparkSession \
    .builder\
    .master("local") \
    .appName("models") \
    .config("spark.executor.memory", '80g') \
    .config('spark.executor.cores', '16') \
    .config('spark.cores.max', '16') \
    .config("spark.driver.memory",'40g') \
    .getOrCreate()
'''

#spark = SparkSession \
#    .builder.getOrCreate()


sc = SparkSession.sparkContext

In [3]:
%matplotlib inline

#### Read `master_dataset` parquet, which includes all Citi Bike data, weather data, and real-estate data.

In [4]:
df = spark.read.parquet("/project/ds5559/Summer2021_TeamBike/master_dataset.parquet");
b_map = spark.read.csv("station_df2.csv", header = True).withColumnRenamed('startStationName', 'stationName');
#borough_plus = spark.read.csv("station_df2.csv", header = True).withColumnRenamed('startStationName', 'stationName');
print(type(df)) # I want to see what type of object this created to make sure it's a DF

<class 'pyspark.sql.dataframe.DataFrame'>


#### `b_map` contains the borough-neighborhood mapping that will incorporate the predictor variable into the data set.

In [5]:
b_map = b_map.drop('_c0').drop('prediction').drop('latitude').drop('longitude').drop('Borough')

In [6]:
df.printSchema()

root
 |-- Borough: string (nullable = true)
 |-- month_year: string (nullable = true)
 |-- startStationId: string (nullable = true)
 |-- startStationName: string (nullable = true)
 |-- startStationLatitude: double (nullable = true)
 |-- startStationLongitude: double (nullable = true)
 |-- date: string (nullable = true)
 |-- hour: integer (nullable = true)
 |-- tripduration: long (nullable = true)
 |-- starttime: string (nullable = true)
 |-- endStationId: double (nullable = true)
 |-- endStationName: string (nullable = true)
 |-- endStationLatitude: double (nullable = true)
 |-- endStationLongitude: double (nullable = true)
 |-- bikeid: double (nullable = true)
 |-- usertype: string (nullable = true)
 |-- birthyear: string (nullable = true)
 |-- gender: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- feels_like: double (nullable = true)
 |-- temp_min: double (nullable = true)
 |-- temp_max: double (nullable = true)
 |-- pressure: long (nullable = true)
 |-- humidity: 

In [7]:
full = df.columns # get list of all columns

In [8]:
df.show(1)

+--------+----------+--------------+-------------------+--------------------+---------------------+----------+----+------------+--------------------+------------+-------------------+------------------+-------------------+-------+--------+---------+------+------+----------+--------+--------+--------+--------+----------+-------+-------+-------+-------+----------+------------+---+------+-----+--------+------------+----+---------+---------+---------+--------------------+--------------------+--------+------------------+------------------+----------------------+----------------------+----------------------+-------------------+------------------+-------------------+------------------+--------------------+
| Borough|month_year|startStationId|   startStationName|startStationLatitude|startStationLongitude|      date|hour|tripduration|           starttime|endStationId|     endStationName|endStationLatitude|endStationLongitude| bikeid|usertype|birthyear|gender|  temp|feels_like|temp_min|temp_max|p

### Borough Ride Analysis

#### Where do rides start?
- Some initial analysis
- Number of rides and their relative frequency

In [9]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

tot = df.count()

df.groupby('Borough').count()\
  .withColumnRenamed('count', '# Rides') \
  .withColumn('% Total', F.round((F.col('# Rides') / tot) * 100, 2) ) \
  .show()

+---------+--------+-------+
|  Borough| # Rides|% Total|
+---------+--------+-------+
|   Queens| 1849398|   3.74|
| Brooklyn| 9574892|  19.38|
|Manhattan|37688969|   76.3|
|    Bronx|  280961|   0.57|
+---------+--------+-------+



### Notice that there are no rides in Staten Island due to there being no stations located there.

- Number of stations in each borough

In [10]:
tot = df.select('startStationName', 'Borough')\
.distinct().count()

df.select('startStationName', 'Borough')\
.distinct().groupby('Borough').count()\
  .withColumnRenamed('count', '# Stations') \
  .withColumn('% Total', F.round((F.col('# Stations') / tot) * 100, 2) ) \
  .show()

+---------+----------+-------+
|  Borough|# Stations|% Total|
+---------+----------+-------+
|   Queens|       179|  11.72|
| Brooklyn|       471|  30.84|
|Manhattan|       651|  42.63|
|    Bronx|       226|   14.8|
+---------+----------+-------+



### There is an outsized number of rides in Manhattan and Brooklyn relative to the number of stations there are in these two boroughs. 

In [11]:
import pyspark.sql.functions

df = df.withColumn('month', pyspark.sql.functions.split(df['month_year'], '-').getItem(1))

In [12]:
df_rf = df.select('month_year', 'startStationName', 'hour', 'tripduration', 'endStationName', 'day', 'month', 'time_bin', 'peak_commute', )

In [13]:
df_rf.show(10)

+----------+--------------------+----+------------+--------------------+------+-----+--------+------------+
|month_year|    startStationName|hour|tripduration|      endStationName|   day|month|time_bin|peak_commute|
+----------+--------------------+----+------------+--------------------+------+-----+--------+------------+
|   2020-06| Douglass St & 4 Ave|   0|        1062| Douglass St & 4 Ave|Monday|   06|   Night|    non_peak|
|   2020-06|Clinton Ave & Myr...|   0|        3810|Sullivan St & Was...|Monday|   06|   Night|    non_peak|
|   2020-06| Broadway & Berry St|   0|        1017|Classon Ave & St ...|Monday|   06|   Night|    non_peak|
|   2020-06|E 53 St & Lexingt...|   0|        1437|  E 17 St & Broadway|Monday|   06|   Night|    non_peak|
|   2020-06|Laight St & Hudso...|   0|         355|Perry St & Bleeck...|Monday|   06|   Night|    non_peak|
|   2020-06|Duffield St & Wil...|   0|          87| Bond St & Fulton St|Monday|   06|   Night|    non_peak|
|   2020-06|Fulton St & Broa

In [14]:
from pyspark.sql.types import IntegerType
df_rf = df_rf.withColumn("month", df_rf["month"].cast(IntegerType()))#.withColumnRenamed('startStationName', 'station')

In [15]:
df_rf.show(10)

+----------+--------------------+----+------------+--------------------+------+-----+--------+------------+
|month_year|    startStationName|hour|tripduration|      endStationName|   day|month|time_bin|peak_commute|
+----------+--------------------+----+------------+--------------------+------+-----+--------+------------+
|   2020-06| Douglass St & 4 Ave|   0|        1062| Douglass St & 4 Ave|Monday|    6|   Night|    non_peak|
|   2020-06|Clinton Ave & Myr...|   0|        3810|Sullivan St & Was...|Monday|    6|   Night|    non_peak|
|   2020-06| Broadway & Berry St|   0|        1017|Classon Ave & St ...|Monday|    6|   Night|    non_peak|
|   2020-06|E 53 St & Lexingt...|   0|        1437|  E 17 St & Broadway|Monday|    6|   Night|    non_peak|
|   2020-06|Laight St & Hudso...|   0|         355|Perry St & Bleeck...|Monday|    6|   Night|    non_peak|
|   2020-06|Duffield St & Wil...|   0|          87| Bond St & Fulton St|Monday|    6|   Night|    non_peak|
|   2020-06|Fulton St & Broa

## Join the neighborhoods created by the K-Means algorithm (in `b_map`) to the rest of the data:

In [16]:
df_rf = df_rf.join(b_map, on = df_rf.startStationName == b_map.stationName).withColumnRenamed('concat', 'start_neighborhood')

In [17]:
df_rf.show(5)

+----------+--------------------+----+------------+--------------------+------+-----+--------+------------+--------------------+--------------------+
|month_year|    startStationName|hour|tripduration|      endStationName|   day|month|time_bin|peak_commute|         stationName|  start_neighborhood|
+----------+--------------------+----+------------+--------------------+------+-----+--------+------------+--------------------+--------------------+
|   2020-06| Douglass St & 4 Ave|   0|        1062| Douglass St & 4 Ave|Monday|    6|   Night|    non_peak| Douglass St & 4 Ave|          Midtown BK|
|   2020-06|Clinton Ave & Myr...|   0|        3810|Sullivan St & Was...|Monday|    6|   Night|    non_peak|Clinton Ave & Myr...|          Midtown BK|
|   2020-06| Broadway & Berry St|   0|        1017|Classon Ave & St ...|Monday|    6|   Night|    non_peak| Broadway & Berry St|           Uptown BK|
|   2020-06|E 53 St & Lexingt...|   0|        1437|  E 17 St & Broadway|Monday|    6|   Night|    no

In [18]:
df_rf = df_rf.drop('stationName')

In [19]:
df_rf.count()

49614503

### There is now a column of neighborhood values in the dataframe. 

### Similar analysis performed on the boroughs seen below for the neighborhoods:

In [20]:
tot = df_rf.select('start_neighborhood').count()

df_rf.select('start_neighborhood').groupby('start_neighborhood').count()\
  .withColumnRenamed('count', '# Rides') \
  .withColumn('% Total', F.round((F.col('# Rides') / tot) * 100, 2) ) \
  .show()

+--------------------+--------+-------+
|  start_neighborhood| # Rides|% Total|
+--------------------+--------+-------+
|              Queens| 1900644|   3.83|
|         Downtown BK|  491417|   0.99|
|Central Park East...| 8340058|  16.81|
|  Downtown Manhattan|10076749|  20.31|
|Harlem & Wash. He...|  147133|    0.3|
|          Midtown BK| 5202790|  10.49|
|           Uptown BK| 3949040|   7.96|
|   Midtown Manhattan|16070516|  32.39|
|    Uptown Manhattan| 3155195|   6.36|
|               Bronx|  280961|   0.57|
+--------------------+--------+-------+



In [21]:
tot = df_rf.select('startStationName')\
.distinct().count()

df_rf.select('start_neighborhood', 'startStationName')\
.distinct().groupby('start_neighborhood').count()\
  .withColumnRenamed('count', '# Stations') \
  .withColumn('% Total', F.round((F.col('# Stations') / tot) * 100, 2) ) \
  .show()

+--------------------+----------+-------+
|  start_neighborhood|# Stations|% Total|
+--------------------+----------+-------+
|              Queens|       179|  11.75|
|         Downtown BK|       116|   7.62|
|Central Park East...|       137|    9.0|
|  Downtown Manhattan|       138|   9.06|
|Harlem & Wash. He...|        67|    4.4|
|          Midtown BK|       172|  11.29|
|           Uptown BK|       183|  12.02|
|   Midtown Manhattan|       171|  11.23|
|    Uptown Manhattan|       138|   9.06|
|               Bronx|       226|  14.84|
+--------------------+----------+-------+



### We still see some imbalanced data when considering the number of rides, but the number of stations is relatively spread across all of the neighborhoods.

**Bring in the neighborhoods for the end stations:**

In [22]:
df_rf = df_rf.join(b_map, on = df_rf.endStationName == b_map.stationName).withColumnRenamed('concat', 'end_neighborhood').drop('stationName')

In [23]:
df_rf.show(5)

+----------+--------------------+----+------------+--------------------+------+-----+--------+------------+--------------------+------------------+
|month_year|    startStationName|hour|tripduration|      endStationName|   day|month|time_bin|peak_commute|  start_neighborhood|  end_neighborhood|
+----------+--------------------+----+------------+--------------------+------+-----+--------+------------+--------------------+------------------+
|   2020-06| Douglass St & 4 Ave|   0|        1062| Douglass St & 4 Ave|Monday|    6|   Night|    non_peak|          Midtown BK|        Midtown BK|
|   2020-06|Clinton Ave & Myr...|   0|        3810|Sullivan St & Was...|Monday|    6|   Night|    non_peak|          Midtown BK|Downtown Manhattan|
|   2020-06| Broadway & Berry St|   0|        1017|Classon Ave & St ...|Monday|    6|   Night|    non_peak|           Uptown BK|        Midtown BK|
|   2020-06|E 53 St & Lexingt...|   0|        1437|  E 17 St & Broadway|Monday|    6|   Night|    non_peak|Centr

In [24]:
df_rf.count()

49271009

In [25]:
df_rf = df_rf.drop('month_year')

In [26]:
df_rf.show(5)

+--------------------+----+------------+--------------------+------+-----+--------+------------+--------------------+------------------+
|    startStationName|hour|tripduration|      endStationName|   day|month|time_bin|peak_commute|  start_neighborhood|  end_neighborhood|
+--------------------+----+------------+--------------------+------+-----+--------+------------+--------------------+------------------+
| Douglass St & 4 Ave|   0|        1062| Douglass St & 4 Ave|Monday|    6|   Night|    non_peak|          Midtown BK|        Midtown BK|
|Clinton Ave & Myr...|   0|        3810|Sullivan St & Was...|Monday|    6|   Night|    non_peak|          Midtown BK|Downtown Manhattan|
| Broadway & Berry St|   0|        1017|Classon Ave & St ...|Monday|    6|   Night|    non_peak|           Uptown BK|        Midtown BK|
|E 53 St & Lexingt...|   0|        1437|  E 17 St & Broadway|Monday|    6|   Night|    non_peak|Central Park East...| Midtown Manhattan|
|Laight St & Hudso...|   0|         355|P

### We now have the starting neighborhoods and ending neighborhoods for each ride in the full dataset. Let's look at some ride details below:

In [27]:
from pyspark.sql.functions import desc

tot = df_rf.count()

df_rf.groupby('start_neighborhood', 'end_neighborhood').count()\
  .withColumnRenamed('count', '# Rides') \
  .withColumn('% Total', F.round((F.col('# Rides') / tot) * 100, 2) ) \
  .sort(desc("count")).show(100)

+--------------------+--------------------+--------+-------+
|  start_neighborhood|    end_neighborhood| # Rides|% Total|
+--------------------+--------------------+--------+-------+
|   Midtown Manhattan|   Midtown Manhattan|10154874|  20.61|
|  Downtown Manhattan|  Downtown Manhattan| 5763571|   11.7|
|Central Park East...|Central Park East...| 4880562|   9.91|
|          Midtown BK|          Midtown BK| 3825678|   7.76|
|  Downtown Manhattan|   Midtown Manhattan| 3040322|   6.17|
|   Midtown Manhattan|  Downtown Manhattan| 3014183|   6.12|
|           Uptown BK|           Uptown BK| 2735445|   5.55|
|   Midtown Manhattan|Central Park East...| 2108910|   4.28|
|    Uptown Manhattan|    Uptown Manhattan| 1962820|   3.98|
|Central Park East...|   Midtown Manhattan| 1962534|   3.98|
|              Queens|              Queens| 1411336|   2.86|
|    Uptown Manhattan|Central Park East...|  851787|   1.73|
|Central Park East...|    Uptown Manhattan|  844450|   1.71|
|           Uptown BK|  

### Most rides start and end in the same neighborhood; 63% start and end in the same neighborhood

In [28]:
df_rf.where(df_rf.start_neighborhood == df_rf.end_neighborhood).count()/df_rf.count()
#.groupby('start_neighborhood', 'end_neighborhood').count()#sort(desc("count")).show(100)

0.6328590104578536

In [29]:
df_group = df_rf.select('start_neighborhood', 'end_neighborhood', 'day', 'time_bin', 'peak_commute', 'month', 'hour')#.avg('tripduration')

## Start modeling pre-processing
- Predict end neighborhood from the start neighborhood, day of the week, time bin (morning, evening, etc.), whether the ride started at peak commute, and the month in which it was taken.

### Create the pipeline to transform the data for modeling

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

cats = ['day', 'time_bin', 'peak_commute', 'month', 'start_neighborhood']

# The index of string vlaues multiple columns
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in cats
]

# The encode of indexed vlaues multiple columns
encoders = [OneHotEncoder(dropLast=False,inputCol=indexer.getOutputCol(),
            outputCol="{0}_encoded".format(indexer.getOutputCol())) 
    for indexer in indexers
]

# Vectorizing encoded values
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders],outputCol="features")

label_indexer = StringIndexer(inputCol='end_neighborhood', outputCol= 'n_index')

label_encoder = OneHotEncoder(inputCol='n_index',outputCol= 'label')
                              
#label_assember = VectorAssembler(inputCols=,outputCol="label")

pipeline = Pipeline(stages=indexers + [label_indexer] + encoders+ [label_encoder] + [assembler])
model=pipeline.fit(df_group)
transformed = model.transform(df_group)
transformed.show(5)

**To avoid having to transform the data over and over, I saved the data to parquet files and loaded from there.**

In [30]:
#transformed.write.parquet("pipelined_data_NBs.parquet")

In [31]:
transformed = spark.read.parquet("pipelined_data_NBs.parquet");

#### Split the transformed data:

In [33]:
from pyspark.ml.feature import VectorAssembler, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier

#randomly split data into training and test dataset
(train_data, test_data, hold_out) = transformed.randomSplit([0.60, 0.20, 0.20], seed = 33)

In [34]:
transformed.show(5)

+--------------------+--------------------+--------+---------+------------+-----+----+-----------+----------------+--------------------+-------------+--------------------------+-------+-------------------+------------------------+----------------------------+---------------------+----------------------------------+-------------+--------------------+
|  start_neighborhood|    end_neighborhood|     day| time_bin|peak_commute|month|hour|day_indexed|time_bin_indexed|peak_commute_indexed|month_indexed|start_neighborhood_indexed|n_index|day_indexed_encoded|time_bin_indexed_encoded|peak_commute_indexed_encoded|month_indexed_encoded|start_neighborhood_indexed_encoded|        label|            features|
+--------------------+--------------------+--------+---------+------------+-----+----+-----------+----------------+--------------------+-------------+--------------------------+-------+-------------------+------------------------+----------------------------+---------------------+---------------

# LR Model with Cross Validation

### Run the model on the training data; too many folds in the CV process and too many grid elements proved to be computationally expensive, so I scaled back

In [35]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [36]:
lr = LogisticRegression(labelCol="n_index", featuresCol="features", maxIter=10, regParam=0.3, elasticNetParam=0)

evaluator = MulticlassClassificationEvaluator(
    labelCol="n_index", predictionCol="prediction", metricName="accuracy")

# Create ParamGrid for Cross Validation

paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 3-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=3)

cvModel = cv.fit(train_data)

### Save the model and predict on the test data

In [39]:
cvModel.save("lr_nbs")

In [42]:
predictions = cvModel.transform(test_data)

In [45]:
predictions.write.parquet("lr_preds_NBs.parquet")

### Important output files are saved for later analysis so this model does not have to be run again. 