In [1]:
# Load the autotime extension so each executed cell reports its run time.
%load_ext autotime

time: 1.3 ms (started: 2021-05-24 19:07:33 -05:00)


![output](tiempo_secuencial.png)

## Paralelo pyspark 

In [2]:
# Locate the Spark installation so that `pyspark` becomes importable.
# SPARK_HOME takes precedence; the original install path is kept as the fallback
# so the notebook still runs unchanged on the machine it was written on.
import os

import findspark

findspark.init(os.environ.get("SPARK_HOME", "/usr/local/spark/spark-3.1.1-bin-hadoop2.7"))

time: 148 ms (started: 2021-05-24 19:07:34 -05:00)


In [3]:
# Import PySpark's configuration and context classes.
from pyspark import SparkConf, SparkContext

# "local[*]" runs Spark with one worker thread per available core; plain "local"
# uses a single worker thread, which defeats the purpose of this parallel section.
conf = SparkConf().setMaster("local[*]").setAppName("ModeloML")
# getOrCreate reuses an already-running context, so re-executing this cell no
# longer fails and no kernel restart is needed.
sc = SparkContext.getOrCreate(conf=conf)
sc

time: 17.7 s (started: 2021-05-24 19:07:35 -05:00)


#### DataFrame spark

In [5]:
from pyspark.sql.types import StringType
from pyspark import SQLContext
# Build the SQL entry point on top of the Spark context created above.
sqlContext = SQLContext(sc)
# Read train.csv, taking column names from the header row and letting Spark
# infer the column types.
dfspark = (
    sqlContext.read
    .format('csv')
    .option("header", "true")
    .option("inferSchema", "true")
    .load('train.csv')
)

time: 2min 49s (started: 2021-05-24 19:08:55 -05:00)


In [6]:
# Print the column names and types of the loaded data.
dfspark.printSchema()

root
 |-- key: string (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- passenger_count: integer (nullable = true)

time: 10.1 ms (started: 2021-05-24 19:11:45 -05:00)


In [7]:
# Total number of rows in the full dataset.
cant_total=dfspark.count()

time: 25.7 s (started: 2021-05-24 19:11:45 -05:00)


In [8]:
# Draw a ~4% sample without replacement. The fixed seed makes the sample (and
# every statistic and figure derived from it) reproducible across kernel restarts.
dfspark_sample = dfspark.sample(withReplacement=False, fraction=0.04, seed=42)

time: 9.45 ms (started: 2021-05-24 19:12:11 -05:00)


### A partir de aquí trabajamos con una muestra.

In [9]:
# Number of rows in the sample, before any cleaning.
cant_muestra=dfspark_sample.count()

time: 25.5 s (started: 2021-05-24 19:12:11 -05:00)


# Limpieza de la data

###### Eliminando Valores Nulos de la tabla

In [10]:
# Keep only rows in which every required field is non-NULL; one SQL filter is
# applied per column, in the order listed.
for required_column in (
    "fare_amount",
    "passenger_count",
    "pickup_datetime",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
):
    dfspark_sample = dfspark_sample.filter(f"{required_column} is not NULL")

time: 244 ms (started: 2021-05-24 19:12:36 -05:00)


###### Eliminando valores NaN y duplicados

In [11]:
# Drop rows that contain null/NaN values and exact duplicate rows. The result is
# persisted *before* the count so that this expensive pass also fills the cache
# instead of being recomputed from the CSV by the next action.
dfspark_sample = dfspark_sample.na.drop().dropDuplicates().persist()
# Row count after cleaning (this action materialises the cached data).
cantnn_muestra = dfspark_sample.count()
# The difference covers every removed row: nulls, NaNs and duplicates.
print("cantidad de valores nulos: ",cant_muestra-cantnn_muestra)

cantidad de valores nulos:  11
time: 2min 34s (started: 2021-05-24 19:12:36 -05:00)


In [12]:
# Print the schema of the sample after the null/NaN/duplicate removal.
dfspark_sample.printSchema()

root
 |-- key: string (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- passenger_count: integer (nullable = true)

time: 10 ms (started: 2021-05-24 19:15:11 -05:00)


In [13]:
# Number of partitions the sample's underlying RDD is split into.
dfspark_sample.rdd.getNumPartitions()

200

time: 401 ms (started: 2021-05-24 19:15:11 -05:00)


In [14]:
# Mark the sample to be persisted so later computations can reuse it
# (intended to speed up the steps that follow).
dfspark_sample.persist()

DataFrame[key: string, fare_amount: double, pickup_datetime: string, pickup_longitude: double, pickup_latitude: double, dropoff_longitude: double, dropoff_latitude: double, passenger_count: int]

time: 114 ms (started: 2021-05-24 19:15:12 -05:00)


# Estadísticas

In [15]:
import numpy as np
import pandas as pd

time: 311 ms (started: 2021-05-24 19:15:12 -05:00)


In [16]:
# Summary statistics of the coordinate, passenger and fare columns, collected
# into a pandas DataFrame.
summary = dfspark_sample.describe(
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "passenger_count",
    "fare_amount",
).toPandas()

time: 2min 59s (started: 2021-05-24 19:15:12 -05:00)


In [17]:
# Column dtypes of the pandas summary table.
summary.dtypes

summary              object
pickup_longitude     object
pickup_latitude      object
dropoff_longitude    object
dropoff_latitude     object
passenger_count      object
fare_amount          object
dtype: object

time: 5.65 ms (started: 2021-05-24 19:18:12 -05:00)


In [18]:
# Display the summary statistics table.
summary

Unnamed: 0,summary,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,fare_amount
0,count,2219013.0,2219013.0,2219013.0,2219013.0,2219013.0,2219013.0
1,mean,-72.49893468358957,39.91588546786556,-72.49993181216463,39.90892687019288,1.6843168561878636,11.346452639078764
2,stddev,12.292290172231276,9.366487712029612,11.8250760373813,9.140430689717409,1.343318841289577,9.833546482396764
3,min,-3019.803228,-3114.472102,-2260.620487,-3111.70716,0.0,-105.0
4,max,3442.185068,3310.364462,3442.185068,3375.202165,208.0,1564.5


time: 161 ms (started: 2021-05-24 19:18:12 -05:00)


In [19]:
# Row count for each passenger_count value.
dfspark_sample.groupBy("passenger_count").count().show()

+---------------+-------+
|passenger_count|  count|
+---------------+-------+
|              1|1535287|
|              6|  46771|
|              3|  97350|
|              5| 157035|
|              4|  46910|
|              7|      1|
|              2| 327848|
|              0|   7806|
|            208|      5|
+---------------+-------+

time: 6.59 s (started: 2021-05-24 19:18:12 -05:00)


Observaciones:
1. Valores de passenger_count imposibles, como 34, 49, 51, 129, 208.
2. Precios demasiado elevados debido a la cantidad de pasajeros, y precios negativos (imposibles).
3. Hay datos en los que el precio es menor o igual a 0.

# Transformación de la data

In [None]:
# Keep passenger counts below 10 (i.e. 0-9).
dfspark_sample = dfspark_sample.filter("passenger_count < 10")
# Keep strictly positive fares only: the observations above flag fares that are
# less than or equal to 0 as invalid, so a fare of exactly 0 must be dropped too.
dfspark_sample = dfspark_sample.filter("fare_amount > 0")

In [None]:
# Row count for each passenger_count value in the current (filtered) sample.
dfspark_sample.groupBy("passenger_count").count().show()

In [None]:
# Add the absolute latitude/longitude differences between drop-off and pickup.
# Spark's `abs` is imported under an alias so that it does not shadow Python's
# built-in abs() for the rest of the notebook.
from pyspark.sql.functions import abs as spark_abs

dfspark_sample = dfspark_sample.withColumn(
    "dif_latitude",
    spark_abs(dfspark_sample['dropoff_latitude'] - dfspark_sample['pickup_latitude']),
)
dfspark_sample = dfspark_sample.withColumn(
    "dif_longitude",
    spark_abs(dfspark_sample['dropoff_longitude'] - dfspark_sample['pickup_longitude']),
)

#### Crear dos columnas día de la semana y hora del viaje.

In [None]:
# funciones que me ayudarán en la transformación.
from datetime import datetime, date, time, timedelta
import calendar
def dia(dia):
    """Return the Spanish weekday name for a weekday number.

    Parameters
    ----------
    dia : int
        Weekday number, where 1 maps to 'lunes' and 7 maps to 'domingo'.

    Returns
    -------
    str or None
        The weekday name, or None for any value that is not one of 1..7
        (including None itself, which previously raised TypeError in the
        range comparison).
    """
    # A lookup table replaces the if-chain; dict.get yields None for every
    # key that is not a valid weekday number.
    nombres = {
        1: 'lunes',
        2: 'martes',
        3: 'miércoles',
        4: 'jueves',
        5: 'viernes',
        6: 'sábado',
        7: 'domingo',
    }
    return nombres.get(dia)

from pyspark.sql import Row

def dia_semana(row):
    """Return the ISO weekday number (1 = Monday ... 7 = Sunday) of a timestamp string.

    `row` must consist of exactly three space-separated tokens whose first token
    is a date in ``YYYY-MM-DD`` form, e.g. ``"2009-06-15 17:26:21 UTC"``.
    Any other token count makes the unpacking raise ValueError.
    """
    date_token, _time_token, _zone_token = row.split(" ")
    return datetime.strptime(date_token, "%Y-%m-%d").isoweekday()

def hora(row):
    """Return the hour of day (0-23) of a timestamp string.

    `row` must consist of exactly three space-separated tokens whose second token
    is a time in ``HH:MM:SS`` form, e.g. ``"2009-06-15 17:26:21 UTC"``.
    Any other token count makes the unpacking raise ValueError.
    """
    _date_token, time_token, _zone_token = row.split(" ")
    parsed_time = datetime.strptime(time_token, "%H:%M:%S")
    return parsed_time.hour
    

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
# Wrap the helper functions as Spark UDFs. The explicit IntegerType return type
# matters: without it `udf` defaults to a string return type and the derived
# columns come back as strings instead of integers.
udf_dia_semana = udf(dia_semana, IntegerType())
udf_hora = udf(hora, IntegerType())

In [None]:
from pyspark.sql.functions import col
# Derive the weekday and hour columns from the pickup timestamp via the UDFs.
dfspark_sample = (
    dfspark_sample
    .withColumn('dia_semana', udf_dia_semana(col('pickup_datetime')))
    .withColumn('hora', udf_hora(col('pickup_datetime')))
)

In [None]:
# Cast the weekday and hour columns to integers.
dfspark_sample = (
    dfspark_sample
    .withColumn("dia_semana", col("dia_semana").cast("Integer"))
    .withColumn("hora", col("hora").cast("Integer"))
)

##### Observación: En este punto la data estaría totalmente ok.

### Estadística de las nuevas variables

In [None]:
# Summary statistics of the transformed sample (no column list is passed to
# describe), collected into a pandas DataFrame.
clean_summary=dfspark_sample.describe().toPandas()

In [None]:
# Columns to show from the summary: the original numeric fields plus the
# derived difference, weekday and hour columns.
columnas = ["pickup_longitude",
            "pickup_latitude",
            "dropoff_longitude",
            "dropoff_latitude",
            "passenger_count",
            "fare_amount",
            "dif_latitude",
            "dif_longitude",
            "dia_semana",
            "hora"]
clean_summary[columnas]

### Correlación de los atributos

In [None]:
# Feature columns that come straight from the source data.
col_old=["pickup_longitude",
         "pickup_latitude",
         "dropoff_longitude",
         "dropoff_latitude",
         "passenger_count"]
# Feature columns derived in the transformation section.
col_new=["dif_latitude",
         "dif_longitude",
         "dia_semana",
         "hora"]
# Column to predict.
col_pred = ["fare_amount"]

In [None]:
# Assemble the columns into a single vector column.
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# Order matters: it defines the row/column order of the correlation matrix
# (original features, derived features, then the target).
inputCols = col_old+col_new+col_pred
assembler = VectorAssembler( inputCols=inputCols, outputCol="col_corr")
# Adds the "col_corr" vector column to the sample.
dfspark_corr = assembler.transform(dfspark_sample)

In [None]:
# Keep only the assembled vector column.
dfspark_corr=dfspark_corr.select('col_corr')

In [None]:
# Compute the Pearson correlation over the assembled vector column.
from pyspark.ml.stat import Correlation

# Note: the name dfspark_corr is reused here for the correlation result.
dfspark_corr= Correlation.corr(dfspark_corr, 'col_corr','pearson')

In [None]:
from pyspark.ml.linalg import DenseMatrix, Vectors
# type(dfspark_corr.collect()[0][0])  -> DenseMatrix (per the author's note)
# Collect the result to the driver and convert the matrix in the first cell
# of the first row to an array.
array_corr=dfspark_corr.collect()[0][0].toArray()

In [None]:
# Label the correlation matrix with the column names on both axes.
pdf_corr= pd.DataFrame(array_corr, columns=inputCols, index=inputCols)
# True wherever the correlation is NOT strictly inside (-0.3, 0.3).
# NOTE(review): `mask` is not used anywhere in the visible notebook — confirm
# whether it was meant to filter the styled table below.
mask = ~(pdf_corr>-0.3) | ~(pdf_corr<0.3)
# Show the matrix rounded to 10 decimals with a colour gradient.
round(pdf_corr,10).style.background_gradient()

### Observación:
###### La variable a predecir, fare_amount, tiene muy baja correlación con las demás variables. Su mayor correlación es con passenger_count y hora. Sin embargo, sí observamos correlación entre las demás variables. Definitivamente no podemos utilizar un modelo lineal.

### Visualización de la data.

In [None]:
# Convert the whole Spark sample into a pandas DataFrame.
pandasData = dfspark_sample.toPandas()

In [None]:
#Manera alternativa de pasar la data a Pandas
# import pandas as pd
# from pyspark.sql import DataFrame

# # Wrapper for seamless Spark's serialisation
# def spark_to_pandas(spark_df: DataFrame) -> pd.DataFrame:
#     """
#     PySpark toPandas realisation using mapPartitions
#     much faster than vanilla version
#     fork: https://gist.github.com/lucidyan/1e5d9e490a101cdc1c2ed901568e082b
#     origin: https://gist.github.com/joshlk/871d58e01417478176e7
#     :param spark_df:
#     :return:
#     """
    
#     def _map_to_pandas(rdds) -> list:
#         """ Needs to be here due to pickling issues """
#         return [pd.DataFrame(list(rdds))]

#     def _to_pandas(df: DataFrame, n_partitions: int = None) -> pd.DataFrame:
#         """
#         Returns the contents of `df` as a local `pandas.DataFrame` in a speedy fashion. The DataFrame is
#         repartitioned if `n_partitions` is passed.
#         :param df:
#         :param n_partitions:
#         :return:
#         """
#         if n_partitions is not None:
#             df = df.repartition(n_partitions)
#         df_pand = df.rdd.mapPartitions(_map_to_pandas).collect()  # type: pd.DataFrame
#         df_pand = pd.concat(df_pand)
#         df_pand.columns = df.columns
#         return df_pand

#     return _to_pandas(spark_df)

# pandasData = spark_to_pandas(dfspark_sample)

In [None]:
# Show the pandas DataFrame.
display(pandasData)

In [None]:
import seaborn as sns
import warnings
# Silence all warnings for the rest of the notebook.
warnings.filterwarnings("ignore")
sns.set(style="ticks" , color_codes = True)
var = ["dia_semana"]
# Author's sizing note: 1 inch = 96 px.
# Only the first 10 rows of pandasData are plotted here.
g = sns.pairplot(pandasData[:10], vars=var, diag_kind="hist", hue='dia_semana', height=4, aspect=4)

In [None]:
# How the target variable (fare_amount) varies across the days of the week.
import matplotlib.pyplot as plt

carac1=pandasData['dia_semana']
objet=pandasData['fare_amount']
# Plot with the 'o' format string: one marker per row, weekday on x, fare on y.
plt.plot(carac1, objet, 'o')
plt.xlabel("Característica dia de la semana")
plt.ylabel("Objetivo")

In [None]:
# How passengers are distributed across the hours: total passenger_count per
# hour as a horizontal bar chart.
# NOTE(review): legend='Reverse' is capitalised; pandas documents the lowercase
# value 'reverse' — confirm which behaviour was intended.
pandasData.groupby('hora')['passenger_count'].sum().plot(kind='barh',legend='Reverse',figsize=(10,10))
# Clear the x-axis label.
plt.xlabel('')

In [None]:
# Share of the total passenger count per hour, as a pie chart.
pandasData.passenger_count.groupby(pandasData.hora).sum().plot(kind='pie',cmap='Paired',figsize=(12,8))
# Equal axis scaling.
plt.axis('equal')

In [None]:
# Scatter of the features: pairwise plots of the fare and three of the
# coordinate columns, coloured by fare_amount.
caracteristicas=pandasData[['fare_amount','pickup_longitude','pickup_latitude','dropoff_longitude']]
sns.set(style="ticks", color_codes=True)
g= sns.pairplot(caracteristicas,hue='fare_amount',palette='Spectral')

In [20]:
# Stop the Spark context to close the session.
sc.stop()

time: 231 ms (started: 2021-05-24 19:18:18 -05:00)
