In [1]:
# medir tiempos
%load_ext autotime

time: 0 ns (started: 2021-06-08 13:59:39 -05:00)


![output](tiempo_secuencial.png)

### Importando la data usando Pyspark 

In [1]:
# Modulo para encontrar pyspark
import findspark
#findspark.init("/usr/local/spark/spark-3.1.1-bin-hadoop2.7")    #para linux
findspark.init()                                                 #para windows

In [2]:
# importamos pyspark
from pyspark import SparkConf, SparkContext
# Variable de configuración
conf = SparkConf().setMaster("local[*]").setAppName("ModeloML")
# iniciamos un contexto spark (solo se ejecuta uno. Para ejecutar otra vez , reiniciar el kernel)
sc = SparkContext(conf = conf)
sc

#### DataFrame spark

In [3]:
from pyspark.sql.types import StringType
from pyspark import SQLContext
# le pasamos el contexto anterior
sqlContext = SQLContext(sc)
dfspark = sqlContext.read.format('csv').option("header","true").option("inferSchema","true").load('train.csv')

In [8]:
# Esquema de los datos
dfspark.printSchema()

root
 |-- key: string (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- passenger_count: integer (nullable = true)



In [9]:
# Total de cantidad de datos
cant_total=dfspark.count()
cant_total

55423856

In [10]:
# Tomando una muestra del 4% del total
dfspark_sample = dfspark.sample(fraction = 0.04, withReplacement = False, seed=0)

##### Apartir de aqui trabajamos con una muestra.

In [11]:
# Cantidad de datos para la muestra
cant_muestra=dfspark_sample.count()
cant_muestra

2219408

### Limpieza de la data

Se procederá a eliminar la columna con la característica "key", debido a que contiene datos innecesarios para lograr el objetivo.

In [12]:
dfspark_sample = dfspark_sample.drop('key')

###### Eliminando Valores Nulos de la tabla

In [13]:
# fare_amount (costo de viaje) no nulos
dfspark_sample = dfspark_sample.filter("fare_amount is not NULL")
# passenger (número de pasajeros) no nulos
dfspark_sample = dfspark_sample.filter("passenger_count is not NULL")
# pickup_datetime (fecha y hora de incio de viaje) no nulos
dfspark_sample = dfspark_sample.filter("pickup_datetime is not NULL")
# pickup (longitud y latitud de inicio de viaje) no nulos
dfspark_sample = dfspark_sample.filter("pickup_longitude is not NULL")
dfspark_sample = dfspark_sample.filter("pickup_latitude is not NULL")
# dropoff (longitud y laitud de fin de viaje) no nulos
dfspark_sample = dfspark_sample.filter("dropoff_longitude is not NULL")
dfspark_sample = dfspark_sample.filter("dropoff_latitude is not NULL")

In [14]:
ncant_muestra=dfspark_sample.count()
ncant_muestra

2219390

###### Eliminado valores nan y duplicados

In [15]:
# tabla sin valores nan, sin duplicados
dfspark_sample=dfspark_sample.na.drop().dropDuplicates()
# cantidad de  data sin nulos ni nan
cantnn_muestra=dfspark_sample.count()
print("Cantidad de filas eliminadas de la muestra: ",cant_muestra-cantnn_muestra)

Cantidad de filas eliminadas de la muestra:  18


In [16]:
# que la data persista en memoria, acelera algunos procesos.
dfspark_sample.persist()

DataFrame[fare_amount: double, pickup_datetime: string, pickup_longitude: double, pickup_latitude: double, dropoff_longitude: double, dropoff_latitude: double, passenger_count: int]

### Estadísticas

In [17]:
import numpy as np
import pandas as pd

In [18]:
# casteamos a pandas
summary=dfspark_sample.describe(["pickup_longitude",
                                 "pickup_latitude",
                                 "dropoff_longitude",
                                 "dropoff_latitude",
                                 "passenger_count",
                                 "fare_amount"]).toPandas()

In [19]:
summary

Unnamed: 0,summary,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,fare_amount
0,count,2219390.0,2219390.0,2219390.0,2219390.0,2219390.0,2219390.0
1,mean,-72.50355231460159,39.91779369363991,-72.50697030839864,39.91850726486102,1.6851959322156087,11.352503967306385
2,stddev,13.098889190132796,9.821243657749829,13.244549098861285,10.317917931047642,1.3377753769552685,42.44902507908096
3,min,-3366.527908,-3488.079513,-3366.527908,-3488.079513,0.0,-52.0
4,max,2497.117435,2964.163855,3211.57975,3333.304575,208.0,61550.86


In [20]:
# Agrupación por cantidad de pasajeros.
dfspark_sample.groupBy("passenger_count").count().show()

+---------------+-------+
|passenger_count|  count|
+---------------+-------+
|              1|1535882|
|              6|  47132|
|              3|  97242|
|              5| 157254|
|              4|  47158|
|              7|      1|
|              2| 326983|
|              0|   7734|
|            208|      4|
+---------------+-------+



### Observaciones :
1. Valores de passenger_count imposibles , como 34,49,51,129,208.
2. Precios demasiados elevados debido a la cantidad de pasajeros y negativo(imposible).
3. Cantidad de datos en la que el precio es menor a o igual a 0

In [21]:
# mínimo y máximo para longitud del subconjunto de datos
long_min_i=dfspark_sample.agg({'pickup_longitude': 'min'}).show()
long_max_i=dfspark_sample.agg({'pickup_longitude': 'max'}).show()
long_min_f=dfspark_sample.agg({'dropoff_longitude': 'min'}).show()
long_max_f=dfspark_sample.agg({'dropoff_longitude': 'max'}).show()

# mínimo y máximo para para latitud del subconjuntos de datos
lat_min_i=dfspark_sample.agg({'pickup_latitude': 'min'}).show()
lat_max_i=dfspark_sample.agg({'pickup_latitude': 'max'}).show()
lat_min_f=dfspark_sample.agg({'dropoff_latitude': 'min'}).show()
lat_max_f=dfspark_sample.agg({'dropoff_latitude': 'max'}).show()

+---------------------+
|min(pickup_longitude)|
+---------------------+
|         -3366.527908|
+---------------------+

+---------------------+
|max(pickup_longitude)|
+---------------------+
|          2497.117435|
+---------------------+

+----------------------+
|min(dropoff_longitude)|
+----------------------+
|          -3366.527908|
+----------------------+

+----------------------+
|max(dropoff_longitude)|
+----------------------+
|            3211.57975|
+----------------------+

+--------------------+
|min(pickup_latitude)|
+--------------------+
|        -3488.079513|
+--------------------+

+--------------------+
|max(pickup_latitude)|
+--------------------+
|         2964.163855|
+--------------------+

+---------------------+
|min(dropoff_latitude)|
+---------------------+
|         -3488.079513|
+---------------------+

+---------------------+
|max(dropoff_latitude)|
+---------------------+
|          3333.304575|
+---------------------+



### Observaciones:
1. Valores para longitud imposibles, ya que longitud varía entre -90 y 90
2. Valores para latitud imposibles, ya que latitud varía entre -180 y 180

# Transformación de la data

In [22]:
# Seleccionar passenger_count de 0-9
dfspark_sample = dfspark_sample.filter("passenger_count < 10")
# Selecionar fare_amount mayor a 0
dfspark_sample = dfspark_sample.filter("fare_amount >= 0")

In [23]:
dfspark_sample.groupBy("passenger_count").count().show()

+---------------+-------+
|passenger_count|  count|
+---------------+-------+
|              1|1535823|
|              6|  47129|
|              3|  97238|
|              5| 157246|
|              4|  47155|
|              7|      1|
|              2| 326972|
|              0|   7734|
+---------------+-------+



In [24]:
# Filtrando valores grandes para longitud, de tal manera que solo se considerará valores correctos.
dfspark_sample = dfspark_sample.filter("pickup_longitude < 180 and pickup_longitude > -180" )
dfspark_sample = dfspark_sample.filter("dropoff_longitude < 180 and dropoff_longitude > -180")
dfspark_sample = dfspark_sample.filter("pickup_latitude < 90 and pickup_latitude > -90" )
dfspark_sample = dfspark_sample.filter("dropoff_latitude < 90 and dropoff_latitude > -90")

In [25]:
summary_new=dfspark_sample.describe(["pickup_longitude",
                                 "pickup_latitude",
                                 "dropoff_longitude",
                                 "dropoff_latitude"]).toPandas()

In [26]:
summary_new

Unnamed: 0,summary,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,count,2219173.0,2219173.0,2219173.0,2219173.0
1,mean,-72.49600158477304,39.91679455251933,-72.50229931133802,39.91968272022599
2,stddev,10.463045755203852,6.112153680008393,10.439016533276371,6.104146217099324
3,min,-121.9149932861328,-74.017222,-121.9151840209961,-74.177303
4,max,40.840962,74.007413,73.93996,74.95


In [27]:
# Agregamos columnas de diferencias.
from pyspark.sql.functions import abs
dfspark_sample = dfspark_sample.withColumn("dif_latitude",
                                           abs(dfspark_sample['dropoff_latitude']-dfspark_sample['pickup_latitude']))
dfspark_sample = dfspark_sample.withColumn("dif_longitude",
                                           abs(dfspark_sample['dropoff_longitude']-dfspark_sample['pickup_longitude']))

In [28]:
# Creamos la función para hallar la distancia entre dos puntos geográficos
import math
from pyspark.sql.functions import udf, array, col
from pyspark.sql.types import FloatType

def haversine(x):
    lat1=x[0]
    lon1=x[1]
    lat2=x[2]
    lon2=x[3]
    
    rad=math.pi/180
    dlat=lat2-lat1
    dlon=lon2-lon1
    R=6372.795477598
    a=(math.sin(rad*dlat/2))**2 + math.cos(rad*lat1)*math.cos(rad*lat2)*(math.sin(rad*dlon/2))**2
    distancia=2*R*math.asin(math.sqrt(a))
    return distancia

distancia_udf = udf(lambda z: haversine(z), FloatType())
#spark.udf.register('distancia_udf', distancia_udf)
dfspark_sample = dfspark_sample.withColumn('distancia', distancia_udf(array('pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude')))                             

In [29]:
dfspark_sample.select(col('distancia')).show()

+----------+
| distancia|
+----------+
| 1.8683056|
| 1.8191968|
|  1.145462|
| 1.0793539|
| 1.1367035|
|  2.424119|
|  3.829884|
| 1.0900514|
| 5.5276585|
|  3.122764|
| 2.3627946|
| 5.3667502|
| 1.3431213|
|  9.317738|
|  5.013023|
| 4.6022677|
| 1.1546545|
| 20.964258|
| 0.8852747|
|0.17227533|
+----------+
only showing top 20 rows



#### Crear dos columnas día de la semana y hora del viaje.

In [30]:
# funciones que me ayudarán en la transformación.
from datetime import datetime, date, time, timedelta
import calendar
def dia(dia):
    if dia == 1:
        return 'lunes'
    if dia == 2:
        return 'martes'
    if dia == 3:
        return 'miércoles'
    if dia == 4:
        return 'jueves'
    if dia == 5:
        return 'viernes'
    if dia == 6:
        return 'sábado'
    if dia == 7:
        return 'domingo'
    if dia < 1 or dia > 7:
        return 

from pyspark.sql import Row

def dia_semana(row):
    fecha , hora , utc = row.split(" ")
    formato_fecha = "%Y-%m-%d"
    dia_semana = datetime.isoweekday(datetime.strptime(fecha,formato_fecha))
    return dia_semana

def hora(row):
    fecha , hora , utc = row.split(" ")
    formato_hora = "%H:%M:%S"
    hora = datetime.strptime(hora,formato_hora).hour
    return hora
    

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
# convirtiendo las funciones en funciones UDF
udf_dia_semana= udf( lambda z : dia_semana(z))
udf_hora= udf( lambda z : hora(z))

In [31]:
from pyspark.sql.functions import col
dfspark_sample = dfspark_sample.withColumn('dia_semana', 
                                           udf_dia_semana(dfspark_sample['pickup_datetime'] )  )
dfspark_sample = dfspark_sample.withColumn('hora', 
                                           udf_hora(dfspark_sample['pickup_datetime'] )  )

In [32]:
# Castear dia de la semana y hora
dfspark_sample = dfspark_sample.withColumn("dia_semana",
                                           dfspark_sample["dia_semana"].cast("Integer"))
dfspark_sample = dfspark_sample.withColumn("hora",
                                           dfspark_sample["hora"].cast("Integer"))

In [33]:
dfspark_sample.printSchema()

root
 |-- fare_amount: double (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- dif_latitude: double (nullable = true)
 |-- dif_longitude: double (nullable = true)
 |-- distancia: float (nullable = true)
 |-- dia_semana: integer (nullable = true)
 |-- hora: integer (nullable = true)



##### Observación: En este punto la data estaria totalmente ok.

### Estadística de las nuevas variables

In [34]:
clean_summary=dfspark_sample.describe().toPandas()

In [35]:
columnas = ["summary",
            "pickup_longitude",
            "pickup_latitude",
            "dropoff_longitude",
            "dropoff_latitude",
            "passenger_count",
            "fare_amount",
            "dif_latitude",
            "dif_longitude",
            "dia_semana",
            "hora",
            "distancia"]
clean_summary[columnas]

Unnamed: 0,summary,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,fare_amount,dif_latitude,dif_longitude,dia_semana,hora,distancia
0,count,2219173.0,2219173.0,2219173.0,2219173.0,2219173.0,2219173.0,2219173.0,2219173.0,2219173.0,2219173.0,2219173.0
1,mean,-72.49600158477304,39.91679455251933,-72.50229931133802,39.91968272022599,1.6848177226381178,11.353258637339264,0.0922919495213544,0.1617365203322726,4.042932660049487,13.512919452426647,19.38368660013694
2,stddev,10.463045755203852,6.112153680008393,10.439016533276371,6.104146217099324,1.308775977893193,42.45082368146176,1.699312422647234,3.198654331717892,1.9490730146054265,6.517990756130113,365.85874227507634
3,min,-121.9149932861328,-74.017222,-121.9151840209961,-74.177303,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,max,40.840962,74.007413,73.93996,74.95,7.0,61550.86,73.961092,89.483332,7.0,23.0,9952.922


### Correlación de los atributos

In [36]:
col_old=["pickup_longitude",
         "pickup_latitude",
         "dropoff_longitude",
         "dropoff_latitude",
         "passenger_count"]
col_new=["dif_latitude",
         "dif_longitude",
         "dia_semana",
         "hora"]
col_pred = ["fare_amount"]

In [37]:
# Transformación en un vector
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
inputCols = col_old+col_new+col_pred
assembler = VectorAssembler( inputCols=inputCols, outputCol="col_corr")
dfspark_corr = assembler.transform(dfspark_sample)

In [38]:
dfspark_corr=dfspark_corr.select('col_corr')

In [39]:
# estudiar la correlación
from pyspark.ml.stat import Correlation

dfspark_corr= Correlation.corr(dfspark_corr, 'col_corr','pearson')

In [40]:
from pyspark.ml.linalg import DenseMatrix, Vectors
#type(dfspark_cor.collect()[0][0]) # denseMatrix
# Pasamos la matrix como un array.
array_corr=dfspark_corr.collect()[0][0].toArray()

In [41]:
pdf_corr= pd.DataFrame(array_corr, columns=inputCols, index=inputCols)
mask = ~(pdf_corr>-0.3) | ~(pdf_corr<0.3)
round(pdf_corr,10).style.background_gradient()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dif_latitude,dif_longitude,dia_semana,hora,fare_amount
pickup_longitude,1.0,-0.979316,0.953046,-0.94027,0.000166,0.144987,0.154172,0.002333,-0.002262,0.002182
pickup_latitude,-0.979316,1.0,-0.940902,0.961188,-0.001066,-0.136213,-0.126502,-0.00378,0.002227,-0.002081
dropoff_longitude,0.953046,-0.940902,1.0,-0.980122,0.000169,0.133518,0.138657,0.0027,-0.002691,0.002457
dropoff_latitude,-0.94027,0.961188,-0.980122,1.0,-0.001481,-0.126487,-0.116139,-0.004007,0.002376,-0.00241
passenger_count,0.000166,-0.001066,0.000169,-0.001481,1.0,-0.002589,-0.001395,0.035591,0.017146,0.002861
dif_latitude,0.144987,-0.136213,0.133518,-0.126487,-0.002589,1.0,0.923415,-0.001876,-0.000871,0.007177
dif_longitude,0.154172,-0.126502,0.138657,-0.116139,-0.001395,0.923415,1.0,-0.002651,-0.00096,0.006947
dia_semana,0.002333,-0.00378,0.0027,-0.004007,0.035591,-0.001876,-0.002651,1.0,-0.087571,-1e-05
hora,-0.002262,0.002227,-0.002691,0.002376,0.017146,-0.000871,-0.00096,-0.087571,1.0,-0.004456
fare_amount,0.002182,-0.002081,0.002457,-0.00241,0.002861,0.007177,0.006947,-1e-05,-0.004456,1.0


### Observación:
###### La variable a predecir fare_amount itnee muy baja correlación con las demás variables. Su mayor correlación es con passenger_count y hora. Pero si observamos correlación entre las demás variables. Definitivamente no podemos utilizar un modelo lineal.

### Visualización de la data.

In [42]:
# Pasamos toda la data a Pandas
pandasData = dfspark_sample.toPandas()

In [None]:
#Manera alternativa de pasar la data a Pandas
# import pandas as pd
# from pyspark.sql import DataFrame

# # Wrapper for seamless Spark's serialisation
# def spark_to_pandas(spark_df: DataFrame) -> pd.DataFrame:
#     """
#     PySpark toPandas realisation using mapPartitions
#     much faster than vanilla version
#     fork: https://gist.github.com/lucidyan/1e5d9e490a101cdc1c2ed901568e082b
#     origin: https://gist.github.com/joshlk/871d58e01417478176e7
#     :param spark_df:
#     :return:
#     """
    
#     def _map_to_pandas(rdds) -> list:
#         """ Needs to be here due to pickling issues """
#         return [pd.DataFrame(list(rdds))]

#     def _to_pandas(df: DataFrame, n_partitions: int = None) -> pd.DataFrame:
#         """
#         Returns the contents of `df` as a local `pandas.DataFrame` in a speedy fashion. The DataFrame is
#         repartitioned if `n_partitions` is passed.
#         :param df:
#         :param n_partitions:
#         :return:
#         """
#         if n_partitions is not None:
#             df = df.repartition(n_partitions)
#         df_pand = df.rdd.mapPartitions(_map_to_pandas).collect()  # type: pd.DataFrame
#         df_pand = pd.concat(df_pand)
#         df_pand.columns = df.columns
#         return df_pand

#     return _to_pandas(spark_df)

# pandasData = spark_to_pandas(dfspark_sample)

In [None]:
#Mostrando la data
display(pandasData)

In [None]:
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sns.set(style="ticks" , color_codes = True)
var = ["dia_semana"]
# 1 inches = 96px
g = sns.pairplot(pandasData[:10], vars=var, diag_kind="hist", hue='dia_semana', height=4, aspect=4)

In [None]:
#Cuanto de la variable objetivo va variando segun los dias de la semana
import matplotlib.pyplot as plt

carac1=pandasData['dia_semana']
objet=pandasData['fare_amount']
plt.plot(carac1, objet, 'o')
plt.xlabel("Característica dia de la semana")
plt.ylabel("Objetivo")

In [None]:
#como los pasajeros se distribuyen a traves de las horas
pandasData.groupby('hora')['passenger_count'].sum().plot(kind='barh',legend='Reverse',figsize=(10,10))
plt.xlabel('')

In [None]:
#como se reparten los pasajeros en funcion de la hora
pandasData.passenger_count.groupby(pandasData.hora).sum().plot(kind='pie',cmap='Paired',figsize=(12,8))
plt.axis('equal')

In [None]:
#Dispersion de las características
caracteristicas=pandasData[['fare_amount','pickup_longitude','pickup_latitude','dropoff_longitude']]
sns.set(style="ticks", color_codes=True)
g= sns.pairplot(caracteristicas,hue='fare_amount',palette='Spectral')

In [None]:
g=sns.lmplot(x='passenger_count',y='hora',data=caracteristicas,palette='Set1')

In [None]:
# Relación entre dos caracteristicas vista como tendencia lineal
g=sns.lmplot(x='passenger_count',y='hora',hue='dia_semana',data=caracteristicas,palette='Set1')

### Exportamos  La Dta  en un formato CSV

In [43]:
#exportar la data a un formato csv
pandasData.to_csv("newDatapandas.csv",encoding = 'utf-8',index = False)

In [44]:
!pip install graphviz



## Regresion Lineal

In [1]:
#paquetes pára la creacion de arbol
#y manipulacion de archivos
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

#para poder graficar el arbol de decision
from sklearn.tree import export_graphviz
import graphviz
import matplotlib.pyplot as plt
import numpy as np


from sklearn.metrics import mean_squared_error as MSE

In [2]:
new_data =  pd.read_csv("newDatapandas.csv")

In [3]:
#verificamos la data
new_data

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dif_latitude,dif_longitude,distancia,dia_semana,hora
0,8.00,2015-03-12 23:14:59 UTC,-73.993141,40.727940,-73.996613,40.744530,2,0.016590,0.003471,1.868306,4,23
1,10.00,2013-08-21 08:38:06 UTC,-73.964837,40.769933,-73.983462,40.761655,1,0.008278,0.018625,1.819197,3,8
2,5.50,2014-01-23 18:40:00 UTC,-74.001017,40.746352,-73.990873,40.739497,1,0.006855,0.010144,1.145462,4,18
3,6.10,2011-12-24 14:03:24 UTC,-73.982433,40.768137,-73.989684,40.776138,1,0.008001,0.007251,1.079354,6,14
4,5.00,2012-10-14 23:24:00 UTC,-73.990358,40.740377,-74.000850,40.733955,1,0.006422,0.010492,1.136704,7,23
...,...,...,...,...,...,...,...,...,...,...,...,...
2219168,5.70,2009-11-16 10:36:05 UTC,-73.991241,40.744892,-73.977219,40.755458,1,0.010566,0.014022,1.666461,1,10
2219169,2.50,2014-02-26 19:44:39 UTC,-73.944647,40.751585,-73.944613,40.751624,1,0.000039,0.000034,0.005198,3,19
2219170,3.30,2012-01-06 19:33:00 UTC,-74.005272,40.727898,-74.000565,40.728908,1,0.001010,0.004707,0.412344,5,19
2219171,49.57,2010-12-13 19:11:00 UTC,-73.785532,40.648295,-73.980030,40.760702,5,0.112407,0.194498,20.622059,1,19


In [4]:
#regresion lineal simple

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

# Preprocesado y modelado
# ==============================================================================
from scipy.stats import pearsonr

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [5]:
#se entablece las variables predictoras y obejtivos
# se establece las columnas
columnas_float = ["pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude",
                 "passenger_count","dif_latitude","dif_longitude","dia_semana", "hora"]
#extraccion de datos predictores
predictors = new_data[columnas_float]
#extraccion de dato objetivo
target = new_data["fare_amount"]

In [6]:
#Se indican las etiquetas de las variable predictoras y objetivos
predictors_label = ["pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude",
                 "passenger_count","dif_latitude","dif_longitude","dia_semana", "hora"]

target_label =["fare_amount"]


In [7]:
#Crealas varibales para el entrenamiento del arbol y las varibales de prueba
#split data ito 80% train and 205 test
X_train, X_test, y_train, y_test = train_test_split(predictors,target,test_size =0.2,random_state =3)

In [8]:
modelo= LinearRegression()
modelo.fit(X_train, y_train)

LinearRegression()

In [9]:
# Información del modelo
# ==============================================================================
print("Intercept:", modelo.intercept_)
print("*"*20)
print("Coeficiente:", list(zip(predictors.columns, modelo.coef_.flatten(), )))
print("*"*20)
print("Coeficiente de determinación R^2:", modelo.score(predictors, target))


Intercept: 12.022867125515502
********************
Coeficiente: [('pickup_longitude', 0.01035106792875126), ('pickup_latitude', 0.045202582525908336), ('dropoff_longitude', -0.009432561889370139), ('dropoff_latitude', -0.05272154785493377), ('passenger_count', 0.091047889521833), ('dif_latitude', 0.13218122117422187), ('dif_longitude', 0.025501763534595438), ('dia_semana', -0.013999422034540106), ('hora', -0.03041801194768527)]
********************
Coeficiente de determinación R^2: 8.475222682480243e-05


In [10]:
#Una vez entrenado el modelo se envia la capacidad predictiva empleando el conjunto test


# Error de test del modelo 
# ==============================================================================
predicciones = modelo.predict(X_test)
print(predicciones[0:10,])

rmse = mean_squared_error(
        y_true  = y_test,
        y_pred  = predicciones,
        squared = False
       )
print("")
print(f"El error (rmse) de test es: {rmse}")

[11.28684855 11.51851139 11.24084948 11.82788421 11.0391551  11.42507307
 11.20018212 11.67110882 11.5474798  11.84089346]

El error (rmse) de test es: 9.765045399049507


### Regresion lineal multiple

In [11]:
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

# Preprocesado y modelado
# ==============================================================================
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
from scipy import stats

In [12]:
# Configuración matplotlib
# ==============================================================================
plt.rcParams['image.cmap'] = "bwr"
#plt.rcParams['figure.dpi'] = "100"
plt.rcParams['savefig.bbox'] = "tight"
style.use('ggplot') or plt.style.use('ggplot')

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

### Ajuste del modelo

In [13]:
X_train = sm.add_constant(X_train, prepend=True)
modelo = sm.OLS(y_train,X_train,)
modelo = modelo.fit()
print(modelo.summary())

                            OLS Regression Results                            
Dep. Variable:            fare_amount   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     13.56
Date:                Wed, 09 Jun 2021   Prob (F-statistic):           5.01e-22
Time:                        13:26:05   Log-Likelihood:            -9.3622e+06
No. Observations:             1775338   AIC:                         1.872e+07
Df Residuals:                 1775328   BIC:                         1.872e+07
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                12.0229      0.28

El modelo con toadas las variables predictoras introducidas como predictores tiene R^2 bajo 0.001, que es capz de explicar solo el 0.1% de la variabilidad obeservada en el precio el p-value del modelo es (2.91e-254)


Acorde al p-value obtenido para coeficientes parcial de regresion de dia_semana es de (0.319), esta variable no contribuye de manera sigbificativa al modelo. procedemos a entrenar de nuevo pero excluyendo el predicto  dia_semana

In [14]:
X_train = X_train.drop(columns = 'dia_semana')
X_test  = X_test.drop(columns = 'dia_semana')

# A la matriz de predictores se le tiene que añadir una columna de 1s para el
# intercept del modelo
X_train = sm.add_constant(X_train, prepend=True)
modelo  = sm.OLS(endog=y_train, exog=X_train,)
modelo  = modelo.fit()
print(modelo.summary())

                            OLS Regression Results                            
Dep. Variable:            fare_amount   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     15.19
Date:                Wed, 09 Jun 2021   Prob (F-statistic):           1.63e-22
Time:                        13:26:09   Log-Likelihood:            -9.3622e+06
No. Observations:             1775338   AIC:                         1.872e+07
Df Residuals:                 1775329   BIC:                         1.872e+07
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                11.9623      0.27

### Intevalos de confianza de los coficientes 

In [15]:
intervalos_ci = modelo.conf_int(alpha=0.05)
intervalos_ci.columns = ['2.5%', '97.5%']
intervalos_ci

Unnamed: 0,2.5%,97.5%
const,11.424531,12.500056
pickup_longitude,-0.046799,0.067635
pickup_latitude,-0.061428,0.152043
dropoff_longitude,-0.067523,0.048684
dropoff_latitude,-0.161013,0.055663
passenger_count,0.037216,0.143334
dif_latitude,0.02279,0.241557
dif_longitude,-0.033056,0.0841
hora,-0.040704,-0.019394


### Diagnostico de los residuos

In [16]:
y_1_train =  y_train.to_numpy()
y_1_train = y_1_train.flatten()
prediccion_train = modelo.predict(exog = X_train)
residuos_train   = prediccion_train - y_1_train

In [None]:
# Gráficos
# ==============================================================================
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(9, 8))

axes[0, 0].scatter(y_1_train, prediccion_train, edgecolors=(0, 0, 0), alpha = 0.4)
axes[0, 0].plot([y_1_train.min(), y_1_train.max()], [y_1_train.min(), y_1_train.max()],
                'k--', color = 'black', lw=2)
axes[0, 0].set_title('Valor predicho vs valor real', fontsize = 10, fontweight = "bold")
axes[0, 0].set_xlabel('Real')
axes[0, 0].set_ylabel('Predicción')
axes[0, 0].tick_params(labelsize = 7)

axes[0, 1].scatter(list(range(len(y_1_train))), residuos_train,
                   edgecolors=(0, 0, 0), alpha = 0.4)
axes[0, 1].axhline(y = 0, linestyle = '--', color = 'black', lw=2)
axes[0, 1].set_title('Residuos del modelo', fontsize = 10, fontweight = "bold")
axes[0, 1].set_xlabel('id')
axes[0, 1].set_ylabel('Residuo')
axes[0, 1].tick_params(labelsize = 7)

sns.histplot(
    data    = residuos_train,
    stat    = "density",
    kde     = True,
    line_kws= {'linewidth': 1},
    color   = "firebrick",
    alpha   = 0.3,
    ax      = axes[1, 0]
)

axes[1, 0].set_title('Distribución residuos del modelo', fontsize = 10,
                     fontweight = "bold")
axes[1, 0].set_xlabel("Residuo")
axes[1, 0].tick_params(labelsize = 7)


sm.qqplot(
    residuos_train,
    fit   = True,
    line  = 'q',
    ax    = axes[1, 1], 
    color = 'firebrick',
    alpha = 0.4,
    lw    = 2
)
axes[1, 1].set_title('Q-Q residuos del modelo', fontsize = 10, fontweight = "bold")
axes[1, 1].tick_params(labelsize = 7)

axes[2, 0].scatter(prediccion_train, residuos_train,
                   edgecolors=(0, 0, 0), alpha = 0.4)
axes[2, 0].axhline(y = 0, linestyle = '--', color = 'black', lw=2)
axes[2, 0].set_title('Residuos del modelo vs predicción', fontsize = 10, fontweight = "bold")
axes[2, 0].set_xlabel('Predicción')
axes[2, 0].set_ylabel('Residuo')
axes[2, 0].tick_params(labelsize = 7)

# Se eliminan los axes vacíos
fig.delaxes(axes[2,1])

fig.tight_layout()
plt.subplots_adjust(top=0.9)
fig.suptitle('Diagnóstico residuos', fontsize = 12, fontweight = "bold");

Los residuos parecen distyribuirse de forma aleatoria en torno a cero  mantenindo aporximandamente la misma variabilidada lo largo del eje x
por lo tanto se pudria afirmar que tiene una distrbucion normal

### Test de Normalidad

Se puede comprobar si mlos residuos siguen una disctribucion normal empleando  dos test estadicticos: Shapiro-Wilk test y D'Agostino's K-squared test

In [None]:
# Normalidad de los residuos Shapiro-Wilk test
# ==============================================================================
shapiro_test = stats.shapiro(residuos_train)
shapiro_test


In [None]:
# Normalidad de los residuos D'Agostino's K-squared test
# ==============================================================================
k2, p_value = stats.normaltest(residuos_train)
print(f"Estadítico= {k2}, p-value = {p_value}")

Ambos datos muestran claras enviedia que los datos se distribuyen de manera normal p-value<<0.01

## Predicciones

Una vez entrenado el modelo, se  puede obtenewr predicciones para nuevos datos. los modelos de Stastmodels permiten calcular los intevalors de confianza de cada prediccion.


In [None]:
predicciones = modelo.get_prediction(X_train).summary_frame(alpha=0.05)
predicciones.head(4)

In [None]:
# Error de test del modelo 
# ==============================================================================
X_test = sm.add_constant(X_test, prepend=True)
predicciones = modelo.predict(exog = X_test)
rmse = mean_squared_error(
        y_true  = y_test,
        y_pred  = predicciones,
        squared = False
       )
print("")
print(f"El error (rmse) de test es: {rmse}")


### Interpretaciòn
El modelo de regresion lineal multiple :

    costo =12.11337 +pickup_longitude*0.011708 +pickup_latitude*0.0082 - dropoff_longitude*0.00457 -dropoff_latitude*0.01018+passenger_count*0.40222-dif_latitude*0.00337+dif_longitude*0.015372 + hora*0.02547

## KNN

In [None]:
# cerramos la sesión spark
sc.stop()