In [1]:
import pyspark
from pyspark.sql import SparkSession
app_name = "bubbly"
master = "local[*]"
spark = (SparkSession.builder
    .master(master)
    .config("spark.driver.cores", 1)
    .appName(app_name)
    .getOrCreate() )
sc = spark.sparkContext
print ('SparkContext created')

SparkContext created


In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, StringType

accidenteSchema = StructType ([
    StructField("Index", IntegerType(), True),
    StructField("FECHA", DateType(), True),
    StructField("RANGO HORARIO", StringType(), True),
    StructField("DIA SEMANA", StringType(), True),
    StructField("DISTRITO", StringType(), True),
    StructField("LUGAR ACCIDENTE", StringType(), True),
    StructField("Nº", StringType(), True),
    StructField("Nº PARTE", StringType(), True),
    StructField("CPFA Granizo", StringType(), True),
    StructField("CPFA Hielo", StringType(), True),
    StructField("CPFA Lluvia", StringType(), True),
    StructField("CPFA Niebla", StringType(), True),
    StructField("CPFA Seco", StringType(), True),
    StructField("CPFA Nieve", StringType(), True),
    StructField("CPSV Mojada", StringType(), True),
    StructField("CPSV Aceite", StringType(), True),
    StructField("CPSV Barro", StringType(), True),
    StructField("CPSV Grava Suelta", StringType(), True),
    StructField("CPSV Hielo", StringType(), True),
    StructField("CPSV Seca Y Limpia", StringType(), True),
    StructField("* Nº VICTIMAS", IntegerType(), True),
    StructField("TIPO ACCIDENTE", StringType(), True),
    StructField("Tipo Vehiculo", StringType(), True),
    StructField("TIPO PERSONA", StringType(), True),
    StructField("SEXO", StringType(), True),
    StructField("LESIVIDAD", StringType(), True),
    StructField("Tramo Edad", StringType(), True)])

#accidenteData = spark.read.csv ('hdfs://localhost:9000/user/ubuntu/accidentes/2018.csv',header=True)#,schema=accidenteSchema)
#accidenteData = spark.read.csv ('file:///home/jose/tmp/2018.csv',header=True,sep=",",schema=accidenteSchema)
accidenteData = spark.read.csv ('file:///home/ubuntu/Downloads/2018_Accidentalidad.csv',header=True,sep=",",schema=accidenteSchema)
#hdfs://localhost:9000/users/ubuntu/accidentes/2018.csv

#accidenteData = spark.read.format("csv")\
#             .option("delimiter",",")\
#             .option("header", "true")\
#             .schema(accidenteSchema)\
#             .option("quote","")\
#             .load('file:///home/ubuntu/Downloads/2018_Accidentalidad.csv')


In [3]:
accidenteData.printSchema()
#accidenteData.show()

root
 |-- Index: integer (nullable = true)
 |-- FECHA: date (nullable = true)
 |-- RANGO HORARIO: string (nullable = true)
 |-- DIA SEMANA: string (nullable = true)
 |-- DISTRITO: string (nullable = true)
 |-- LUGAR ACCIDENTE: string (nullable = true)
 |-- Nº: string (nullable = true)
 |-- Nº PARTE: string (nullable = true)
 |-- CPFA Granizo: string (nullable = true)
 |-- CPFA Hielo: string (nullable = true)
 |-- CPFA Lluvia: string (nullable = true)
 |-- CPFA Niebla: string (nullable = true)
 |-- CPFA Seco: string (nullable = true)
 |-- CPFA Nieve: string (nullable = true)
 |-- CPSV Mojada: string (nullable = true)
 |-- CPSV Aceite: string (nullable = true)
 |-- CPSV Barro: string (nullable = true)
 |-- CPSV Grava Suelta: string (nullable = true)
 |-- CPSV Hielo: string (nullable = true)
 |-- CPSV Seca Y Limpia: string (nullable = true)
 |-- * Nº VICTIMAS: integer (nullable = true)
 |-- TIPO ACCIDENTE: string (nullable = true)
 |-- Tipo Vehiculo: string (nullable = true)
 |-- TIPO PER

In [4]:
import pyspark.sql.functions as func

accRed=accidenteData.select(
        func.substring(accidenteData.LESIVIDAD,1,2).alias('lesividad'), \
        accidenteData["DIA SEMANA"].alias('diaStr'), \
        accidenteData["SEXO"].alias('sexo'), \
        accidenteData["RANGO HORARIO"].alias('tramoHora'), \
        accidenteData["Tramo Edad"].alias('tramoEdad'), \
        accidenteData.DISTRITO.alias('distrito'), \
        accidenteData['* Nº VICTIMAS'].cast(IntegerType()).alias('victimas')
)

accPorDia=accRed.filter(accRed.lesividad != "IL")\
                .groupBy('tramoEdad', 'diaStr','tramoHora','sexo')\
                .agg(func.sum('victimas')\
                .alias('TotalVictimas'))

In [5]:
accPorDia = accPorDia.withColumn("NumDia",func.when(accPorDia.diaStr == 'LUNES',1)
                                                 .when(accPorDia.diaStr == 'MARTES',2)
                                                 .when(accPorDia.diaStr == 'MIERCOLES',3)
                                                 .when(accPorDia.diaStr == 'JUEVES',4)
                                                 .when(accPorDia.diaStr == 'VIERNES',5)
                                                 .when(accPorDia.diaStr == 'SABADO',6)
                                                 .otherwise(7)).orderBy('NumDia')


In [6]:
#accPorDiaNum = accPorDia.withColumn("NumDia",diasSemana[accPorDia['DIA SEMANA']])
accPorDia.printSchema()
#accPorDia.show()


root
 |-- tramoEdad: string (nullable = true)
 |-- diaStr: string (nullable = true)
 |-- tramoHora: string (nullable = true)
 |-- sexo: string (nullable = true)
 |-- TotalVictimas: long (nullable = true)
 |-- NumDia: integer (nullable = false)



In [7]:
import pandas as pd
pdAccPorDia = accPorDia.toPandas()

dias = {
    'LUNES':1,
    'MARTES':2,
    'MIERCOLES':3,
    'JUEVES':4,
    'VIERNES':5,
    'SABADO':6,
    'DOMINGO':7
}

In [26]:
i=1
for row in pdAccPorDia:
    print (row)
    

tramoEdad
diaStr
tramoHora
sexo
TotalVictimas
NumDia


In [8]:
pdAccPorDia['dia'] = pdAccPorDia.apply(lambda row: dias[row.diaStr], axis=1)

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [None]:
horas={
    'DE 00:00 A 00:59':0,
    'DE 1:00 A 1:59':1,
    'DE 2:00 A 2:59':2,
    'DE 3:00 A 3:59':3,
    'DE 4:00 A 4:59':4,
    'DE 5:00 A 5:59':5,
    'DE 6:00 A 6:59':6,
    'DE 7:00 A 7:59':7,
    'DE 8:00 A 8:59':8,
    'DE 9:00 A 9:59':9,
    'DE 10:00 A 10:59':10,
    'DE 11:00 A 11:59':11,
    'DE 12:00 A 12:59':12,
    'DE 13:00 A 13:59':13,
    'DE 14:00 A 14:59':14,
    'DE 15:00 A 15:59':15,
    'DE 16:00 A 16:59':16,
    'DE 17:00 A 17:59':17,
    'DE 18:00 A 18:59':18,
    'DE 19:00 A 19:59':19,
    'DE 20:00 A 20:59':20,
    'DE 21:00 A 21:59':21,
    'DE 22:00 A 22:59':22,
    'DE 23:00 A 23:59':23,
}
pdAccPorDia['hora'] = pdAccPorDia.apply(lambda row: horas[row.tramoHora], axis=1)

In [None]:
edades={
    "DE 0 A 5 AÑOS":5,
    "DE 6 A 9 AÑOS ":9,
    "DE 10 A 14 AÑOS":14,
    "DE 15 A 17 AÑOS":17,
    "DE 18 A 20 AÑOS":20,
    "DE 21 A 24 AÑOS":24,
    "DE 25 A 29 AÑOS":29,
    "DE 30 A 34 ANOS":34,
    "DE 35 A 39 AÑOS":39,
    "DE 40 A 44 AÑOS":44,
    "DE 45 A 49 AÑOS":49,
    "DE 50 A 54 AÑOS":54,
    "DE 55 A 59 AÑOS":59,
    "DE 60 A 64 AÑOS":64,
    "DE 65 A 69 AÑOS":69,
    "DE 70 A 74 AÑOS":74,
    "DE MAS DE 74 AÑOS ":99 ,
    "DESCONOCIDA":0
}
pdAccPorDia['edad'] = pdAccPorDia.apply(lambda row: edades[row.tramoEdad], axis=1)
pdAccPorDia = pdAccPorDia.sort_values (by=['edad','dia','hora'])

In [None]:
pdAccPorDia.head(8)

In [None]:
from plotly.offline import iplot, plot
from bubbly.bubbly import bubbleplot 


figure = bubbleplot(dataset=pdAccPorDia, x_column='hora', y_column='dia', 
    bubble_column='sexo', time_column = 'edad', size_column='TotalVictimas', #color_column='sexo', 
    x_title="Hora del día", y_title="Día de la semana", title='Accidentes ciudad de Madrid',
    scale_bubble=3, height=650)
#iplot(figure, config={'scrollzoom': True}) 
plot(figure)