In [142]:
import pyspark
from pyspark.sql import SparkSession
# Settings for the Spark session used throughout this notebook.
app_name = "bubbly"
master = "local[*]"

# Build the session (or reuse one that already exists) and expose its
# SparkContext under the conventional short name `sc`.
spark = (
    SparkSession.builder
    .master(master)
    .config("spark.driver.cores", 1)
    .appName(app_name)
    .getOrCreate()
)
sc = spark.sparkContext
print('SparkContext created')

SparkContext created


In [143]:
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, StringType

# Column layout for the accident CSV, written as (column name, Spark type)
# pairs; every field is declared nullable.
# NOTE(review): the reader below does not receive this schema (its schema=
# argument is commented out), and the schema printed for the loaded file has
# no "Index" column -- confirm the layout before enabling it.
accidenteSchema = StructType([
    StructField(nombre, tipo(), True)
    for nombre, tipo in [
        ("Index", IntegerType),
        ("FECHA", DateType),
        ("RANGO HORARIO", StringType),
        ("DIA SEMANA", StringType),
        ("DISTRITO", StringType),
        ("LUGAR ACCIDENTE", StringType),
        ("Nº", StringType),
        ("Nº PARTE", StringType),
        ("CPFA Granizo", StringType),
        ("CPFA Hielo", StringType),
        ("CPFA Lluvia", StringType),
        ("CPFA Niebla", StringType),
        ("CPFA Seco", StringType),
        ("CPFA Nieve", StringType),
        ("CPSV Mojada", StringType),
        ("CPSV Aceite", StringType),
        ("CPSV Barro", StringType),
        ("CPSV Grava Suelta", StringType),
        ("CPSV Hielo", StringType),
        ("CPSV Seca Y Limpia", StringType),
        ("* Nº VICTIMAS", IntegerType),
        ("TIPO ACCIDENTE", StringType),
        ("Tipo Vehiculo", StringType),
        ("TIPO PERSONA", StringType),
        ("SEXO", StringType),
        ("LESIVIDAD", StringType),
        ("Tramo Edad", StringType),
    ]
])

# Load the 2018 accident file from HDFS, taking column names from the header
# row. schema=accidenteSchema is currently not passed to the reader.
accidenteData = spark.read.csv(
    'hdfs://localhost:9000/user/ubuntu/accidentes/2018.csv',
    header=True,
)
#accidenteData = spark.read.csv ('file:///home/ubuntu/Downloads/2018_Accidentalidad.csv',header=True,sep=",",schema=accidenteSchema)
#hdfs://localhost:9000/users/ubuntu/accidentes/2018.csv

#accidenteData = spark.read.format("csv")\
#             .option("delimiter",",")\
#             .option("header", "true")\
#             .schema(accidenteSchema)\
#             .option("quote","")\
#             .load('file:///home/ubuntu/Downloads/2018_Accidentalidad.csv')


In [144]:
# Print the column names and types of the loaded accident DataFrame.
accidenteData.printSchema()
# Row preview, disabled:
#accidenteData.show()

root
 |-- FECHA: string (nullable = true)
 |-- RANGO HORARIO: string (nullable = true)
 |-- DIA SEMANA: string (nullable = true)
 |-- DISTRITO: string (nullable = true)
 |-- LUGAR ACCIDENTE: string (nullable = true)
 |-- Nº: string (nullable = true)
 |-- Nº PARTE: string (nullable = true)
 |-- CPFA Granizo: string (nullable = true)
 |-- CPFA Hielo: string (nullable = true)
 |-- CPFA Lluvia: string (nullable = true)
 |-- CPFA Niebla: string (nullable = true)
 |-- CPFA Seco: string (nullable = true)
 |-- CPFA Nieve: string (nullable = true)
 |-- CPSV Mojada: string (nullable = true)
 |-- CPSV Aceite: string (nullable = true)
 |-- CPSV Barro: string (nullable = true)
 |-- CPSV Grava Suelta: string (nullable = true)
 |-- CPSV Hielo: string (nullable = true)
 |-- CPSV Seca Y Limpia: string (nullable = true)
 |-- * Nº VICTIMAS: string (nullable = true)
 |-- TIPO ACCIDENTE: string (nullable = true)
 |-- Tipo Vehiculo: string (nullable = true)
 |-- TIPO PERSONA: string (nullable = true)
 |-- S

In [173]:
import pyspark.sql.functions as func

# Keep only the columns needed for the summary, under short names:
#   lesividad - first two characters of LESIVIDAD
#   diaStr    - day-of-week label
#   distrito  - district name
#   victimas  - victim count cast to integer
accRed = accidenteData.select(
    func.substring(accidenteData['LESIVIDAD'], 1, 2).alias('lesividad'),
    accidenteData['DIA SEMANA'].alias('diaStr'),
    accidenteData['DISTRITO'].alias('distrito'),
    accidenteData['* Nº VICTIMAS'].cast(IntegerType()).alias('victimas'),
)

# Drop rows whose lesividad code is "IL", then total the victims for each
# (day, district) pair.
accPorDia = (
    accRed
    .where(accRed['lesividad'] != "IL")
    .groupBy('diaStr', 'distrito')
    .agg(func.sum('victimas').alias('TotalVictimas'))
)

In [174]:
# Add a numeric day-of-week column (LUNES=1 ... SABADO=6) so rows can be
# sorted in calendar order; any other diaStr value is mapped to 7.
numDia = func.when(accPorDia['diaStr'] == 'LUNES', 1)
for posicion, nombreDia in enumerate(
        ['MARTES', 'MIERCOLES', 'JUEVES', 'VIERNES', 'SABADO'], start=2):
    numDia = numDia.when(accPorDia['diaStr'] == nombreDia, posicion)

accPorDia = accPorDia.withColumn("NumDia", numDia.otherwise(7)).orderBy('NumDia')


In [175]:
# Earlier, disabled attempt that derived NumDia through a `diasSemana` lookup:
#accPorDiaNum = accPorDia.withColumn("NumDia",diasSemana[accPorDia['DIA SEMANA']])
# Inspect the aggregated result: column types first, then a sample of rows.
accPorDia.printSchema()
accPorDia.show()


root
 |-- diaStr: string (nullable = true)
 |-- distrito: string (nullable = true)
 |-- TotalVictimas: long (nullable = true)
 |-- NumDia: integer (nullable = false)

+------+--------------------+-------------+------+
|diaStr|            distrito|TotalVictimas|NumDia|
+------+--------------------+-------------+------+
| LUNES|SAN BLAS         ...|            1|     1|
| LUNES|USERA            ...|            1|     1|
| LUNES|CHAMARTIN        ...|            1|     1|
| LUNES|CENTRO           ...|            4|     1|
| LUNES|MORATALAZ        ...|            1|     1|
| LUNES|FUENCARRAL-EL PAR...|            4|     1|
| LUNES|CIUDAD LINEAL    ...|            9|     1|
| LUNES|SALAMANCA        ...|            4|     1|
| LUNES|ARGANZUELA       ...|            1|     1|
| LUNES|HORTALEZA        ...|            1|     1|
| LUNES|CARABANCHEL      ...|            1|     1|
| LUNES|RETIRO           ...|            4|     1|
|MARTES|CHAMARTIN        ...|            1|     2|
|MARTES|MORATALAZ

In [None]:
from plotly.offline import plot, iplot, init_notebook_mode
from bubbly.bubbly import bubbleplot 
import pandas as pd

# Enable inline rendering of offline plotly figures in the notebook.
init_notebook_mode()

# bubbleplot takes a pandas DataFrame, so collect the aggregated Spark frame
# to the driver first.
accPorDiaPd = accPorDia.toPandas()
# The distrito values shown by accPorDia.show() carry trailing padding; strip
# it so bubble labels and legend entries are clean.
accPorDiaPd['distrito'] = accPorDiaPd['distrito'].str.strip()

# One bubble per (day, district): x is the numeric weekday, y and bubble size
# are the victim total, colour distinguishes districts. No time_column is
# given because the aggregate has no time dimension to animate over.
figure = bubbleplot(dataset=accPorDiaPd, x_column='NumDia', y_column='TotalVictimas',
    bubble_column='distrito', size_column='TotalVictimas', color_column='distrito',
    x_title="Day of week (1 = LUNES)", y_title="Total victims",
    title='Victims by day of week and district',
    scale_bubble=3, height=650)

#iplot(figure, config={'scrollzoom': True})
iplot(figure)

In [68]:
# Write the per-day, per-district totals out as CSV on the local filesystem.
# The DataFrame carrying the NumDia column is `accPorDia`; `accPorDiaNum` is
# never assigned in this notebook and would raise NameError on a fresh kernel.
# Alternative HDFS target, disabled:
#accPorDia.write.csv('hdfs://localhost:9000/user/ubuntu/accidentes/NumVic_DiaSemana.csv')
accPorDia.write.csv('file:///home/ubuntu/Downloads/NumAcc_DiaSemana')

In [171]:
# Collect the aggregated totals into pandas and save them as one
# ';'-separated CSV file. Uses `accPorDia` because `accPorDiaNum` is never
# assigned in this notebook and would raise NameError on a fresh kernel.
accPorDia.toPandas().to_csv('/home/ubuntu/Downloads/acc_diaSemana.csv',sep=';')