In [1]:
import findspark

# Local Spark distribution handed to findspark; findspark.init() is called
# before pyspark is imported, matching the order this notebook relies on.
SPARK_INSTALL_DIR = '/home/ubuntu/spark-3.2.1-bin-hadoop2.7'
findspark.init(SPARK_INSTALL_DIR)

import pyspark
from pyspark.sql import SparkSession

# Reuse an existing session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName('basics')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/04 04:48:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, IntegerType

# Read the raw CSV (first row is the header) and cast the `cantidad`
# column to an integer type in one pipeline.
typed_df = (
    spark.read.csv('raw_data.csv', header=True)
    .withColumn("cantidad", col("cantidad").cast(IntegerType()))
)

# Drop the first column by position and keep everything after it.
# NOTE(review): presumably the first column is an exported row index — confirm.
columns_to_keep = typed_df.columns[1:]
df2 = typed_df.select(columns_to_keep)
df2.printSchema()

root
 |-- departamento: string (nullable = true)
 |-- municipio: string (nullable = true)
 |-- codigo_dane: string (nullable = true)
 |-- armas_medios: string (nullable = true)
 |-- fecha_hecho: string (nullable = true)
 |-- genero: string (nullable = true)
 |-- grupo_etario: string (nullable = true)
 |-- cantidad: integer (nullable = true)



In [3]:
# Columns not needed downstream; every other column is kept in its
# original order.
columns_to_drop = ['departamento', 'municipio', 'codigo_dane', 'grupo_etario']
kept_column_names = [name for name in df2.columns if name not in columns_to_drop]
df2 = df2.select(kept_column_names)
df2.show()

+--------------------+-----------+---------+--------+
|        armas_medios|fecha_hecho|   genero|cantidad|
+--------------------+-----------+---------+--------+
|ARMA BLANCA / COR...|  1/01/2010|MASCULINO|       1|
|ARMA BLANCA / COR...|  1/01/2010| FEMENINO|       1|
|ARMA BLANCA / COR...|  1/01/2010|MASCULINO|       1|
|ARMA BLANCA / COR...|  1/01/2010| FEMENINO|       1|
|ARMA BLANCA / COR...|  1/01/2010| FEMENINO|       1|
|ARMA BLANCA / COR...|  1/01/2010| FEMENINO|       1|
|ARMA BLANCA / COR...|  1/01/2010| FEMENINO|       2|
|ARMA BLANCA / COR...|  1/01/2010|MASCULINO|       2|
|ARMA BLANCA / COR...|  1/01/2010| FEMENINO|       1|
|       ARMA DE FUEGO|  1/01/2010| FEMENINO|       1|
|        CONTUNDENTES|  1/01/2010| FEMENINO|       1|
|        CONTUNDENTES|  1/01/2010| FEMENINO|       2|
|        CONTUNDENTES|  1/01/2010| FEMENINO|       1|
|        CONTUNDENTES|  1/01/2010| FEMENINO|       3|
|        CONTUNDENTES|  1/01/2010|MASCULINO|       1|
|        CONTUNDENTES|  1/01

In [4]:
from pyspark.sql.functions import col, year, date_format
from pyspark.sql.types import DateType
from pyspark.sql.functions import to_date

# Turn the day/month/year strings in `fecha_hecho` into a date column.
parsed_fecha = to_date(col('fecha_hecho'), 'd/MM/yyyy')
df3 = df2.withColumn('fecha_hecho', parsed_fecha)

# Keep only rows dated 2021 or later, then add a string rendering of the
# date so that describe() can report its min/max.
df3_filtered = (
    df3
    .filter(year('fecha_hecho') >= 2021)
    .withColumn('fecha_hecho_STR', date_format('fecha_hecho', 'yyyy-MM-dd HH:mm:ss'))
)
df3_filtered.select('fecha_hecho_STR').describe().show(truncate=False)



+-------+-------------------+
|summary|fecha_hecho_STR    |
+-------+-------------------+
|count  |121175             |
|mean   |null               |
|stddev |null               |
|min    |2021-01-01 00:00:00|
|max    |2023-02-28 00:00:00|
+-------+-------------------+



                                                                                

In [5]:
def _flat_distinct(frame, column_name):
    """Collect the distinct values of one column as a flat Python list."""
    distinct_rows = frame.select(column_name).distinct()
    return distinct_rows.rdd.flatMap(lambda row: row).collect()


# Inspect the categorical values present in the date-parsed (unfiltered) data.
unique_values_gen = _flat_distinct(df3, 'genero')
unique_values_armas = _flat_distinct(df3, 'armas_medios')

print(" 'GENERO' column:", unique_values_gen)
print(" 'ARMAS MEDIOS' column:", unique_values_armas)

[Stage 8:>                                                          (0 + 2) / 2]

 'GENERO' column: ['FEMENINO', 'MASCULINO', 'NO REPORTA', '-', 'NO REPORTADO']
 'ARMAS MEDIOS' column: ['ARMA DE FUEGO', 'ESCOPOLAMINA', 'SIN EMPLEO DE ARMAS', 'ARMA BLANCA / CORTOPUNZANTE', 'NO REPORTADO', 'CONTUNDENTES', 'CORTANTES', 'CORTOPUNZANTES', 'NO REPORTA', 'PUNZANTES', '-']


                                                                                

In [6]:
# Placeholder values excluded from each categorical column.
# NOTE(review): 'NO REPORTA' is not in the genero exclusion list and
# 'NO REPORTADO' is kept for armas_medios — confirm both are intentional.
values_to_remove = ['-', 'NO REPORTADO']
values_to_remove1 = ['-']

genero_is_valid = ~col('genero').isin(values_to_remove)
armas_is_valid = ~col('armas_medios').isin(values_to_remove1)
cantidad_is_small = col('cantidad') < 20

# Keep 2021+ rows with usable categories and a count below 20.
df5 = df3_filtered.filter(genero_is_valid & armas_is_valid & cantidad_is_small)

# Rows of the filtered frame where any of the three key columns is NULL.
# NOTE(review): under Spark SQL three-valued logic the predicates above
# evaluate to NULL for NULL inputs, so such rows should already be dropped
# by the filter — verify whether this check is meant to run before it.
any_key_is_null = (
    col('genero').isNull()
    | col('armas_medios').isNull()
    | col('cantidad').isNull()
)
has_null_values = df5.filter(any_key_is_null)

null_row_count = has_null_values.count()
if null_row_count == 0:
    print("No null values in the DataFrame")
else:
    print("Null value exists in the DataFrame")

[Stage 11:>                                                         (0 + 2) / 2]

No null values in the DataFrame




In [7]:
# Summary statistics (count / mean / stddev / min / max) for the cleaned frame.
df5_summary = df5.describe()
df5_summary.show()



+-------+--------------------+---------+------------------+-------------------+
|summary|        armas_medios|   genero|          cantidad|    fecha_hecho_STR|
+-------+--------------------+---------+------------------+-------------------+
|  count|              119495|   119495|            119495|             119495|
|   mean|                null|     null|1.6663877149671535|               null|
| stddev|                null|     null|1.7632847848850663|               null|
|    min|ARMA BLANCA / COR...| FEMENINO|                 1|2021-01-01 00:00:00|
|    max| SIN EMPLEO DE ARMAS|MASCULINO|                19|2023-02-28 00:00:00|
+-------+--------------------+---------+------------------+-------------------+



                                                                                

In [8]:
# Re-inspect the categorical values that remain after filtering.
genero_distinct = df5.select('genero').distinct()
armas_distinct = df5.select('armas_medios').distinct()

# Each Row holds a single value; flatMap unwraps the rows into a flat list.
unique_values_gen = genero_distinct.rdd.flatMap(lambda row: row).collect()
unique_values_armas = armas_distinct.rdd.flatMap(lambda row: row).collect()

print(" 'GENERO' column:", unique_values_gen)
print(" 'ARMAS MEDIOS' column:", unique_values_armas)

[Stage 20:>                                                         (0 + 2) / 2]

 'GENERO' column: ['FEMENINO', 'MASCULINO']
 'ARMAS MEDIOS' column: ['ARMA DE FUEGO', 'SIN EMPLEO DE ARMAS', 'ARMA BLANCA / CORTOPUNZANTE', 'NO REPORTADO', 'CONTUNDENTES']




In [10]:
# Persist the cleaned 2021+ rows as CSV with a header row. coalesce(1)
# collapses the data to one partition before writing.
# mode='overwrite' replaces an existing output directory: with the default
# save mode the write raises when the path already exists, which breaks a
# second "Run All" of this notebook.
df5.coalesce(1).write.csv('dataset2021-2023-1', header=True, mode='overwrite')

                                                                                

In [10]:
from pyspark.sql.functions import col, count

# One output row per (date, cantidad, genero, armas_medios) combination,
# carrying the number of input rows that share that combination.
group_keys = ['fecha_hecho', 'cantidad', 'genero', 'armas_medios']
rows_per_group = count("*").alias("Number of domestic violence")

grouped_data = df5.groupBy(*group_keys).agg(rows_per_group)

# Order chronologically for display and export.
sorted_data = grouped_data.orderBy(col('fecha_hecho'))
sorted_data.show()



+-----------+--------+---------+--------------------+---------------------------+
|fecha_hecho|cantidad|   genero|        armas_medios|Number of domestic violence|
+-----------+--------+---------+--------------------+---------------------------+
| 2021-01-01|       7|MASCULINO|        CONTUNDENTES|                          1|
| 2021-01-01|       2| FEMENINO| SIN EMPLEO DE ARMAS|                         17|
| 2021-01-01|       5|MASCULINO| SIN EMPLEO DE ARMAS|                          2|
| 2021-01-01|       2|MASCULINO|ARMA BLANCA / COR...|                          2|
| 2021-01-01|       7| FEMENINO|        CONTUNDENTES|                          1|
| 2021-01-01|       1| FEMENINO|        CONTUNDENTES|                         65|
| 2021-01-01|       1| FEMENINO| SIN EMPLEO DE ARMAS|                         70|
| 2021-01-01|      12| FEMENINO| SIN EMPLEO DE ARMAS|                          1|
| 2021-01-01|       2|MASCULINO|        CONTUNDENTES|                          5|
| 2021-01-01|   

                                                                                

In [11]:
# Persist the grouped, date-sorted counts as CSV with a header row.
# mode='overwrite' replaces an existing output directory: with the default
# save mode the write raises when the path already exists, which breaks a
# second "Run All" of this notebook.
sorted_data.write.csv('dataset2021-2023', header=True, mode='overwrite')

                                                                                