## Configurando o ambiente do Colab para o PySpark e para usar arquivos armazenados no Google Drive

In [None]:
!pip install pyspark

In [None]:
from google.colab import drive

# Mount Google Drive at this directory; the dataset is later read from a path
# underneath it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [None]:
import os

# Absolute locations of the JDK and of the unpacked Spark distribution.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

# Make pyspark importable.
import findspark
# Hand findspark the absolute SPARK_HOME rather than a bare directory name:
# a relative path only resolves while the working directory is /content, and
# findspark exports the path it receives as SPARK_HOME, which would replace
# the absolute value set above with a cwd-dependent one.
findspark.init(os.environ["SPARK_HOME"])

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the session: local master, fixed application name, and the
# Spark UI bound to port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("monkeypoxcolab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [None]:
# Bare expression: render the session object as this cell's output.
spark

In [None]:
from pyspark.sql.types import *
from pyspark.sql import functions as f
from pyspark.sql.functions import *
from IPython.core.display import HTML

In [None]:
!pip install -U plotly

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

### Carregando dataset em csv e renomeando colunas

In [None]:
# Load the CSV from the mounted Drive: first line is the header, fields are
# comma-separated, and column types are inferred by Spark.
DATASET_CSV_PATH = "/content/drive/MyDrive/Datasets/latest.csv"
monkeypoxdf = spark.read.csv(
    DATASET_CSV_PATH,
    header=True,
    inferSchema=True,
    sep=",",
)

In [None]:
# Print the column names and inferred types of the freshly loaded DataFrame.
monkeypoxdf.printSchema()

In [None]:
# (original column name, Portuguese column name) pairs, applied in this order.
# NOTE(review): a few targets do not look like literal translations
# (e.g. "Date_onset" -> "Data_entrada_sistema", "Outcome" -> "Detectado");
# they are kept unchanged because later cells refer to the new names.
COLUMN_TRANSLATIONS = [
    ("Location", "Localizacao"),
    ("City", "Cidade"),
    ("Country", "Pais"),
    ("Country_ISO3", "Cod_ISO3"),
    ("Age", "Idade"),
    ("Gender", "Sexo"),
    ("Date_onset", "Data_entrada_sistema"),
    ("Date_confirmation", "Data_confirmacao"),
    ("Symptoms", "Sintomas"),
    ("Hospitalised (Y/N/NA)", "Hospitalizado"),
    ("Date_hospitalisation", "Data_internacao"),
    ("Isolated (Y/N/NA)", "Em_isolamento"),
    ("Date_isolation", "Data_isolamento"),
    ("Outcome", "Detectado"),
    ("Contact_comment", "Comentarios_contato"),
    ("Contact_ID", "Identidade_Contato"),
    ("Contact_location", "Cidade_contato"),
    ("Travel_history (Y/N/NA)", "Viajou"),
    ("Travel_history_entry", "Data_viagem"),
    ("Travel_history_start", "Data_inicio_viagem"),
    ("Travel_history_location", "Localidade_visitada"),
    ("Travel_history_country", "Pais_visitado"),
    ("Genomics_Metadata", "Genoma_virus"),
    ("Confirmation_method", "Metodo_confirmacao"),
    ("Source", "Fonte"),
    ("Source_II", "Fonte_II"),
    ("Source_III", "Fonte_III"),
    ("Source_IV", "Fonte_IV"),
    ("Source_V", "Fonte_V"),
    ("Source_VI", "Fonte_VI"),
    ("Source_VII", "Fonte_VII"),
    ("Date_entry", "Data_registro"),
    ("Date_death", "Data_obito"),
    ("Date_last_modified", "Data_ultima_atualizacao"),
]

# Start from the raw frame and rename one column per step; the raw frame
# itself is left untouched.
newmonkeypoxdf1 = monkeypoxdf
for original_name, translated_name in COLUMN_TRANSLATIONS:
    newmonkeypoxdf1 = newmonkeypoxdf1.withColumnRenamed(original_name, translated_name)

# Show the schema after renaming.
newmonkeypoxdf1.printSchema()

In [None]:
# Print the leading rows of the renamed DataFrame using show()'s defaults.
newmonkeypoxdf1.show()

### **ANÁLISE EXPLORATÓRIA E EXPLANATÓRIA DO PESO DO VAZIO NO DATASET SOBRE A EPIDEMIA DE VARÍOLA DOS MACACOS**

#### Contagem de valores vazios em cada coluna do dataframe

In [None]:
from pyspark.sql.functions import col, isnan,when, count
# Renamed columns whose missing values are tallied below.
newmonkeypoxdf_Null = [
    "Status", "Localizacao", "Cidade", "Pais", "Cod_ISO3",
    "Idade", "Sexo", "Sintomas", "Hospitalizado", "Em_isolamento",
    "Detectado", "Comentarios_contato", "Identidade_Contato", "Cidade_contato",
    "Viajou", "Data_viagem", "Data_inicio_viagem", "Localidade_visitada",
    "Pais_visitado", "Genoma_virus", "Metodo_confirmacao",
    "Fonte", "Fonte_II", "Fonte_III", "Fonte_IV", "Fonte_V", "Fonte_VI", "Fonte_VII",
]

# One aggregate per column: count the rows whose value is NaN or NULL, and
# alias the result with the column's own name.
null_count_exprs = [
    count(when(isnan(column_name) | col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in newmonkeypoxdf_Null
]
newmonkeypoxdf1.select(null_count_exprs).show()

##### Gráfico de linha com a contagem de valores nulos de cada coluna do DataFrame

In [None]:
# Compute the per-column NaN/NULL counts from the DataFrame instead of plotting
# numbers pasted by hand from an earlier output: pasted values go stale when
# the dataset file changes, and the hand-typed labels had drifted from the real
# column names ("Cidade_Contato" vs "Cidade_contato").
null_count_row = newmonkeypoxdf1.select(
    [count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in newmonkeypoxdf_Null]
).first()

# x: column names in their listed order; y: the matching counts from the
# single aggregated row.
px.line(
    x=newmonkeypoxdf_Null,
    y=[null_count_row[c] for c in newmonkeypoxdf_Null],
    labels={"x": "Coluna", "y": "Valores nulos"},
)

In [None]:
# Expose the renamed DataFrame to Spark SQL under the name NewMonkeyPox1.
newmonkeypoxdf1.createOrReplaceTempView("NewMonkeyPox1")

# Normalise Sexo with lower(trim(...)) rather than listing individual spelling
# variants: every mix of letter case and surrounding whitespace collapses to
# the same value (e.g. 'Male ', ' male', 'MALE' -> 'male'), not only the four
# variants that were spelled out before. coalesce keeps the previous behaviour
# of mapping NULL to an empty string.
spark.sql(
    "select Status, Pais, "
    "coalesce(lower(trim(Sexo)), '') as novo_sexo "
    "from NewMonkeyPox1"
).show(10)

In [None]:
# Number of rows in the NewMonkeyPox1 view whose Sexo is NULL.
rows_without_sexo = spark.sql('select * from NewMonkeyPox1 where isNULL(Sexo)')
rows_without_sexo.count()

In [None]:
# Total number of rows in the renamed DataFrame.
newmonkeypoxdf1.count()

In [None]:
# Preview up to 30 rows of the NewMonkeyPox1 view whose Sexo is NULL.
null_sexo_preview = spark.sql('select * from NewMonkeyPox1 where isNULL(Sexo)')
null_sexo_preview.show(30)