# Importação de pacotes

In [1]:
from pyspark.sql import Row,DataFrame
from pyspark.sql.types import StringType, StructField, StructType, IntegerType
from pyspark.sql.functions import col, expr, lit,substring,concat, concat_ws, when, coalesce
from pyspark.sql import functions as functions
from functools import reduce

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [2]:
spark

# Leitura dos dados

In [3]:
dados_path = './banklist.csv'

dados = spark.read.csv(dados_path, sep=',', inferSchema=True, header=True)

In [4]:
print(f'Length: {dados.count()}')
print(f'N cols: {len(dados.columns)}')
print(f'Columns: {dados.columns}')

Length: 553
N cols: 7
Columns: ['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution', 'Closing Date', 'Updated Date']


In [5]:
dados.show(5)

+--------------------+------------------+---+-----+---------------------+------------+------------+
|           Bank Name|              City| ST| CERT|Acquiring Institution|Closing Date|Updated Date|
+--------------------+------------------+---+-----+---------------------+------------+------------+
| Fayette County Bank|        Saint Elmo| IL| 1802| United Fidelity B...|   26-May-17|   26-Jul-17|
|Guaranty Bank, (d...|         Milwaukee| WI|30003| First-Citizens Ba...|    5-May-17|   26-Jul-17|
|      First NBC Bank|       New Orleans| LA|58302|         Whitney Bank|   28-Apr-17|   26-Jul-17|
|       Proficio Bank|Cottonwood Heights| UT|35495|    Cache Valley Bank|    3-Mar-17|   18-May-17|
|Seaway Bank and T...|           Chicago| IL|19328|  State Bank of Texas|   27-Jan-17|   18-May-17|
+--------------------+------------------+---+-----+---------------------+------------+------------+
only showing top 5 rows



# SQL + Spark

In [6]:
dados.createOrReplaceTempView('banklist')

df_check = spark.sql('''select `Bank Name`, City, `Closing Date` from banklist''')
df_check.show()

+--------------------+------------------+------------+
|           Bank Name|              City|Closing Date|
+--------------------+------------------+------------+
| Fayette County Bank|        Saint Elmo|   26-May-17|
|Guaranty Bank, (d...|         Milwaukee|    5-May-17|
|      First NBC Bank|       New Orleans|   28-Apr-17|
|       Proficio Bank|Cottonwood Heights|    3-Mar-17|
|Seaway Bank and T...|           Chicago|   27-Jan-17|
|Harvest Community...|        Pennsville|   13-Jan-17|
|         Allied Bank|          Mulberry|   23-Sep-16|
|The Woodbury Bank...|          Woodbury|   19-Aug-16|
|First CornerStone...|   King of Prussia|    6-May-16|
|  Trust Company Bank|           Memphis|   29-Apr-16|
|North Milwaukee S...|         Milwaukee|   11-Mar-16|
|Hometown National...|          Longview|    2-Oct-15|
| The Bank of Georgia|    Peachtree City|    2-Oct-15|
|        Premier Bank|            Denver|   10-Jul-15|
|      Edgebrook Bank|           Chicago|    8-May-15|
|         

# Operações básicas

In [7]:
dados.describe().show()

+-------+--------------------+-------+----+-----------------+---------------------+------------+------------+
|summary|           Bank Name|   City|  ST|             CERT|Acquiring Institution|Closing Date|Updated Date|
+-------+--------------------+-------+----+-----------------+---------------------+------------+------------+
|  count|                 553|    553| 553|              553|                  553|         553|         553|
|   mean|                null|   null|null|31729.65280289331|                 null|        null|        null|
| stddev|                null|   null|null|16420.59489355429|                 null|        null|        null|
|    min|1st American Stat...|Acworth|  AL|               91|      1st United Bank|    1-Aug-08|    1-Aug-13|
|    max|               ebank|Wyoming|  WY|            58701|  Your Community Bank|    9-Sep-11|    9-Sep-12|
+-------+--------------------+-------+----+-----------------+---------------------+------------+------------+



In [8]:
dados.describe('CERT','ST').show()

+-------+-----------------+----+
|summary|             CERT|  ST|
+-------+-----------------+----+
|  count|              553| 553|
|   mean|31729.65280289331|null|
| stddev|16420.59489355429|null|
|    min|               91|  AL|
|    max|            58701|  WY|
+-------+-----------------+----+



In [9]:
print(dados.dtypes)
print(dados.printSchema())

[('Bank Name', 'string'), ('City', 'string'), ('ST', 'string'), ('CERT', 'int'), ('Acquiring Institution', 'string'), ('Closing Date', 'string'), ('Updated Date', 'string')]
root
 |-- Bank Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- ST: string (nullable = true)
 |-- CERT: integer (nullable = true)
 |-- Acquiring Institution: string (nullable = true)
 |-- Closing Date: string (nullable = true)
 |-- Updated Date: string (nullable = true)

None


# Remover duplicados

In [10]:
dados = dados.dropDuplicates()

print(f'Length: {dados.count()}')
print(f'N cols: {len(dados.columns)}')

Length: 553
N cols: 7


Não há dados duplicados

# Seleção de colunas

In [11]:
select = dados.select('Bank Name', 'CERT')
select.show(5)

+--------------------+-----+
|           Bank Name| CERT|
+--------------------+-----+
|              InBank|20203|
|       Bank of Alamo| 9961|
|First Community B...|34943|
|The National Repu...|  916|
|           NOVA Bank|27148|
+--------------------+-----+
only showing top 5 rows



In [12]:
print(f'Length: {select.count()}')
print(f'N cols: {len(select.columns)}')

Length: 553
N cols: 2


# Renomear colunas

In [31]:
renamed = dados \
    .withColumnRenamed('Bank Name', 'bank_name')\
    .withColumnRenamed('Acquiring Institution', 'acq_institution')\
    .withColumnRenamed('Closing Date', 'closing_date')\
    .withColumnRenamed('Updated Date', 'updated_date')\
    .withColumnRenamed('ST', 'state')\
    .withColumnRenamed('CERT', 'cert')\
    .withColumnRenamed('City', 'city')

In [32]:
renamed.show(5)

+--------------------+----------+-----+-----+--------------------+------------+------------+
|           bank_name|      city|state| cert|     acq_institution|closing_date|updated_date|
+--------------------+----------+-----+-----+--------------------+------------+------------+
|              InBank|Oak Forest|   IL|20203|MB Financial Bank...|    4-Sep-09|   17-Oct-15|
|       Bank of Alamo|     Alamo|   TN| 9961|         No Acquirer|    8-Nov-02|   18-Mar-05|
|First Community B...|Fort Myers|   FL|34943|             C1 Bank|    2-Aug-13|    9-Feb-17|
|The National Repu...|   Chicago|   IL|  916| State Bank of Texas|   24-Oct-14|    6-Jan-16|
|           NOVA Bank|    Berwyn|   PA|27148|         No Acquirer|   26-Oct-12|   24-Jan-13|
+--------------------+----------+-----+-----+--------------------+------------+------------+
only showing top 5 rows



# Add colunas

In [33]:
add_column = dados.withColumn('state', col('ST'))
add_column.show(5)

+--------------------+----------+---+-----+---------------------+------------+------------+-----+
|           Bank Name|      City| ST| CERT|Acquiring Institution|Closing Date|Updated Date|state|
+--------------------+----------+---+-----+---------------------+------------+------------+-----+
|              InBank|Oak Forest| IL|20203| MB Financial Bank...|    4-Sep-09|   17-Oct-15|   IL|
|       Bank of Alamo|     Alamo| TN| 9961|          No Acquirer|    8-Nov-02|   18-Mar-05|   TN|
|First Community B...|Fort Myers| FL|34943|              C1 Bank|    2-Aug-13|    9-Feb-17|   FL|
|The National Repu...|   Chicago| IL|  916|  State Bank of Texas|   24-Oct-14|    6-Jan-16|   IL|
|           NOVA Bank|    Berwyn| PA|27148|          No Acquirer|   26-Oct-12|   24-Jan-13|   PA|
+--------------------+----------+---+-----+---------------------+------------+------------+-----+
only showing top 5 rows



# Add constant column

In [34]:
add_column = add_column.withColumn('country', lit('US'))
add_column.show(5)

+--------------------+----------+---+-----+---------------------+------------+------------+-----+-------+
|           Bank Name|      City| ST| CERT|Acquiring Institution|Closing Date|Updated Date|state|country|
+--------------------+----------+---+-----+---------------------+------------+------------+-----+-------+
|              InBank|Oak Forest| IL|20203| MB Financial Bank...|    4-Sep-09|   17-Oct-15|   IL|     US|
|       Bank of Alamo|     Alamo| TN| 9961|          No Acquirer|    8-Nov-02|   18-Mar-05|   TN|     US|
|First Community B...|Fort Myers| FL|34943|              C1 Bank|    2-Aug-13|    9-Feb-17|   FL|     US|
|The National Repu...|   Chicago| IL|  916|  State Bank of Texas|   24-Oct-14|    6-Jan-16|   IL|     US|
|           NOVA Bank|    Berwyn| PA|27148|          No Acquirer|   26-Oct-12|   24-Jan-13|   PA|     US|
+--------------------+----------+---+-----+---------------------+------------+------------+-----+-------+
only showing top 5 rows



# Drop column

In [35]:
drop_column = add_column.drop('CERT')
drop_column.show(5)

+--------------------+----------+---+---------------------+------------+------------+-----+-------+
|           Bank Name|      City| ST|Acquiring Institution|Closing Date|Updated Date|state|country|
+--------------------+----------+---+---------------------+------------+------------+-----+-------+
|              InBank|Oak Forest| IL| MB Financial Bank...|    4-Sep-09|   17-Oct-15|   IL|     US|
|       Bank of Alamo|     Alamo| TN|          No Acquirer|    8-Nov-02|   18-Mar-05|   TN|     US|
|First Community B...|Fort Myers| FL|              C1 Bank|    2-Aug-13|    9-Feb-17|   FL|     US|
|The National Repu...|   Chicago| IL|  State Bank of Texas|   24-Oct-14|    6-Jan-16|   IL|     US|
|           NOVA Bank|    Berwyn| PA|          No Acquirer|   26-Oct-12|   24-Jan-13|   PA|     US|
+--------------------+----------+---+---------------------+------------+------------+-----+-------+
only showing top 5 rows



# Drop multiple columns

In [36]:
drop_column = add_column.drop(*['CERT','ST'])
drop_column.show(5)

+--------------------+----------+---------------------+------------+------------+-----+-------+
|           Bank Name|      City|Acquiring Institution|Closing Date|Updated Date|state|country|
+--------------------+----------+---------------------+------------+------------+-----+-------+
|              InBank|Oak Forest| MB Financial Bank...|    4-Sep-09|   17-Oct-15|   IL|     US|
|       Bank of Alamo|     Alamo|          No Acquirer|    8-Nov-02|   18-Mar-05|   TN|     US|
|First Community B...|Fort Myers|              C1 Bank|    2-Aug-13|    9-Feb-17|   FL|     US|
|The National Repu...|   Chicago|  State Bank of Texas|   24-Oct-14|    6-Jan-16|   IL|     US|
|           NOVA Bank|    Berwyn|          No Acquirer|   26-Oct-12|   24-Jan-13|   PA|     US|
+--------------------+----------+---------------------+------------+------------+-----+-------+
only showing top 5 rows



# Filter data

In [37]:
# Equal
df_select = renamed.where(renamed['state'] == 'NY')
print(df_select.count())
df_select.show()

5
+--------------------+-------------+-----+-----+--------------------+------------+------------+
|           bank_name|         city|state| cert|     acq_institution|closing_date|updated_date|
+--------------------+-------------+-----+-----+--------------------+------------+------------+
|  LibertyPointe Bank|     New York|   NY|58071|Valley National Bank|   11-Mar-10|   23-Aug-12|
|       Reliance Bank| White Plains|   NY|26778|    Union State Bank|   19-Mar-04|    9-Apr-08|
|            USA Bank| Port Chester|   NY|58072|    New Century Bank|    9-Jul-10|   14-Sep-12|
|The Park Avenue Bank|     New York|   NY|27096|Valley National Bank|   12-Mar-10|   23-Aug-12|
|Waterford Village...|Williamsville|   NY|58065|    Evans Bank, N.A.|   24-Jul-09|    1-Nov-13|
+--------------------+-------------+-----+-----+--------------------+------------+------------+



In [38]:
# Between
df_select = renamed.where(renamed['CERT'].between('1000','2000'))
print(df_select.count())
df_select.show()

9
+--------------------+-------------+-----+----+--------------------+------------+------------+
|           bank_name|         city|state|cert|     acq_institution|closing_date|updated_date|
+--------------------+-------------+-----+----+--------------------+------------+------------+
|Barnes Banking Co...|    Kaysville|   UT|1252|         No Acquirer|   15-Jan-10|   23-Aug-12|
|     Mainstreet Bank|  Forest Lake|   MN|1909|        Central Bank|   28-Aug-09|   21-Aug-12|
|     Bank of Ephraim|      Ephraim|   UT|1249|       Far West Bank|   25-Jun-04|    9-Apr-08|
| Citizens State Bank|New Baltimore|   MI|1006|         No Acquirer|   18-Dec-09|   21-Mar-14|
|      Heartland Bank|      Leawood|   KS|1361|        Metcalf Bank|   20-Jul-12|   30-Jul-13|
|Glasgow Savings Bank|      Glasgow|   MO|1056|Regional Missouri...|   13-Jul-12|   19-Aug-14|
|           Hume Bank|         Hume|   MO|1971|       Security Bank|    7-Mar-08|   28-Aug-12|
| Fayette County Bank|   Saint Elmo|   IL|1802|U

In [39]:
# Is in
df_select = renamed.where(renamed['state'].isin('NY','AZ'))
print(df_select.count())
df_select.show()

21
+--------------------+-------------+-----+-----+--------------------+------------+------------+
|           bank_name|         city|state| cert|     acq_institution|closing_date|updated_date|
+--------------------+-------------+-----+-----+--------------------+------------+------------+
|    Copper Star Bank|   Scottsdale|   AZ|35463|  Stearns Bank, N.A.|   12-Nov-10|   20-Aug-12|
|Towne Bank of Ari...|         Mesa|   AZ|57697|Commerce Bank of ...|    7-May-10|   23-Aug-12|
|Western National ...|      Phoenix|   AZ|57917|  Washington Federal|   16-Dec-11|    5-Feb-15|
|  LibertyPointe Bank|     New York|   NY|58071|Valley National Bank|   11-Mar-10|   23-Aug-12|
|       Reliance Bank| White Plains|   NY|26778|    Union State Bank|   19-Mar-04|    9-Apr-08|
|            USA Bank| Port Chester|   NY|58072|    New Century Bank|    9-Jul-10|   14-Sep-12|
|      Bank USA, N.A.|      Phoenix|   AZ|32218|      U.S. Bank N.A.|   30-Oct-09|   22-Aug-12|
|The Park Avenue Bank|     New York| 

# Filter with logical operations

In [40]:
df_select = renamed.where((renamed['state'] == 'NY') & (renamed['city'] == 'New York'))
df_select.show()

+--------------------+--------+-----+-----+--------------------+------------+------------+
|           bank_name|    city|state| cert|     acq_institution|closing_date|updated_date|
+--------------------+--------+-----+-----+--------------------+------------+------------+
|  LibertyPointe Bank|New York|   NY|58071|Valley National Bank|   11-Mar-10|   23-Aug-12|
|The Park Avenue Bank|New York|   NY|27096|Valley National Bank|   12-Mar-10|   23-Aug-12|
+--------------------+--------+-----+-----+--------------------+------------+------------+



# Replace values

In [44]:
replaced = renamed.na.replace(9268, 17)

replaced.show(20)

+--------------------+---------------+-----+-----+--------------------+------------+------------+
|           bank_name|           city|state| cert|     acq_institution|closing_date|updated_date|
+--------------------+---------------+-----+-----+--------------------+------------+------------+
|              InBank|     Oak Forest|   IL|20203|MB Financial Bank...|    4-Sep-09|   17-Oct-15|
|       Bank of Alamo|          Alamo|   TN| 9961|         No Acquirer|    8-Nov-02|   18-Mar-05|
|First Community B...|     Fort Myers|   FL|34943|             C1 Bank|    2-Aug-13|    9-Feb-17|
|The National Repu...|        Chicago|   IL|  916| State Bank of Texas|   24-Oct-14|    6-Jan-16|
|           NOVA Bank|         Berwyn|   PA|27148|         No Acquirer|   26-Oct-12|   24-Jan-13|
|First Cherokee St...|      Woodstock|   GA|32711|Community & South...|   20-Jul-12|    6-Jun-16|
|Net 1st National ...|     Boca Raton|   FL|26652|      Bank Leumi USA|    1-Mar-02|    9-Apr-08|
|       Waccamaw Ban

# Aula Parte 2

## Criação spark e importação dos dados de COVID-19

In [45]:
'''!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#Check this site for the latest download link https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j'''

import os
import sys
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"

import findspark
findspark.init()
findspark.find()

import pyspark

from pyspark.sql import DataFrame, SparkSession
from typing import List
import pyspark.sql.types as T
import pyspark.sql.functions as F

spark= SparkSession.builder.appName("Our First Spark Example").getOrCreate()

spark

In [47]:


import requests
path = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
req = requests.get(path)
url_content = req.content

csv_file_name = 'owid-covid-data.csv'
csv_file = open(csv_file_name, 'wb')

csv_file.write(url_content)
csv_file.close()

df = spark.read.csv('./'+csv_file_name, header=True, inferSchema=True)

## Schema

In [50]:
df.printSchema()

root
 |-- iso_code: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- location: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- total_cases: double (nullable = true)
 |-- new_cases: double (nullable = true)
 |-- new_cases_smoothed: double (nullable = true)
 |-- total_deaths: double (nullable = true)
 |-- new_deaths: double (nullable = true)
 |-- new_deaths_smoothed: double (nullable = true)
 |-- total_cases_per_million: double (nullable = true)
 |-- new_cases_per_million: double (nullable = true)
 |-- new_cases_smoothed_per_million: double (nullable = true)
 |-- total_deaths_per_million: double (nullable = true)
 |-- new_deaths_per_million: double (nullable = true)
 |-- new_deaths_smoothed_per_million: double (nullable = true)
 |-- reproduction_rate: double (nullable = true)
 |-- icu_patients: double (nullable = true)
 |-- icu_patients_per_million: double (nullable = true)
 |-- hosp_patients: double (nullable = true)
 |-- hosp_patients_per_mill

## Conversão variável date para datetime

In [55]:
df.select('date').show(5)

+-------------------+
|               date|
+-------------------+
|2020-01-03 00:00:00|
|2020-01-04 00:00:00|
|2020-01-05 00:00:00|
|2020-01-06 00:00:00|
|2020-01-07 00:00:00|
+-------------------+
only showing top 5 rows



In [56]:
df.select(F.to_date(df.date).alias('date'))

DataFrame[date: date]

In [57]:
df.select('date').show(5)

+-------------------+
|               date|
+-------------------+
|2020-01-03 00:00:00|
|2020-01-04 00:00:00|
|2020-01-05 00:00:00|
|2020-01-06 00:00:00|
|2020-01-07 00:00:00|
+-------------------+
only showing top 5 rows



## Estatísticas

In [58]:
df.describe().show()

+-------+--------+-------------+-----------+-----------------+------------------+------------------+-----------------+-----------------+-------------------+-----------------------+---------------------+------------------------------+------------------------+----------------------+-------------------------------+------------------+------------------+------------------------+------------------+-------------------------+---------------------+---------------------------------+----------------------+----------------------------------+-------------------+------------------+------------------------+----------------------+------------------+-------------------------------+-------------------+------------------+-------------+--------------------+--------------------+-----------------------+--------------------+-----------------+-------------------------+------------------------------+-----------------------------+-----------------------------------+--------------------------+-------------------

# Groupby location -> new cases

In [65]:
df.groupBy('location').sum('new_cases').orderBy(F.desc("sum(new_cases)")).show(truncate=False)

+-------------------+--------------+
|location           |sum(new_cases)|
+-------------------+--------------+
|World              |7.70452934E8  |
|High income        |4.25238868E8  |
|Asia               |3.00438143E8  |
|Europe             |2.49217367E8  |
|Upper middle income|2.442881E8    |
|European Union     |1.83832173E8  |
|North America      |1.24412723E8  |
|United States      |1.03436829E8  |
|China              |9.9306563E7   |
|Lower middle income|9.7396408E7   |
|South America      |6.8818417E7   |
|India              |4.499808E7    |
|France             |3.899749E7    |
|Germany            |3.8437756E7   |
|Brazil             |3.7717635E7   |
|South Korea        |3.4436542E7   |
|Japan              |3.3803572E7   |
|Italy              |2.5955703E7   |
|United Kingdom     |2.4688073E7   |
|Russia             |2.2994849E7   |
+-------------------+--------------+
only showing top 20 rows

