# Importação de pacotes

In [5]:
from pyspark.sql import Row,DataFrame
from pyspark.sql.types import StringType, StructField, StructType, IntegerType
from pyspark.sql.functions import col, expr, lit,substring,concat, concat_ws, when, coalesce
from pyspark.sql import functions as functions
from functools import reduce

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [6]:
spark

# Leitura dos dados

In [7]:
dados_path = './banklist.csv'

dados = spark.read.csv(dados_path, sep=',', inferSchema=True, header=True)

In [9]:
print(f'Length: {dados.count()}')
print(f'N cols: {len(dados.columns)}')
print(f'Columns: {dados.columns}')

Length: 553
N cols: 7
Columns: ['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution', 'Closing Date', 'Updated Date']


In [10]:
dados.show(5)

+--------------------+------------------+---+-----+---------------------+------------+------------+
|           Bank Name|              City| ST| CERT|Acquiring Institution|Closing Date|Updated Date|
+--------------------+------------------+---+-----+---------------------+------------+------------+
| Fayette County Bank|        Saint Elmo| IL| 1802| United Fidelity B...|   26-May-17|   26-Jul-17|
|Guaranty Bank, (d...|         Milwaukee| WI|30003| First-Citizens Ba...|    5-May-17|   26-Jul-17|
|      First NBC Bank|       New Orleans| LA|58302|         Whitney Bank|   28-Apr-17|   26-Jul-17|
|       Proficio Bank|Cottonwood Heights| UT|35495|    Cache Valley Bank|    3-Mar-17|   18-May-17|
|Seaway Bank and T...|           Chicago| IL|19328|  State Bank of Texas|   27-Jan-17|   18-May-17|
+--------------------+------------------+---+-----+---------------------+------------+------------+
only showing top 5 rows



# SQL + Spark

In [14]:
dados.createOrReplaceTempView('banklist')

df_check = spark.sql('''select `Bank Name`, City, `Closing Date` from banklist''')
df_check.show()

+--------------------+------------------+------------+
|           Bank Name|              City|Closing Date|
+--------------------+------------------+------------+
| Fayette County Bank|        Saint Elmo|   26-May-17|
|Guaranty Bank, (d...|         Milwaukee|    5-May-17|
|      First NBC Bank|       New Orleans|   28-Apr-17|
|       Proficio Bank|Cottonwood Heights|    3-Mar-17|
|Seaway Bank and T...|           Chicago|   27-Jan-17|
|Harvest Community...|        Pennsville|   13-Jan-17|
|         Allied Bank|          Mulberry|   23-Sep-16|
|The Woodbury Bank...|          Woodbury|   19-Aug-16|
|First CornerStone...|   King of Prussia|    6-May-16|
|  Trust Company Bank|           Memphis|   29-Apr-16|
|North Milwaukee S...|         Milwaukee|   11-Mar-16|
|Hometown National...|          Longview|    2-Oct-15|
| The Bank of Georgia|    Peachtree City|    2-Oct-15|
|        Premier Bank|            Denver|   10-Jul-15|
|      Edgebrook Bank|           Chicago|    8-May-15|
|         

# Operações básicas

In [19]:
dados.describe().show()

+-------+--------------------+-------+----+-----------------+---------------------+------------+------------+
|summary|           Bank Name|   City|  ST|             CERT|Acquiring Institution|Closing Date|Updated Date|
+-------+--------------------+-------+----+-----------------+---------------------+------------+------------+
|  count|                 553|    553| 553|              553|                  553|         553|         553|
|   mean|                null|   null|null|31729.65280289331|                 null|        null|        null|
| stddev|                null|   null|null|16420.59489355429|                 null|        null|        null|
|    min|1st American Stat...|Acworth|  AL|               91|      1st United Bank|    1-Aug-08|    1-Aug-13|
|    max|               ebank|Wyoming|  WY|            58701|  Your Community Bank|    9-Sep-11|    9-Sep-12|
+-------+--------------------+-------+----+-----------------+---------------------+------------+------------+



In [20]:
dados.describe('CERT','ST').show()

+-------+-----------------+----+
|summary|             CERT|  ST|
+-------+-----------------+----+
|  count|              553| 553|
|   mean|31729.65280289331|null|
| stddev|16420.59489355429|null|
|    min|               91|  AL|
|    max|            58701|  WY|
+-------+-----------------+----+



In [24]:
print(dados.dtypes)
print(dados.printSchema())

[('Bank Name', 'string'), ('City', 'string'), ('ST', 'string'), ('CERT', 'int'), ('Acquiring Institution', 'string'), ('Closing Date', 'string'), ('Updated Date', 'string')]
root
 |-- Bank Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- ST: string (nullable = true)
 |-- CERT: integer (nullable = true)
 |-- Acquiring Institution: string (nullable = true)
 |-- Closing Date: string (nullable = true)
 |-- Updated Date: string (nullable = true)

None


# Remover duplicados

In [26]:
dados = dados.dropDuplicates()

print(f'Length: {dados.count()}')
print(f'N cols: {len(dados.columns)}')

Length: 553
N cols: 7


Não há dados duplicados

# Seleção de colunas

In [30]:
select = dados.select('Bank Name', 'CERT')
select.show(5)

+--------------------+-----+
|           Bank Name| CERT|
+--------------------+-----+
|              InBank|20203|
|       Bank of Alamo| 9961|
|First Community B...|34943|
|The National Repu...|  916|
|           NOVA Bank|27148|
+--------------------+-----+
only showing top 5 rows



In [29]:
print(f'Length: {select.count()}')
print(f'N cols: {len(select.columns)}')

Length: 553
N cols: 2


# Renomear colunas

In [33]:
renamed = dados \
    .withColumnRenamed('Bank Name', 'bank_name')\
    .withColumnRenamed('Acquiring Institution', 'acq_institution')\
    .withColumnRenamed('Closing Date', 'closing_date')\
    .withColumnRenamed('Update Date', 'update_date')\
    .withColumnRenamed('ST', 'state')\
    .withColumnRenamed('CERT', 'cert')\
    .withColumnRenamed('City', 'cty')

In [34]:
renamed.show(5)

+--------------------+----------+-----+-----+--------------------+------------+------------+
|           bank_name|       cty|state| cert|     acq_institution|closing_date|Updated Date|
+--------------------+----------+-----+-----+--------------------+------------+------------+
|              InBank|Oak Forest|   IL|20203|MB Financial Bank...|    4-Sep-09|   17-Oct-15|
|       Bank of Alamo|     Alamo|   TN| 9961|         No Acquirer|    8-Nov-02|   18-Mar-05|
|First Community B...|Fort Myers|   FL|34943|             C1 Bank|    2-Aug-13|    9-Feb-17|
|The National Repu...|   Chicago|   IL|  916| State Bank of Texas|   24-Oct-14|    6-Jan-16|
|           NOVA Bank|    Berwyn|   PA|27148|         No Acquirer|   26-Oct-12|   24-Jan-13|
+--------------------+----------+-----+-----+--------------------+------------+------------+
only showing top 5 rows

