# Parte 1: Pipeline ETL para processar os dados

### Importando libs

In [2]:
# Use the %pip magic (not !pip) so the package is installed into the environment
# of the running kernel rather than whichever `pip` is first on the shell PATH.
# The recorded run of this cell resolved to cassandra-driver 3.24.0.
%pip install cassandra-driver

Collecting cassandra-driver
  Downloading cassandra_driver-3.24.0-cp37-cp37m-win_amd64.whl (2.7 MB)
Collecting geomet<0.3,>=0.1
  Using cached geomet-0.2.1.post1-py3-none-any.whl (18 kB)
Installing collected packages: geomet, cassandra-driver
Successfully installed cassandra-driver-3.24.0 geomet-0.2.1.post1


In [3]:
import pandas as pd
import cassandra
import re
import os
import json
import csv
import numpy as np
import glob

### Criando uma lista de caminhos para processar os arquivos originais e csv dos eventos

In [6]:
# Show the current working directory; the event folder is resolved relative to it
print(f'Current working directory: {os.getcwd()}')

# Directory that holds the raw event files
filepath = os.path.join(os.getcwd(), 'event_data')

# Collect every event CSV under the directory tree.
# The list is accumulated across all directories visited by os.walk instead of
# being reassigned on each iteration, so a nested folder can no longer replace
# the files found at the top level.
file_path_list = []
for root, dirs, files in os.walk(filepath):
    # Prune hidden folders (e.g. .ipynb_checkpoints) in place so os.walk does
    # not descend into them and pick up checkpoint copies of the data files
    dirs[:] = [d for d in dirs if not d.startswith('.')]
    file_path_list.extend(glob.glob(os.path.join(root, '*.csv')))

# Sort for a deterministic processing order across runs and platforms
file_path_list.sort()

# Fail with an explicit message instead of a NameError/IndexError further down
if not file_path_list:
    raise FileNotFoundError(f'No event CSV files found under {filepath}')

print(file_path_list[0])
print(len(file_path_list))

Current working directory: C:\Users\Gilberto\Desktop\data_science\data_engineering\projeto_02
C:\Users\Gilberto\Desktop\data_science\data_engineering\projeto_02/event_data\2018-11-01-events.csv
30


## Processando os arquivos em um csv que será utilizado nas tabelas do Cassandra

In [8]:
# Accumulates the data rows (header excluded) of every raw event file
full_data_rows_list = []

# Read each raw event file in turn
for event_path in file_path_list:
    with open(event_path, 'r', encoding='utf8', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the header row; the default value keeps an empty file from
        # raising StopIteration
        next(csvreader, None)

        # Append every remaining row of this file
        full_data_rows_list.extend(csvreader)

print(f'Total rows : {len(full_data_rows_list)}')
print(f'Sample data:\n {full_data_rows_list[:5]}')

# Output column name paired with its position in the raw event rows, kept
# together so the header and the extracted values cannot drift apart
output_columns = [
    ('artist', 0),
    ('firstName', 2),
    ('gender', 3),
    ('itemInSession', 4),
    ('lastName', 5),
    ('length', 6),
    ('level', 7),
    ('location', 8),
    ('sessionId', 12),
    ('song', 13),
    ('userId', 16),
]

# Write a single consolidated CSV that the Cassandra loading steps will read
csv.register_dialect('myDialect', quoting=csv.QUOTE_ALL, skipinitialspace=True)

with open('event_data_processed.csv', 'w', encoding='utf8', newline='') as f:
    writer = csv.writer(f, dialect='myDialect')
    writer.writerow([name for name, _ in output_columns])

    for row in full_data_rows_list:
        # Skip blank lines (csv.reader yields [] for them) and rows whose
        # artist field is empty
        if not row or row[0] == '':
            continue
        writer.writerow([row[index] for _, index in output_columns])

Total rows : 8056
Sample data:
 [['', 'Logged In', 'Walter', 'M', '0', 'Frye', '', 'free', 'San Francisco-Oakland-Hayward, CA', 'GET', 'Home', '1.54092E+12', '38', '', '200', '1.54111E+12', '39'], ['', 'Logged In', 'Kaylee', 'F', '0', 'Summers', '', 'free', 'Phoenix-Mesa-Scottsdale, AZ', 'GET', 'Home', '1.54034E+12', '139', '', '200', '1.54111E+12', '8'], ["Des'ree", 'Logged In', 'Kaylee', 'F', '1', 'Summers', '246.30812', 'free', 'Phoenix-Mesa-Scottsdale, AZ', 'PUT', 'NextSong', '1.54034E+12', '139', 'You Gotta Be', '200', '1.54111E+12', '8'], ['', 'Logged In', 'Kaylee', 'F', '2', 'Summers', '', 'free', 'Phoenix-Mesa-Scottsdale, AZ', 'GET', 'Upgrade', '1.54034E+12', '139', '', '200', '1.54111E+12', '8'], ['Mr Oizo', 'Logged In', 'Kaylee', 'F', '3', 'Summers', '144.03873', 'free', 'Phoenix-Mesa-Scottsdale, AZ', 'PUT', 'NextSong', '1.54034E+12', '139', 'Flat 55', '200', '1.54111E+12', '8']]


In [9]:
# Report how many lines the consolidated CSV file contains
with open('event_data_processed.csv', 'r', encoding='utf8') as processed_file:
    line_total = 0
    for _ in processed_file:
        line_total += 1
    print(line_total)

6821


## Agora podemos trabalhar com o arquivo de csv processado. Nele temos as seguintes colunas:

- artist
- firstName of user
- gender of user
- item number in session
- last name of user
- length of the song
- level (subscription tier of the user: paid or free)
- location of the user
- sessionId
- song title
- userId


### Criando um cluster

#### Docker
```shell
$ docker run --name cassandraDb -d -p 7199:7199 -p 7000:7000 -p 9042:9042 -p 9160:9160 -p 7001:7001 cassandra:3.11
```

In [11]:
from cassandra.cluster import Cluster

# Open a connection to the Cassandra node listening on the local machine.
# `cluster` and `session` are used by the cells that follow.
try:
    cluster = Cluster(['127.0.0.1'])
    session = cluster.connect()
    print('Connection Established!')
except Exception as e:
    print(f'Connection Failed. Error: {e}')
    # Re-raise so the notebook stops here; otherwise `session` stays undefined
    # and later cells fail with a misleading NameError
    raise


Connection Established!


### Criando um keyspace

In [12]:
# CQL statement that creates the `sparkify` keyspace when it does not exist yet,
# using SimpleStrategy with a replication factor of 1
keyspace_query = """CREATE KEYSPACE IF NOT EXISTS sparkify
    with REPLICATION =
    { 'class': 'SimpleStrategy', 'replication_factor': 1 }
"""

# Create the keyspace
try:
    session.execute(keyspace_query)
except Exception as e:
    print(f"Failed to create keyspace! Error : {e}")
    # Re-raise so a failed keyspace creation (or a missing `session`) stops the
    # notebook here instead of being reduced to a printed message
    raise

### 