# Part I. ETL Pipeline for Pre-Processing the Files

#### Import Python packages 

In [2]:
# Import Python packages 
import pandas as pd
import cassandra
import re
import os
import glob
import numpy as np
import json
import csv
import sys
import ftfy

#### Creating list of filepaths to process original event csv data files

In [None]:
# Show the current working directory so the data location can be verified
print(os.getcwd())

# Folder (searched recursively) that holds the raw event CSV files
filepath = os.path.join(os.getcwd(), 'event_data')

# Collect every CSV file under `filepath`. The list is created once and
# extended per directory, so files from all folders are kept (rebinding it
# inside the loop would keep only the last directory visited) and the name is
# defined even when the folder is missing or empty.
file_path_list = []
for root, dirs, files in os.walk(filepath):
    # Prune hidden folders in place (e.g. '.ipynb_checkpoints') so os.walk
    # does not descend into them and pick up stale copies of the data files
    dirs[:] = [d for d in dirs if not d.startswith('.')]
    # Only '*.csv' is matched, so sub-directories and unrelated files are
    # never handed to the CSV reader; sorting keeps the order deterministic
    file_path_list.extend(sorted(glob.glob(os.path.join(root, '*.csv'))))
#print(file_path_list)

#### Processing the files to create the data file csv that will be used for Apache Cassandra tables

In [None]:
# Creates a new CSV file, incorporating all CSVs in the 'event_data' directory

# Columns copied to the output file, in the order they are written
desired_cols = ['artist', 'firstName', 'gender', 'itemInSession', 'lastName', 'length',
                'level', 'location', 'sessionId', 'song', 'userId']
# Header of the first input file read; every later file must match it exactly
header = None

csv.register_dialect('myDialect', quoting=csv.QUOTE_ALL, skipinitialspace=True)

with open('event_datafile_new.csv', 'w', encoding='utf8', newline='') as output_file:
    writer = csv.writer(output_file, dialect='myDialect')
    writer.writerow(desired_cols)

    # Append the rows of every input file to the single output file
    for path in file_path_list:
        with open(path, 'r', encoding='utf8', newline='') as input_file:
            csvreader = csv.reader(input_file)

            # A completely empty file has no header row; skip it instead of
            # letting next() raise StopIteration
            input_header = next(csvreader, None)
            if input_header is None:
                continue

            # Check that the columns of all files are ordered the same
            if header is not None:
                if header != input_header:
                    raise Exception(f"File {path} has differently-ordered columns")
            else:
                header = input_header
                # Column name -> position, used to write columns in a consistent order
                h_indices = {key: val for val, key in enumerate(header)}

            # Extract the data rows one by one and write the desired columns
            for line in csvreader:
                # csv.reader yields [] for a blank line, so test for that
                # before indexing; rows with an empty first column are dropped
                if not line or line[0] == '':
                    continue
                # Fixing corrupted characters
                line = [ftfy.fix_text(item) for item in line]
                # Writing columns in the order specified by desired_cols
                writer.writerow([line[h_indices[key]] for key in desired_cols])

# Part II. Creating Cassandra tables & inserting data

#### Creating & connecting to a Cluster

In [None]:
from cassandra.cluster import Cluster

# Connection details of the Cassandra node. Port 9042 is used because the
# Docker-hosted Cassandra instance being connected to has this port exposed.
contacts = ['127.0.0.1']
port = 9042

cluster = Cluster(contact_points=contacts, port=port)

# A session is needed to establish the connection and begin executing queries
session = cluster.connect()

#### Creating Keyspace

In [None]:
# Create the keyspace used by this notebook if it does not exist yet
# (SimpleStrategy with a replication factor of 1).
try:
    session.execute('''
    CREATE KEYSPACE IF NOT EXISTS music_sparkify_cassandra
    WITH REPLICATION =
    {'class':'SimpleStrategy','replication_factor':'1'}
    ''')
    
except Exception as e:
    # A failure is only printed; execution continues with the following cells
    print(e)


#### Setting Keyspace

In [None]:
# Make the keyspace created above the default for all later statements on this session
session.set_keyspace('music_sparkify_cassandra')

In [1]:
def data_insert(table, attributes_dict):
    '''
    Reads 'event_datafile_new.csv' and inserts one row per CSV line into a Cassandra table.

    table: name of the target table.
    attributes_dict: maps each column to insert to the callable used to cast its
        CSV string value (e.g. int, float, str). Every key must match a column
        name in the CSV header row; the dict order is the column order of the INSERT.

    Relies on the module-level `session` for executing the statements.
    '''
    file = 'event_datafile_new.csv'

    query = make_insert_query_string(table, attributes_dict)

    # newline='' is what the csv module expects of files handed to csv.reader;
    # without it, newlines embedded in quoted fields are not read back correctly
    with open(file, encoding='utf8', newline='') as f:
        csvreader = csv.reader(f)
        columns = next(csvreader)  # record & skip col names
        for line in csvreader:
            # csv.reader yields [] for a blank line; there is nothing to insert
            if not line:
                continue
            values = get_insert_values(columns, attributes_dict, line)
            session.execute(query, tuple(values))

def make_insert_query_string(table, attributes):
    '''
    Builds a basic Cassandra INSERT statement for `table`.

    `attributes` supplies the column names (iterating it yields them, so a list
    or a dict keyed by column name both work); the statement gets one "%s"
    placeholder per column.
    '''
    column_list = ", ".join(attributes)
    placeholders = ", ".join("%s" for _ in range(len(attributes)))
    return f"INSERT INTO {table} ({column_list}) VALUES ({placeholders})"

def get_insert_values(columns, attribute_dict, data):
    '''
    Returns the values of one data row, cast for insertion.

    For every (attribute, cast function) pair in 'attribute_dict', the position
    of the attribute in 'columns' is looked up, the element of 'data' at that
    position is passed through the cast function, and the results are collected
    in dict order.
    Requires that all keys of 'attribute_dict' match an element in 'columns'.
    '''
    converted = []
    for name, caster in attribute_dict.items():
        position = columns.index(name)
        converted.append(caster(data[position]))
    return converted


### Query #1
Give me the artist, song title and song's length in the music app history that was heard during sessionId = 338, and itemInSession = 4

In [None]:
# Table for Query #1. The primary key is (sessionId, itemInSession): sessionId
# is the partition key and itemInSession the clustering column, which matches
# the two equality filters the query uses.
session.execute(
    '''
    CREATE TABLE IF NOT EXISTS songs_by_session
    (sessionId int,
    itemInSession int,
    artist text,
    song text,
    length float,
    PRIMARY KEY (sessionId, itemInSession))
    '''
)

In [3]:
# Inserting data into the newly-created table.
# Each key is a CSV column to insert; each value is the callable used to cast
# the CSV string. The dict order is the column order of the INSERT statement.
attributes_dict = {
    'sessionId':int,
    'itemInSession':int,
    'artist':str,
    'song':str,
    'length':float
}
table = 'songs_by_session'

data_insert(table,attributes_dict)

INSERT INTO songs_by_session (sessionId, itemInSession, artist, song, length) VALUES (%s, %s, %s, %s, %s)


In [None]:
# Verifying data insertion through a SELECT statement.
# Both primary-key columns of songs_by_session are restricted by equality.
result = session.execute(
    '''SELECT artist,song,length FROM songs_by_session WHERE sessionId = 338 AND itemInSession = 4'''
)
# Print every returned row
for row in result:
    print(row)

### Query #2
Give me only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userid = 10, sessionid = 182

In [None]:
# Table for Query #2. The primary key is (userId, sessionId, itemInSession):
# userId is the partition key; sessionId and itemInSession are clustering
# columns, so rows of one session are stored ordered by itemInSession.
# NOTE(review): all sessions of a user share one partition; a composite
# partition key ((userId, sessionId), itemInSession) may spread data better.
session.execute(
    '''
    CREATE TABLE IF NOT EXISTS songsUser_by_sessionUser
    (userId int,
    sessionId int,
    itemInSession int,
    artist text,
    song text,
    firstName text,
    lastName text,
    PRIMARY KEY (userId, sessionId, itemInSession))
    '''
)

In [4]:
# Inserting data into the newly-created table.
# Each key is a CSV column to insert; each value is the callable used to cast
# the CSV string. The dict order is the column order of the INSERT statement.
attributes_dict = {
    'userId':int,
    'sessionId':int,
    'itemInSession':int,
    'artist':str,
    'song':str,
    'firstName':str,
    'lastName':str
}
table = 'songsUser_by_sessionUser'

data_insert(table,attributes_dict)

INSERT INTO songsUser_by_sessionUser (sessionId, userId, itemInSession, artist, song, firstName, lastName) VALUES (%s, %s, %s, %s, %s, %s, %s)


In [None]:
# Verifying data insertion through a SELECT statement.
# Filters on userId (partition key) and sessionId (first clustering column),
# then orders the rows by itemInSession (second clustering column).
result = session.execute(
    '''SELECT artist,song,firstName,lastName FROM songsUser_by_sessionUser WHERE userId = 10 AND sessionId = 182 ORDER BY itemInSession'''
)
# Print every returned row
for row in result:
    print(row)

### Query #3
Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'

In [None]:
# Table for Query #3. The primary key is (artist, song, userId): artist is the
# partition key; song and userId are clustering columns, so each user appears
# once per (artist, song).
# NOTE(review): because artist is the partition key, a lookup has to supply the
# artist as well as the song title. If the query should work from the song
# title alone, PRIMARY KEY (song, userId) would fit better, but the SELECT that
# reads this table would have to change with it.
session.execute(
    '''
    CREATE TABLE IF NOT EXISTS users_by_song
    (
    artist text,
    song text,
    userId int,
    firstName text,
    lastName text,
    PRIMARY KEY (artist, song, userId))
    '''
)     

In [None]:
# Inserting data into the newly-created table.
# Each key is a CSV column to insert; each value is the callable used to cast
# the CSV string. The dict order is the column order of the INSERT statement.
attributes_dict = {
    'artist':str,
    'song':str,
    'userId':int,
    'firstName':str,
    'lastName':str
}

table = 'users_by_song'

data_insert(table,attributes_dict)

In [None]:
# Verifying data insertion through a SELECT statement.
# The artist is supplied in addition to the song title because artist is the
# partition key of users_by_song.
result = session.execute(
    "SELECT firstName,lastName FROM users_by_song WHERE artist = 'The Black Keys' AND song = 'All Hands Against His Own'"
)

# Print every returned row
for row in result:
    print(row)

### Drop the tables before closing out the sessions

In [None]:
# IF EXISTS makes this cell safe to re-run: a plain DROP TABLE raises an error
# when the table is already gone (e.g. after an earlier, partial run)
session.execute("DROP TABLE IF EXISTS songs_by_session")

In [None]:
# IF EXISTS makes this cell safe to re-run: a plain DROP TABLE raises an error
# when the table is already gone (e.g. after an earlier, partial run)
session.execute("DROP TABLE IF EXISTS songsUser_by_sessionUser")

In [None]:
# IF EXISTS makes this cell safe to re-run: a plain DROP TABLE raises an error
# when the table is already gone (e.g. after an earlier, partial run)
session.execute("DROP TABLE IF EXISTS users_by_song")

### Close the session and cluster connection

In [None]:
# Close the session first, then the cluster object it was created from
session.shutdown()
cluster.shutdown()