# Part I. ETL Pipeline for Pre-Processing the Files

#### Import Python packages 

In [321]:
import pandas as pd
import cassandra
import re
import os
import glob
import numpy as np
import json
import csv
from cassandra.cluster import Cluster

#### Creating list of filepaths to process original event csv data files

In [322]:
# Directory holding the raw event CSV files, resolved against the current
# working directory.
filepath = os.path.join(os.getcwd(), 'event_data')

# Collect the CSV files from every directory under `filepath`. The list is
# created up front and extended per directory so that it exists even when
# `filepath` is missing or empty, and so that files found in one directory
# are not discarded when os.walk moves on to the next one.
file_path_list = []
for root, dirs, files in os.walk(filepath):
    # Prune hidden directories (e.g. Jupyter's .ipynb_checkpoints) in place so
    # os.walk does not descend into them and pick up stale copies.
    dirs[:] = [d for d in dirs if not d.startswith('.')]
    # Sorted so the processing order is deterministic between runs.
    file_path_list.extend(sorted(glob.glob(os.path.join(root, '*.csv'))))

#### Processing the files to create the data file csv that will be used for Apache Cassandra tables

In [323]:
class ColumnNames:
    """Header names of the columns kept in the consolidated event CSV.

    Each attribute holds the exact header string, so the code that writes the
    CSV and the code that reads it back by column name stay in agreement.
    """

    # Song that was played
    artist = "artist"
    song = "song"
    length = "length"

    # Listener
    user_id = "userId"
    first_name = "firstName"
    last_name = "lastName"
    gender = "gender"
    level = "level"
    location = "location"

    # Session and the position of the play inside it
    session_id = "sessionId"
    item_in_session = "itemInSession"

# Rows gathered from every source file (header lines excluded).
full_data_rows_list = []
# Consolidated csv file; the Cassandra loading cells read their data from it.
file = 'event_datafile_new.csv'

# Positions, within a raw event row, of the columns that are kept. The order
# matches the header row written below.
# NOTE(review): assumes every raw file shares the same column layout --
# confirm against the source data.
kept_column_indexes = (0, 2, 3, 4, 5, 6, 7, 8, 12, 13, 16)

for f in file_path_list:
    with open(f, 'r', encoding='utf8', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the header line. The default value keeps an empty file from
        # raising StopIteration and aborting the whole run.
        next(csvreader, None)
        full_data_rows_list.extend(csvreader)

# Write the reduced data set to `file` (event_datafile_new.csv), which is
# used to insert data into the Apache Cassandra tables.
csv.register_dialect('myDialect', quoting=csv.QUOTE_ALL, skipinitialspace=True)

with open(file, 'w', encoding='utf8', newline='') as f:
    writer = csv.writer(f, dialect='myDialect')
    writer.writerow([
        ColumnNames.artist,
        ColumnNames.first_name,
        ColumnNames.gender,
        ColumnNames.item_in_session,
        ColumnNames.last_name,
        ColumnNames.length,
        ColumnNames.level,
        ColumnNames.location,
        ColumnNames.session_id,
        ColumnNames.song,
        ColumnNames.user_id,
    ])
    for row in full_data_rows_list:
        # Skip rows whose artist field is empty. csv.reader yields [] for a
        # blank line, so guard against that before indexing.
        if not row or row[0] == '':
            continue
        writer.writerow(tuple(row[i] for i in kept_column_indexes))

#### Checking the number of rows in the generated csv file

In [324]:
# Report how many lines the consolidated csv file contains (header line included)
with open(file, mode='r', encoding='utf8') as f:
    line_count = 0
    for _ in f:
        line_count += 1
    print(line_count)

6821


#### Creating a Cluster

In [325]:
# Build the cluster handle with 'db' as its only contact point
contact_points = ['db']
cluster = Cluster(contact_points)

# Open the session through which every statement below is executed
session = cluster.connect()

#### Create Keyspace

In [326]:
# CQL that creates the `sparkify` keyspace unless it already exists
# (SimpleStrategy with a single replica).
create_keyspace_cql = """
CREATE KEYSPACE IF NOT EXISTS sparkify
  WITH REPLICATION = { 
   'class' : 'SimpleStrategy', 
   'replication_factor' : 1 
  };
"""
session.execute(create_keyspace_cql)

<cassandra.cluster.ResultSet at 0x7f75e9704c10>

#### Set Keyspace

In [327]:
# Make `sparkify` the session's keyspace so the statements below can name
# their tables without a keyspace prefix.
session.set_keyspace("sparkify")

# Part II. Create queries to ask the following three questions of the data

###  Question 1. Give me the artist, song title and song's length in the music app history that was heard during  sessionId = 338, and itemInSession  = 4


The Question 1 expects artist, song title and length of a song based on the sessionId and itemInSession attributes.  

As we are working with a NoSQL database, we need to think about the query first which will be used to fetch the data based on which we will create the Table required.


1. The expected output is : "artist, song title and length of a song"
2. Based on : "sessionId and itemInSession"

From the above two points we know the query to get the data will be a SELECT statement like :

`SELECT artist, title of a song, length of a song FROM TABLE_NAME WHERE sessionId = value And itemInSession = value`

As we know the SELECT query, we can move on to the CREATE TABLE query. We will add `IF NOT EXISTS` to the `CREATE` statement so that the table is only created if it does not already exist. Now we need to select the columns that are going to be in the table and the PRIMARY KEY.

**Column Names:**

The query returns the artist, song title and song length, and filters on sessionId and itemInSession. Hence the table should have artist, song_title, song_length, session_id and item_in_session columns.

**Primary Key:**

The Primary key for the table should uniquely identify each row in the table. Primary key consists of partition key(which will be used by db to know to which node to route specific query request) and clustering columns(which are used to order rows inside dataset). 

**Partition key**:

To reduce the chance of hot-spot servers, the partition key should spread the data as evenly as possible. In our case the combination of session_id and item_in_session is the best choice: the query filters on both columns, and keying the table on only one of them would not support the query out of the box in Apache Cassandra. Filtering on the other column would then require the query's `ALLOW FILTERING` option, which performs worse than the composite partition key.

**Clustering columns:**

We also add user_id as a clustering column to guarantee Primary key uniqueness.

#### Create the table and populate it

In [328]:
# Table for question 1. (session_id, item_in_session) is the composite
# partition key the query filters on; user_id is the clustering column.
# NOTE(review): song_length is a CQL `float`, so values come back with
# single-precision rounding (see the query output) -- consider `double` if
# the exact length matters.
session.execute("""
CREATE TABLE IF NOT EXISTS song_session(
   session_id bigint,
   item_in_session int,
   user_id bigint,
   artist text, 
   song_title text, 
   song_length float,
   PRIMARY KEY ((session_id, item_in_session), user_id)
   )
""")
# Prepare the INSERT once; it is executed for every csv row below.
query = session.prepare("""
INSERT INTO song_session(session_id, item_in_session, user_id, artist, song_title, song_length) VALUES (?,?,?,?,?,?)
""")

# newline='' leaves line-ending handling to the csv module, matching how the
# file was written.
with open(file, encoding='utf8', newline='') as f:
    csvreader = csv.DictReader(f)
    for line in csvreader:
        # Values follow the column order of the INSERT statement; the csv
        # strings are converted to the numeric column types.
        values = (
            int(line[ColumnNames.session_id]),
            int(line[ColumnNames.item_in_session]),
            int(line[ColumnNames.user_id]),
            line[ColumnNames.artist],
            line[ColumnNames.song],
            float(line[ColumnNames.length]),
        )
        session.execute(query, values)

#### Get an answer to the question

In [329]:
# Answer question 1: the play at itemInSession 4 of session 338
rows = session.execute("""
SELECT artist, song_title, song_length FROM song_session
WHERE session_id = 338 And item_in_session = 4
""")
for row in rows:
    print(row)

Row(artist='Faithless', song_title='Music Matters (Mark Knight Dub)', song_length=495.30731201171875)


### Question 2. Give me only the following: name of artist, song (sorted by itemInSession) and user name (first and last name) for userid = 10, sessionid = 182

Question 2 expects the name of the artist, the song title (sorted by itemInSession) and the user name (first and last name) based on userid and sessionid.

As we are working with a NoSQL database, we need to think about the query first which will be used to fetch the data based on which we will create the Table required.

1. The expected output is : "Artist name, song title, user name"
2. Based on : "userid and sessionid"
3. Ordered by itemInSession

From the first two points we know the query to get the data will be a SELECT statement like:

`SELECT artist, title of a song, user name FROM TABLE_NAME WHERE user_id = value And session_id = value`

As we know the SELECT query, we can move on to the CREATE TABLE query. We will add `IF NOT EXISTS` to the `CREATE` statement so that the table is only created if it does not already exist. Now we need to select the columns that are going to be in the table and the PRIMARY KEY.

**Column Names:**

We need the artist, song title and user name, queried by userid and sessionid, and we also need itemInSession for ordering. Hence the table should have artist, song_title, username, user_id, session_id and item_in_session columns.

**Primary Key:**

The Primary key for the table should uniquely identify each row in the table. Primary key consists of partition key(which will be used by db to know to which node to route specific query request) and clustering columns(which are used to order rows inside dataset). 

**Partition key:**

As we want to decrease the chance of hot-spot servers, the partition key should be as random as possible. In our case the combination of session_id and user_id should give us unique values.

**Clustering columns:**

As for the clustering columns we will choose item_in_session as the query results should be sorted by it. 

#### Create the table and populate it

In [330]:
# Table for question 2. (session_id, user_id) is the composite partition key
# the query filters on; item_in_session is the clustering column, which
# orders the rows inside each partition.
session.execute("""
CREATE TABLE IF NOT EXISTS user_session (
   session_id bigint,
   user_id bigint,
   item_in_session int,
   artist text, 
   song_title text, 
   username text,
   PRIMARY KEY ((session_id, user_id), item_in_session)
   );
""")
# Prepare the INSERT once; it is executed for every csv row below.
query = session.prepare("""
INSERT INTO user_session(session_id, user_id, item_in_session, artist, song_title, username)
VALUES (?,?,?,?,?,?)
""")

# newline='' leaves line-ending handling to the csv module, matching how the
# file was written.
with open(file, encoding='utf8', newline='') as f:
    csvreader = csv.DictReader(f)
    for line in csvreader:
        # Values follow the column order of the INSERT statement; username is
        # stored as "<first name> <last name>".
        values = (
            int(line[ColumnNames.session_id]),
            int(line[ColumnNames.user_id]),
            int(line[ColumnNames.item_in_session]),
            line[ColumnNames.artist],
            line[ColumnNames.song],
            line[ColumnNames.first_name] + " " + line[ColumnNames.last_name],
        )
        session.execute(query, values)
                    

#### Get an answer to the question

In [331]:
# Answer question 2: everything user 10 played in session 182
rows = session.execute("""
 SELECT artist, song_title, username FROM user_session
 WHERE user_id = 10 And session_id = 182
""")

for row in rows:
    print(row)

Row(artist='Down To The Bone', song_title="Keep On Keepin' On", username='Sylvie Cruz')
Row(artist='Three Drives', song_title='Greece 2000', username='Sylvie Cruz')
Row(artist='Sebastien Tellier', song_title='Kilometer', username='Sylvie Cruz')
Row(artist='Lonnie Gordon', song_title='Catch You Baby (Steve Pitron & Max Sanna Radio Edit)', username='Sylvie Cruz')


### Question 3. Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'


Question 3 expects the user name based on the song title.

As we are working with a NoSQL database, we need to think about the query first which will be used to fetch the data based on which we will create the Table required.


1. The expected output is : "user name"
2. Based on : "song title"

From the above two points we know the query to get the data will be a SELECT statement like :

`
SELECT username FROM TABLE_NAME WHERE song=value
`

As we know the SELECT query, we can move on to the CREATE TABLE query. We will add `IF NOT EXISTS` to the `CREATE` statement so that the table is only created if it does not already exist. Now we need to select the columns that are going to be in the table and the PRIMARY KEY.

**Column Names:**

As we need user name and song, hence the table should have username and song_title columns.

**Primary Key:**

The Primary key for the table should uniquely identify each row in the table.
Primary key consists of partition key (which will be used by db to know to which node to route specific query request) and clustering columns (which are used to order rows inside dataset).

**Partition key:**

As we want to decrease the chance of hot-spot servers, the partition key should be as random as possible. In this case the `WHERE` clause contains only one column, the song title, so we pick it as the partition key.

**Clustering columns:**

We will pick username as a clustering column to guarantee Primary key uniqueness

#### Create the table and populate it

In [332]:
# Table for question 3. `song` is the partition key the query filters on;
# `username` is the clustering column.
# NOTE(review): rows are keyed by (song, username), so two different users
# who share a full name and played the same song are stored as a single row
# -- consider keying on the user id if that distinction matters.
session.execute("""
CREATE TABLE IF NOT EXISTS user_listening_history(
   username text,
   song text,
   PRIMARY KEY (song, username)
   );
""")
# Prepare the INSERT once; it is executed for every csv row below.
query = session.prepare("""
INSERT INTO user_listening_history(song, username) VALUES (?,?)
""")

# newline='' leaves line-ending handling to the csv module, matching how the
# file was written.
with open(file, encoding='utf8', newline='') as f:
    csvreader = csv.DictReader(f)
    for line in csvreader:
        # username is stored as "<first name> <last name>".
        values = (
            line[ColumnNames.song],
            line[ColumnNames.first_name] + " " + line[ColumnNames.last_name],
        )
        session.execute(query, values)

#### Get an answer to the question

In [333]:
# Answer question 3: names of the users stored for this song title
listeners = session.execute("""
SELECT username FROM user_listening_history
WHERE song='All Hands Against His Own'
""")
result = []
for listener in listeners:
    result.append(listener.username)
print("Users that listened to the song 'All Hands Against His Own' - ", result)

Users that listened to the song 'All Hands Against His Own' -  ['Jacqueline Lynch', 'Sara Johnson', 'Tegan Levine']


### Drop the tables before closing out the sessions

In [334]:
# Drop the three query tables, in the order they were created
drop_statements = (
    """
DROP TABLE IF EXISTS sparkify.song_session
""",
    """
DROP TABLE IF EXISTS sparkify.user_session
""",
    """
DROP TABLE IF EXISTS sparkify.user_listening_history
""",
)
for statement in drop_statements:
    session.execute(statement)

<cassandra.cluster.ResultSet at 0x7f75a2c48970>

### Close the session and cluster connection

In [335]:
# Close the session first, then the cluster object it was created from.
session.shutdown()
cluster.shutdown()