# Part I. ETL Pipeline for Pre-Processing the Files

## PLEASE RUN THE FOLLOWING CODE FOR PRE-PROCESSING THE FILES

#### Import Python packages 

In [13]:
# Import Python packages 
import pandas as pd
import cassandra
import re
import os
import glob
import numpy as np
import json
import csv

#### Creating list of filepaths to process original event csv data files

In [17]:
# Show the current working directory (the event data folder is resolved relative to it)
print(os.getcwd())

# Root folder that holds the raw event CSV files, possibly in subfolders
filepath = os.getcwd() + '/event_data'

# Collect the path of every CSV file found under the event data folder.
# The list is created up front and extended per directory, so it exists even
# when the folder is missing and files from every visited directory are kept
# (re-assigning it inside the loop would keep only the last directory walked).
file_path_list = []
for root, dirs, files in os.walk(filepath):
    # Prune hidden folders (e.g. .ipynb_checkpoints) in place so os.walk does
    # not descend into them and pick up stale copies of the data files.
    dirs[:] = [d for d in dirs if not d.startswith('.')]
    # Only CSV files are wanted; sorting keeps the processing order deterministic.
    file_path_list.extend(sorted(glob.glob(os.path.join(root, '*.csv'))))

C:\src\personal\cassandra_etl


#### Processing the files to create the data file csv that will be used for Apache Cassandra tables

In [24]:
# Rows gathered from every raw event file (header lines excluded)
full_data_rows_list = []

for path in file_path_list:
    with open(path, 'r', encoding='utf8', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the header line; the default keeps an empty file from raising StopIteration
        next(csvreader, None)
        # Append every remaining data row of this file
        full_data_rows_list.extend(csvreader)

print("Found %s rows of data." %len(full_data_rows_list))

# Write the condensed event file, event_datafile_new.csv, that is used to
# insert data into the Apache Cassandra tables
csv.register_dialect('myDialect', quoting=csv.QUOTE_ALL, skipinitialspace=True)

# Positions, within a raw event row, of the columns named in the header written below
kept_columns = (0, 2, 3, 4, 5, 6, 7, 8, 12, 13, 16)

new_filename = 'event_datafile_new.csv'
print('Writing data to %s' %new_filename)
with open(new_filename, 'w', encoding='utf8', newline='') as f:
    writer = csv.writer(f, dialect='myDialect')
    writer.writerow(['artist','firstName','gender','itemInSession','lastName','length',\
                'level','location','sessionId','song','userId'])
    for row in full_data_rows_list:
        # Skip blank lines and rows whose first column (written as 'artist') is empty
        if not row or row[0] == '':
            continue
        writer.writerow([row[i] for i in kept_columns])
print('Done writing data to new file.')

Found 8056 rows of data.
Writing data to event_datafile_new.csv
Done writing data to new file.


In [22]:
# check the number of lines (header line included) in the new csv file
with open('event_datafile_new.csv', 'r', encoding = 'utf8') as f:
    print(sum(1 for line in f))

6821


# Part II. Complete the Apache Cassandra coding portion of your project. 

## Now you are ready to work with the CSV file titled <font color=red>event_datafile_new.csv</font>, located within the Workspace directory.  The event_datafile_new.csv contains the following columns: 
- artist 
- firstName of user
- gender of user
- item number in session
- last name of user
- length of the song
- level (paid or free song)
- location of the user
- sessionId
- song title
- userId

The image below is a screenshot of what the denormalized data should look like in the <font color=red>**event_datafile_new.csv**</font> after the code above is run:<br>

<img src="images/image_event_datafile_new.jpg">

## Begin writing your Apache Cassandra code in the cells below

#### Creating a Cluster

In [25]:
# This should make a connection to a Cassandra instance on your local machine.
# Cluster() is created without arguments, so the driver's default contact point
# is used (presumably 127.0.0.1 -- confirm against the driver documentation).

from cassandra.cluster import Cluster
cluster = Cluster()

# A session is needed to establish the connection and to execute queries
session = cluster.connect()

#### Create Keyspace

In [38]:
# Create the sparkify keyspace unless it already exists (SimpleStrategy, replication factor 1)
session.execute("CREATE KEYSPACE IF NOT EXISTS sparkify WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };")

<cassandra.cluster.ResultSet at 0x1584869c9e8>

#### Set Keyspace

In [41]:
# Make sparkify the keyspace for the statements this session executes from here on
session.execute("USE sparkify;")

<cassandra.cluster.ResultSet at 0x158486964a8>

### Helper Methods
A few methods to help make data insertion easier

In [70]:
DATA_FILE = 'event_datafile_new.csv'

# Column keyword -> (position in a row of event_datafile_new.csv, converter).
# A converter of None means the raw value is returned unchanged.
_COLUMN_PARSERS = {
    'artist': (0, None),
    'firstName': (1, None),
    'gender': (2, None),
    'itemInSession': (3, int),
    'lastName': (4, None),
    'length': (5, float),
    'level': (6, None),
    'location': (7, None),
    'sessionId': (8, int),
    'song': (9, None),
    'userId': (10, int),
}

def get_data_by_key(row, key):
    """ Helper method which maps a column keyword to the value in the
        data row, parsed to the appropriate data type

        Only the requested column is looked up and converted, so a malformed
        value in an unrelated column does not make the call fail.

        Arguments:
            row {str[]} -- [represents a single row from event_datafile_new.csv, split by column]
            key {str} -- [the column which you want to parse]

        Returns:
            the value of the requested column: int for 'itemInSession',
            'sessionId' and 'userId', float for 'length', otherwise the
            value as it appears in the row

        Raises:
            KeyError -- [key is not a known column keyword]
            IndexError -- [row is too short to hold the requested column]
            ValueError -- [the requested numeric column cannot be converted]
    """

    index, convert = _COLUMN_PARSERS[key]
    value = row[index]
    return value if convert is None else convert(value)

def insert_data_into_table(insertStatement, dataToBeInserted):
    """ Helper method which executes the provided CQL statement once per data
        row of DATA_FILE, passing in the values represented by the keys passed in

        Arguments:
            insertStatement {str} -- [CQL insert statement where each data to be inserted has been replaced with %s]
            dataToBeInserted {str[]} -- [array of keys (in order) to be inserted from each row in event_datafile_new.csv]
    """

    # newline='' is what the csv module asks for, so that quoted fields
    # containing line breaks are read back correctly
    with open(DATA_FILE, encoding='utf8', newline='') as f:
        csvreader = csv.reader(f)
        next(csvreader, None) # skip header (the default tolerates an empty file)
        for line in csvreader:
            # csv.reader yields an empty list for a blank line; nothing to insert
            if not line:
                continue
            # Build a concrete tuple: the driver documents a sequence or dict of parameters
            values = tuple(get_data_by_key(line, key) for key in dataToBeInserted)
            session.execute(insertStatement, values)

## Create queries to ask the following three questions of the data

### 1. Find the artist, song title and song's length in the music app history that was heard during sessionId = 338, and itemInSession  = 4    

In [73]:
# CQL statements for query 1: look up a song by (session, item in session)
song_sessions_table_drop = "DROP TABLE IF EXISTS song_sessions"

# The primary key mirrors the WHERE clause of the select below:
# session_id comes first, followed by item_in_session
song_sessions_table_create = """
CREATE TABLE IF NOT EXISTS song_sessions(
    session_id INT, 
    item_in_session INT,
    artist VARCHAR,
    song_title VARCHAR,
    length DECIMAL,
    PRIMARY KEY (session_id, item_in_session)
);
"""

# Column order here must match the key order passed to insert_data_into_table below
song_sessions_insert = """
INSERT INTO song_sessions (session_id, item_in_session, artist, song_title, length)
    VALUES (%s, %s, %s, %s, %s)
"""

song_sessions_select = """
SELECT artist, song_title, length
FROM song_sessions
WHERE session_id = 338
AND item_in_session = 4
"""

# Drop and Create Table (dropping first gives a clean table on every run)
session.execute(song_sessions_table_drop)
session.execute(song_sessions_table_create)

# Insert Data into Table
insert_data_into_table(song_sessions_insert, ['sessionId', 'itemInSession', 'artist', 'song', 'length'])

# Query table to answer question
rows = session.execute(song_sessions_select)
for row in rows:
    print(row.artist, row.song_title, row.length)

Faithless Music Matters (Mark Knight Dub) 495.3073


### 2. Find only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userid = 10, sessionid = 182

In [87]:
# CQL statements for query 2: songs a user played in one session, in play order
user_sessions_table_drop = "DROP TABLE IF EXISTS user_sessions"

# The primary key starts with the two columns filtered on (user_id, session_id);
# item_in_session follows so the rows can be ordered by it
user_sessions_table_create = """
CREATE TABLE IF NOT EXISTS user_sessions(
    user_id INT,
    session_id INT, 
    item_in_session INT,
    artist VARCHAR,
    song_title VARCHAR,
    first_name VARCHAR,
    last_name VARCHAR,
    PRIMARY KEY (user_id, session_id, item_in_session)
);
"""

# Column order here must match the key order passed to insert_data_into_table below
user_sessions_insert = """
INSERT INTO user_sessions (user_id, session_id, item_in_session, artist, song_title, first_name, last_name)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
"""

user_sessions_select = """
SELECT artist, song_title, first_name, last_name
FROM user_sessions
WHERE user_id = 10
AND session_id = 182 
ORDER BY session_id, item_in_session
"""

# Drop and Create Table
# These steps must run: the select below fails on a fresh keyspace, or after
# the tables have been dropped at the end of the notebook, if the table is missing
session.execute(user_sessions_table_drop)
session.execute(user_sessions_table_create)

# Insert Data into Table
insert_data_into_table(user_sessions_insert, ['userId', 'sessionId', 'itemInSession', 'artist', 'song', 'firstName', 'lastName'])

# Query table to answer question
rows = session.execute(user_sessions_select)
for row in rows:
    print("%20s\t%55s\t%10s\t%10s" %(row.artist, row.song_title, row.first_name, row.last_name))

    Down To The Bone	                                     Keep On Keepin' On	    Sylvie	      Cruz
        Three Drives	                                            Greece 2000	    Sylvie	      Cruz
   Sebastien Tellier	                                              Kilometer	    Sylvie	      Cruz
       Lonnie Gordon	   Catch You Baby (Steve Pitron & Max Sanna Radio Edit)	    Sylvie	      Cruz


### 3. Find every user name (first and last) in music app history who listened to the song 'All Hands Against His Own'

In [None]:
## TO-DO: Query 3: Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'


                    

### Drop the tables before closing out the sessions

In [None]:
# Drop every table created in this notebook. The names must match the
# *_table_drop statements defined in the query cells (song_sessions_table_drop,
# not song_session_table_drop, which is undefined and raises NameError).
drop_statements = [song_sessions_table_drop, user_sessions_table_drop]
for drop in drop_statements:
    session.execute(drop)

### Close the session and cluster connection

In [None]:
# Shut down the session first, then the cluster object it was created from
session.shutdown()
cluster.shutdown()