# Data Modeling with Apache Cassandra

## Part I. ETL Pipeline for Pre-Processing the Files

### Imports

In [1]:
import pandas as pd
import cassandra
import re
import os
import glob
import numpy as np
import json
import csv

### Process Data

In [36]:
def process_data(filepath):
    
    # Create list of files
    for root, dirs, files in os.walk(filepath):
        file_path_list = glob.glob(os.path.join(root, '*')) # Everything in event_data folder
    
    # Read-in first data frame
    df = pd.read_csv(file_path_list[0], encoding = 'utf8', na_filter = False)
    
    # Concatenate data frames in file_path_list
    for i in range(1, len(file_path_list)):
        df_to_concat = pd.read_csv(file_path_list[i], encoding = 'utf8', na_filter = False)
        df = pd.concat([df, df_to_concat])
    
    # Save full data to a concated csv
    df.to_csv('./event_datafile_full.csv', index = False)
    
    # Drop empty artist rows
    df = df[df['artist'] != ''] 
    
    # Select Columns of Interest
    df = df[['artist', 'firstName', 'gender', 'itemInSession', 'lastName', 'length', 'level', 'location', 'sessionId', 'song', 'userId']]
    
    return df.reset_index(drop = True)

In [43]:
df = process_data('./event_data')

In [44]:
df.head(3)

Unnamed: 0,artist,firstName,gender,itemInSession,lastName,length,level,location,sessionId,song,userId
0,Harmonia,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",583,Sehr kosmisch,26
1,The Prodigy,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",583,The Big Gundown,26
2,Train,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",583,Marry Me,26


In [49]:
mask1 = df['sessionId'] == 182 
mask2 = df['userId'] == 10

df[mask1 & mask2].sort_values(by = 'itemInSession')

Unnamed: 0,artist,firstName,gender,itemInSession,lastName,length,level,location,sessionId,song,userId
4573,Down To The Bone,Sylvie,F,0,Cruz,333.76608,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",182,Keep On Keepin' On,10
4574,Three Drives,Sylvie,F,1,Cruz,411.6371,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",182,Greece 2000,10
4575,Sebastien Tellier,Sylvie,F,2,Cruz,377.73016,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",182,Kilometer,10
4576,Lonnie Gordon,Sylvie,F,3,Cruz,181.21098,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",182,Catch You Baby (Steve Pitron & Max Sanna Radio...,10


### Connect to Cluster and Create Keyspace

In [41]:
def connect_to_cluster(cluster_location):
    from cassandra.cluster import Cluster
    cluster = Cluster([cluster_location])
    session = cluster.connect()
    return session

### Create Keyspace

In [42]:
def create_keyspace(session, keyspace_name):
    query = f'''CREATE KEYSPACE IF NOT EXISTS {keyspace_name}
           WITH REPLICATION = { 'class': 'SimpleStrategy', 'replication_factor' : 1}'''
    session.execute(query)
    session.set_keyspace(keyspace_name)
    return session

