### Block used for cassandra functions

In [2]:
import os
import sys
import numpy
# Resolve the sibling ``../cassandra`` directory to an absolute path and
# register it on sys.path (once) so modules stored there can be imported.
cassandra_dir = os.path.join('../cassandra')
module_path = os.path.abspath(cassandra_dir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

ModuleNotFoundError: No module named 'numpy'

### Import all functions needed for this test

In [5]:
from cassandra.cluster import Cluster
from functions import *
from cassandra import ConsistencyLevel
from datetime import datetime
import time

### Define Keyspace and init cluster

In [None]:
# Name of the keyspace that the benchmark drops, recreates and then works in.
KEYSPACE = 'benchmark'
# Cluster handle built with the driver's defaults (no contact points are
# passed here); the actual connection is opened later via cluster.connect().
cluster = Cluster()


### Parse CSV file into an array and extract its header

In [None]:

# Path to the movies CSV, relative to the notebook's working directory.
filename = '../tmdb_5000_movies.csv'
# parse_CSV is not defined in this notebook (presumably it comes from the
# `functions` star import and returns a list of rows -- TODO confirm).
csv_data = parse_CSV(filename)
# Remove the first row from csv_data and keep it as the column-name header;
# csv_data then holds only the remaining rows.
csv_header = csv_data.pop(0)

### Convert the array into a list of dictionaries keyed by the header

In [None]:
# Build one dict per CSV row, mapping each header column name to the value at
# the same position in that row.
movies_table = [
    {column: record[position] for position, column in enumerate(csv_header)}
    for record in csv_data
]

### Init KeySpace

In [None]:

# Open a session on the cluster created earlier.
session = cluster.connect()
# Recreate the keyspace so each run starts from the same state. Both helpers
# are defined outside this notebook (presumably in `functions`); their exact
# CQL and replication settings are not visible here -- TODO confirm.
drop_keyspace(session, KEYSPACE)
create_keyspace(session, KEYSPACE)

# Make KEYSPACE the session's current keyspace for the statements that follow.
session.set_keyspace(KEYSPACE)

### Prepare queries

In [None]:
# CQL that creates the benchmark table.
#
# `type` is the partition key (every row inserted by this notebook uses the
# same value, so all movies share one partition) and `revenue` is the first
# clustering column, which is what allows `ORDER BY revenue` in the read query.
# `id` is added as a second clustering column so that the primary key is
# unique per movie: with only (type, revenue), two movies with the same
# revenue (for example 0) share a primary key and the later insert silently
# overwrites the earlier one.
create_movies_table_query = """
                                CREATE TABLE IF NOT EXISTS benchmark.movies (
                                type text,
                                id int,
                                title text,
                                popularity float,
                                vote_average float,
                                runtime int,
                                budget varint,
                                revenue varint,
                                release_date timestamp,
                                PRIMARY KEY (type, revenue, id)
                                );
                            """

# Parameterised insert: nine `?` placeholders, one per column, in the same
# order as the column list. Intended to be prepared once and executed per row.
insert_movie_query = """
                        INSERT INTO benchmark.movies (type, id, title, popularity, vote_average, runtime, budget, revenue, release_date) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);
                     """

In [10]:
# Movies data processing start
#
# Benchmark cell: creates the movies table, then runs three rounds of
# "insert every movie, read back the ten highest-revenue movies". After each
# round the time elapsed since `start` is recorded, and the three measurements
# are plotted at the end.

print("Starting timing query...")
start = time.time()
# Create table
session.execute(create_movies_table_query)
# Prepare queries for insert data
prepared_insert_movie_query = session.prepare(insert_movie_query)
# Set consistency level
prepared_insert_movie_query.consistency_level = ConsistencyLevel.LOCAL_ONE
# Select top 10 movies with the highest revenue (same statement every round,
# so it is defined once outside the loop)
select_user_query = """
                            SELECT * FROM benchmark.movies WHERE type='movie' ORDER BY revenue DESC LIMIT 10;
                        """
# Insert data into table
# NOTE: `start` is taken once, before the table is created, so each entry in
# `timing` is cumulative (time since the beginning), not the round's own time.
timing = []
for _ in range(3):
    for movie in movies_table:
        session.execute(prepared_insert_movie_query, (
                                                     'movie',
                                                     int(movie['id']),
                                                     movie['title'],
                                                     float(movie['popularity']),
                                                     float(movie['vote_average']),
                                                     # runtime may look like "120.0": keep the integer part
                                                     int(get_safe_string(movie['runtime'].split('.')[0])),
                                                     int(movie['budget']),
                                                     int(movie['revenue']),
                                                     datetime.strptime(get_safe_date(movie['release_date']), '%Y-%m-%d')))
    # Output
    rows = session.execute(select_user_query)
    for row in rows:
        print("{title:50} {:,}".format(row.revenue, title=row.title))

    end = time.time()
    timing.append(end - start)

# Close the session only after all rounds have finished. Shutting it down
# inside the loop leaves rounds 2 and 3 executing against a closed session.
session.shutdown()

# NOTE(review): `plt` is not imported in any visible cell; presumably it is
# matplotlib.pyplot -- confirm it is imported before this cell runs.
plt.plot([1,2,3], timing, 'ro')
plt.axis([0, 3, 0, 5])
plt.show()

ModuleNotFoundError: No module named 'matplotlib'