## Imports

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Final Project").getOrCreate()
print(spark)

import random
import concurrent.futures
import time
from pyspark.sql.functions import col, when
from pyspark.sql.functions import min as spark_min, max as spark_max

<pyspark.sql.session.SparkSession object at 0x7f8fe0930780>


In [1]:
!python --version
!pyspark --version

Python 2.7.17


## Data loading

In [2]:
df = spark.read.csv("hdfs://localhost:9000/user/input/data.csv", header=True, inferSchema=True)
rdd = df.rdd
df = df.drop('Timestamp')
df.show(3)

+--------------+----------+------+
|        UserId| ProductId|Rating|
+--------------+----------+------+
|A39HTATAQ9V7YF|0205616461|   5.0|
|A3JM6GV9MNOF9X|0558925278|   3.0|
|A1Z513UWSAAO0F|0558925278|   5.0|
+--------------+----------+------+
only showing top 3 rows



## Project needs to read dataset categorize it. 

### Average Rating per Product

In [12]:
# Return all Product Ratings for each User, you have three columns (UserId, ProductId, Rating)
df.groupBy("ProductId").avg("Rating").withColumnRenamed("avg(Rating)", "AverageRating").show(3)

+----------+-------------+
| ProductId|AverageRating|
+----------+-------------+
|9790773587|          5.0|
|9790794207|          5.0|
|B00004VBMM|          5.0|
+----------+-------------+
only showing top 3 rows



In [7]:
def get_avg_rating(df):
    return df.groupBy("ProductId").avg("Rating").withColumnRenamed("avg(Rating)", "AverageRating")

### Average Rating per User

In [13]:
df.groupBy("UserId").avg("Rating").withColumnRenamed("avg(Rating)", "AverageRating").show(3)

+--------------+-------------+
|        UserId|AverageRating|
+--------------+-------------+
|A2HNQ3JHXDSVMW|          3.0|
|A2DOQ89OLXNHNL|          5.0|
|A17U6P3YQISHYH|          4.8|
+--------------+-------------+
only showing top 3 rows



In [8]:
def get_avg_rating(df):
    return df.groupBy("UserId").avg("Rating").withColumnRenamed("avg(Rating)", "AverageRating")

## It should be possible to find lowest and highest scores in dataset

In [14]:
min_rating = df.select(spark_min("Rating")).collect()[0][0]
max_rating = df.select(spark_max("Rating")).collect()[0][0]
print("Min rating: ", min_rating)
print("Max rating: ", max_rating)

Min rating:  1.0
Max rating:  5.0


In [3]:
def get_min_rating(df):
    return df.select(spark_min("Rating")).collect()[0][0]

In [4]:
def get_max_rating(df):
    return df.select(spark_max("Rating")).collect()[0][0]

## Add new data

In [31]:
def add_data(df, new_data, columns, spark=SparkSession.builder.appName("Product Rating Analysis").getOrCreate()):
    new_df = spark.createDataFrame(new_data, schema=columns)
    return df.union(new_df)

In [37]:
input = ['A2HNQ3JHXDSVMW', 'B0000C321X', 5]
user_id, product_id, new_rating = input
df = add_data(df, [(user_id, product_id, new_rating)], df.columns)
df.filter((col("UserId") == user_id) & (col("ProductId") == product_id)).show()

+--------------+----------+------+
|        UserId| ProductId|Rating|
+--------------+----------+------+
|A2HNQ3JHXDSVMW|B0000C321X|   5.0|
+--------------+----------+------+



## Update existing one

In [46]:
def update_data(df, user_id, product_id, new_rating):
    condition = (col("UserId") == user_id) & (col("ProductId") == product_id)
    if df.filter(condition).count() > 0:
        df = df.withColumn(
            'Rating',
            when(condition, new_rating).otherwise(col("Rating"))
        )
        print(f"Updated UserId: {user_id}, ProductId: {product_id} with new Rating: {new_rating}")
        df.filter(condition).show()
        return df
    else:
        print(f"No entry found for UserId: {user_id}, ProductId: {product_id}")

    return df

In [50]:
df = update_data(df, 'A2HNQ3JHXDSVMW' , 'B0000C321X', 4)

Updated UserId: A2HNQ3JHXDSVMW, ProductId: B0000C321X with new Rating: 4
+--------------+----------+------+
|        UserId| ProductId|Rating|
+--------------+----------+------+
|A2HNQ3JHXDSVMW|B0000C321X|   4.0|
+--------------+----------+------+



## Stress Test 1: The client makes the same request very quickly min (10000 times).

In [53]:
def task_1():
    for _ in range(10000):
        get_avg_rating(df)

In [55]:
print("Running Stress Test 1: Single client making the same request very quickly")
start_time = time.time()
task_1()
print(f"Stress Test 1 completed in {time.time() - start_time:.2f} seconds")

Running Stress Test 1: Single client making the same request very quickly
Stress Test 1 completed in 44.23 seconds


## Stress Test 2: Two or more clients make the possible requests randomly (10000 times).

In [6]:
counter = 0
def task_2(no_requests):
    global counter
    requests = [random.randint(1, 100) for _ in range(no_requests)]
    for req in requests:
        if req % 2 == 0:
            counter += 1
            print(f"Counter: {counter}, Client: {no_requests}")
            get_max_rating(df)
        else:
            counter += 1
            print(f"Counter: {counter}, Client: {no_requests}")
            get_min_rating(df)

In [7]:
print("Running Stress Test 2: Multiple clients making requests randomly")
start_time = time.time()

client1 = random.randint(1, 10000)
client2 = random.randint(1, 10000 - client1)
client3 = 10000 - client1 - client2

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    futures = [executor.submit(task_2, client1), executor.submit(task_2, client2), executor.submit(task_2, client3)]
    try:
        concurrent.futures.wait(futures)
    except KeyboardInterrupt:
        executor.shutdown(wait=False) 
        print("Tasks interrupted and executor shut down.")

print(f"Stress Test 2 completed in {time.time() - start_time:.2f} seconds")

Running Stress Test 2: Multiple clients making requests randomly


## Stress Test 3: System has to make some processing base on all data it has and handle at least (1000 in short period of time)

In [60]:
def remove_data(df, user_id, product_id):
    condition = (col("UserId") == user_id) & (col("ProductId") == product_id)
    if df.filter(condition).count() > 0:
        df = df.filter(~condition)
        # print(f"Removed UserId: {user_id}, ProductId: {product_id}")
        return df
    else:
        print(f"No entry found for UserId: {user_id}, ProductId: {product_id}")

    return df

In [59]:
def task_3_add():
    for i in range(1000):
        add_data(df, [(i, f"Product_{i}", random.randint(1, 5))], ["UserId", "ProductId", "Rating"])
        
def task_3_remove():
    for i in range(1000):
        remove_data(df, [(i, f"Product_{i}", random.randint(1, 5))], ["UserId", "ProductId", "Rating"])

In [62]:
print("Running Stress Test 3: System processing a large load of data quickly")
start_time = time.time()
task_3_add()
print(f"Stress Test 3 completed in {time.time() - start_time:.2f} seconds")

Running Stress Test 3: System processing a large load of data quickly
Stress Test 3 completed in 9.81 seconds


In [63]:
print("Running Stress Test 3: System processing a large load of data quickly")
start_time = time.time()
task_3_remove()
print(f"Stress Test 3 completed in {time.time() - start_time:.2f} seconds")

Running Stress Test 3: System processing a large load of data quickly
Stress Test 3 completed in 9.45 seconds
