In [None]:
import random
import csv
import time

'''
Generating CSV file:
- Create possible products, sale dates, and prices
- Write into CSV file given the dataset size as input
'''

# Create class to generate datasets
class Sale:
  """Generate random sale records and write them to 'salesrecords.csv'.

  Every record is a list ``[sale_id, date, amount, product]`` where the date
  is a 'YYYY-MM-DD' string, the amount is a string with two decimals, and the
  product is one of the names in ``_PRODUCT_NAMES``.
  """

  # Number of days in each month, indexed by ``month - 1``. February is kept at
  # 28 days, so 29 February is never generated (it never was).
  _DAYS_IN_MONTH = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)

  # Products being sold
  _PRODUCT_NAMES = ['Widget', 'Gadget', 'Thingamajig', 'Doohickey']

  def __init__(self, datasize):
    """Store the number of records to generate."""
    self.datasize = datasize

  def createDate(self):
    """Return a random, valid calendar date between 2000 and 2025 as 'YYYY-MM-DD'.

    The day is drawn from the real length of the chosen month. The previous
    even/odd month rule capped August, October and December at 30 days and
    allowed the non-existent 31 September / 31 November.
    """
    year = random.randint(2000, 2025)
    month = random.randint(1, 12)
    day = random.randint(1, self._DAYS_IN_MONTH[month - 1])
    # Zero-pad month and day so the strings sort chronologically.
    return f"{year}-{month:02d}-{day:02d}"

  def createPrice(self):
    """Return a random sale amount as a string such as '125.05' or '300.99'.

    The whole part is a multiple of 5 from 50 to 300. The cents are chosen
    between a random multiple of 5 (0 to 90) and 99.
    """
    integer = random.randrange(50, 305, 5)
    decimal = random.choice([random.randrange(0, 95, 5), 99])
    return f"{integer}.{decimal:02d}"

  def productName(self):
    """Return one randomly chosen product name."""
    return random.choice(self._PRODUCT_NAMES)

  def compiledSales(self, datasize):
    """Build ``self.datasize`` random sales, most recent first, each prefixed with its ID.

    ``datasize`` is accepted so existing callers keep working, but the number
    of records is taken from ``self.datasize`` (as before).

    Returns a list of ``[sale_id, date, amount, product]`` lists where
    ``sale_id`` runs from 0 (the most recent sale) to ``self.datasize - 1``.
    """
    all_sales = [
      [self.createDate(), self.createPrice(), self.productName()]
      for _ in range(self.datasize)
    ]
    # Descending lexicographic sort: ISO dates put the most recent sale first;
    # ties on the date fall back to the amount and product strings.
    all_sales.sort(reverse=True)
    for sale_id, sale in enumerate(all_sales):
      sale.insert(0, sale_id) # Add sale ID to the first position (0) in each sale
    return all_sales

  def createCSV(self, datasize):
    """Generate the sales and write them, one per row, to 'salesrecords.csv'.

    Any existing file with that name is overwritten. No header row is written.
    """
    content = self.compiledSales(datasize)
    with open('salesrecords.csv', 'w', newline='') as csvfile:
      csv.writer(csvfile).writerows(content)

  def readCSV(self, datasize):
    """Regenerate 'salesrecords.csv' and print every row (debugging helper)."""
    self.createCSV(datasize)
    with open('salesrecords.csv', newline='') as csvfile:
      for row in csv.reader(csvfile):
        print(', '.join(row))


In [None]:
'''
Program to process sales records:
- Includes options for:
  - Load in sales data (read from CSV):
    - Big O time complexity (theoretical) = O(n^n)
  - Retrieve the latest sale:
    - Big O time complexity (theoretical) = O(1)
  - Compute total revenue:
    - Big O time complexity (theoretical) = O(n)
  - Check for duplicate sale IDs
    - Big O time complexity (theoretical) = O(n^2)
  - Search for a sale by its ID
    - Big O time complexity (theoretical) = O(n^2)
'''

class SaleProcessing:
  """Timed operations over the sales records stored in 'salesrecords.csv'.

  Each row of the file is expected to be ``sale_id, date, amount, product``
  with no header row. Every public method prints its result followed by the
  time it took. All of them read the file sequentially, so each is at worst
  linear in the number of rows; ``last`` only reads the first line.
  """

  CSV_PATH = 'salesrecords.csv'

  def __init__(self, datasize):
    """Store the dataset size the records file was generated with."""
    self.datasize = datasize

  @staticmethod
  def _report_elapsed(start):
    """Print the time elapsed since ``start`` (a ``time.perf_counter()`` reading)."""
    # perf_counter is monotonic and high resolution, unlike the wall clock.
    duration = time.perf_counter() - start
    print("Time elapsed: ", duration)

  def _prompt_sale_id(self, datasize):
    """Keep asking until the user enters an integer in ``0 .. datasize - 1``; return it."""
    while True:
      reply = input("Enter a sale ID to search for a sale: ")
      # NOTE: this used to be eval(input(...)). eval runs arbitrary code typed
      # by the user and crashes on non-numeric text, so the reply is now
      # parsed strictly as an integer.
      try:
        sale_ID = int(reply)
      except ValueError:
        print("Please enter an integer value.")
        continue
      if sale_ID >= datasize:
        print("Please enter a value within the sale ID bounds.")
      elif sale_ID < 0:
        # IDs in the file start at 0, so 0 itself is a valid ID.
        print("Please enter a non-negative sale ID.")
      else:
        return sale_ID

  def load(self):
    """Print every row of the records file -- O(n) in the number of rows."""
    start = time.perf_counter() # Start timer

    with open(self.CSV_PATH, newline='') as csvfile:
      for row in csv.reader(csvfile): # one pass over the file
        print(', '.join(row)) # constant work per row (fixed number of fields)

    self._report_elapsed(start)

  def last(self):
    """Print the first line of the file, which holds the latest sale -- O(1).

    The file is written most-recent-first, so only one line has to be read.
    The line is printed as stored (comma separated, not re-joined).
    """
    start = time.perf_counter() # Start timer

    with open(self.CSV_PATH, newline='') as csvfile:
      # Drop the line terminator so print() does not emit an extra blank line.
      last_sale = csvfile.readline().rstrip('\r\n')
    print(last_sale)

    self._report_elapsed(start)

  def revenue(self):
    """Print the sum of the amount column (third field) -- O(n)."""
    start = time.perf_counter() # Start timer

    with open(self.CSV_PATH, newline='') as csvfile:
      # Blank lines come back from csv.reader as empty rows; skip them.
      total = sum(float(row[2]) for row in csv.reader(csvfile) if row)
    print(total)

    self._report_elapsed(start)

  def duplicate(self):
    """Print whether any sale ID (first field) appears more than once -- O(n).

    Reading the IDs is one pass over the file and building the set is linear
    on average, so no nested comparison of IDs is needed.
    """
    start = time.perf_counter() # Start timer

    with open(self.CSV_PATH, newline='') as csvfile:
      sale_IDs = [row[0] for row in csv.reader(csvfile) if row]
    if len(set(sale_IDs)) != len(sale_IDs):
      result = "Duplicate sale IDs present."
    else:
      result = "Duplicate sale IDs not found."
    print(result)

    self._report_elapsed(start)

  def search(self, datasize):
    """Ask for a sale ID and print every row whose ID equals it -- O(n).

    ``datasize`` is the exclusive upper bound for valid IDs. Only the file
    scan is timed, not the wait for user input. Nothing is printed for an ID
    that is not in the file.
    """
    wanted = str(self._prompt_sale_id(datasize))

    start = time.perf_counter() # Start timer

    with open(self.CSV_PATH, newline='') as csvfile:
      for row in csv.reader(csvfile):
        # Exact comparison: a substring test would make ID 1 match 10, 11, 21, ...
        if row and row[0] == wanted:
          print(', '.join(row))

    self._report_elapsed(start)


In [None]:
# User input for dataset size
def _prompt_int(prompt):
  """Show ``prompt`` and return the reply as an int, or None if it is not a whole number."""
  # NOTE: this replaces eval(input(...)). eval executes whatever expression the
  # user types and raises on plain text; it also let replies such as "3.0"
  # through the integer check, which later broke range() in the generator.
  try:
    return int(input(prompt))
  except ValueError:
    return None


while True: # Error handling
  dataset_size = _prompt_int("Enter dataset size: ")
  if dataset_size is None:
    print("Please enter an integer value.")
  elif dataset_size <= 0:
    print("Please enter a non-negative and non-zero number.")
  else:
    break

# Create dataset
setup_1 = Sale(dataset_size)
setup_1.createCSV(dataset_size)
n = SaleProcessing(dataset_size)

# User interaction with program
options = [0, 1, 2, 3, 4, 5]

# Menu number -> operation to run; 0 (exit) is handled separately in the loop.
operations = {
  1: n.load,
  2: n.last,
  3: n.revenue,
  4: n.duplicate,
  5: lambda: n.search(dataset_size),
}

while True:
  response = _prompt_int("Choose an operation (number): 0. Exit Program 1. Load data, 2. Last Sale, 3. Total Revenue, 4. Check Duplicates, 5. Search by Sale ID")
  if response not in options: # also covers replies that are not integers (None)
    print("Please enter a valid option.")
  elif response == 0:
    print("Exited Program")
    break
  else:
    operations[response]()

# References
- https://docs.python.org/3/library/csv.html

# Graphs
https://docs.google.com/spreadsheets/d/1rE2t9H8Z6v8zWVc2nteCj0TGukz7iMJQs5QNr6UWRg0/edit?gid=0#gid=0

# Performance Trends:

1.
- How does each operation's execution time change as the dataset grows?
  
Searching for a sale by its sale ID and loading the data both showed linear increases in elapsed time relative to the dataset size. Computing total revenue and checking for duplicates also appeared to increase linearly, but their elapsed time grew at a faster rate beyond a dataset size of 10,000. Retrieving the latest sale showed a logarithmic, O(log n), time complexity.

- Do the results align with the theoretical Big O expectations?

The results aligned with my theoretical Big O expectation only for the total revenue operation, and not for any of the other operations. This might be due to an error on my part in determining the theoretical values for each operation.

2.
- Real-World Implications: Which steps might become bottlenecks in a production system processing millions of records?

The Search operation would be ideal with larger datasets, since the time elapsed plateaus and slightly decreases as the dataset size increases. However, the other four operations would be more of an issue, since their efficiencies worsen linearly.

- How would you optimize or replace the inefficient (quadratic) approach?

I would avoid nested loops, use break to prevent loops from continuing unnecessarily, and add cases that let loops exit early without running the full number of iterations.

3.

- Practical Adjustments: How might you put together a testing plan for this project?

I would adjust my CSV file generation program to introduce potential errors so that I could see where the processing program fails. Currently, there are no errors (as far as I know) in the program, so I haven't tested every potential issue. For example, I could add multiple duplicates to the database so that the operation for checking duplicates can be adjusted if necessary.

- What additional error handling or data validation would be necessary?

From the previous question, I think a large issue in this current program is the duplicate checking operation, which has no way to check for multiple duplicates. Also, identifying the duplicates would be a useful practical adjustment.

