In [6]:
import numpy as np
import time
import random
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from memory_profiler import memory_usage

# Simulate data for indexing (a sorted array of keys)
def generate_data(num_keys=1000):
    return np.sort(np.random.randint(1, 100000, size=num_keys))

In [7]:
# Piecewise Linear Implementation
class PiecewiseLinearIndex:
    def __init__(self, num_segments=10):
        self.num_segments = num_segments
        self.models = []
        self.segment_boundaries = []

    def train(self, data):
        segment_size = len(data) // self.num_segments
        for i in range(self.num_segments):
            start_idx = i * segment_size
            end_idx = start_idx + segment_size if i < self.num_segments - 1 else len(data)

            segment_data = data[start_idx:end_idx]
            X = np.arange(len(segment_data)).reshape(-1, 1)
            y = segment_data

            model = LinearRegression().fit(X, y)
            self.models.append(model)
            self.segment_boundaries.append((start_idx, end_idx))

    def query(self, key):
        # Calculate segment size based on the segment boundaries
        segment_size = self.segment_boundaries[0][1] - self.segment_boundaries[0][0]

        # Determine the segment index based on the key
        segment_idx = min(key // segment_size, self.num_segments - 1)
        start_idx = self.segment_boundaries[segment_idx][0]

        # Calculate the relative position within the segment
        relative_pos = key - start_idx

        # Return the prediction for that segment using the linear model
        return self.models[segment_idx].predict([[relative_pos]])[0]

In [8]:
class HybridIndex:
    def __init__(self, num_segments=10, hidden_layer_sizes=(50,), max_iter=1000, learning_rate_init=0.0005):
        self.num_segments = num_segments
        self.models = []
        self.nn_model = MLPRegressor(
            hidden_layer_sizes=hidden_layer_sizes,
            max_iter=max_iter,
            learning_rate_init=learning_rate_init,
            solver='adam',
            early_stopping=True,
            random_state=42
        )

    def train(self, data):
        # Normalize the data
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(data.reshape(-1, 1))

        X = np.arange(len(data)).reshape(-1, 1)
        y = data
        self.nn_model.fit(X, y)

    def query(self, key):
        return self.nn_model.predict([[key]])[0]

In [12]:
def measure_performance(index, data, num_queries=1000):
    start_time = time.time()
    queries = [random.choice(data) for _ in range(num_queries)]
    results = [index.query(query) for query in queries]
    query_time = time.time() - start_time

    # Mean Squared Error as accuracy measure
    mse = mean_squared_error(queries, results)
    return query_time, mse

# Function to compare the two models
def compare_models(data):
    piecewise_index = PiecewiseLinearIndex(num_segments=10)
    hybrid_index = HybridIndex(num_segments=10)

    # Measure training time for PiecewiseLinearIndex
    start_time = time.time()
    piecewise_index.train(data)
    piecewise_training_time = time.time() - start_time

    # Measure training time for HybridIndex
    start_time = time.time()
    hybrid_index.train(data)
    hybrid_training_time = time.time() - start_time

    # Measure query time and accuracy for PiecewiseLinearIndex
    piecewise_query_time, piecewise_mse = measure_performance(piecewise_index, data)

    # Measure query time and accuracy for HybridIndex
    hybrid_query_time, hybrid_mse = measure_performance(hybrid_index, data)

    # Print results
    print(f"Piecewise Linear Index (BuzzDB-inspired):")
    print(f"  Training Time: {piecewise_training_time:.4f} seconds")
    print(f"  Query Time (for {len(data)} queries): {piecewise_query_time:.4f} seconds")
    print(f"  Mean Squared Error: {piecewise_mse:.4f}")

    print(f"\nHybrid Index (RMI + Neural Network):")
    print(f"  Training Time: {hybrid_training_time:.4f} seconds")
    print(f"  Query Time (for {len(data)} queries): {hybrid_query_time:.4f} seconds")
    print(f"  Mean Squared Error: {hybrid_mse:.4f}")

    # Memory usage comparison
    print("\nMemory Usage Comparison (in MB):")
    piecewise_mem = max(memory_usage((piecewise_index.train, (data,))))
    hybrid_mem = max(memory_usage((hybrid_index.train, (data,))))
    print(f"  Piecewise Linear Index: {piecewise_mem:.2f} MB")
    print(f"  Hybrid Index: {hybrid_mem:.2f} MB")

# Generate data
data = generate_data(num_keys=1000)

# Run the comparison
compare_models(data)

# Plotting for visual comparison
def plot_comparison():
    segments = [i for i in range(10, 101, 10)]
    piecewise_times = []
    hybrid_times = []

    for seg in segments:
        piecewise_index = PiecewiseLinearIndex(num_segments=seg)
        hybrid_index = HybridIndex(num_segments=seg)

        piecewise_index.train(data)
        hybrid_index.train(data)

        piecewise_query_time, _ = measure_performance(piecewise_index, data)
        hybrid_query_time, _ = measure_performance(hybrid_index, data)

        piecewise_times.append(piecewise_query_time)
        hybrid_times.append(hybrid_query_time)

    plt.plot(segments, piecewise_times, label='Piecewise Linear Index', marker='o')
    plt.plot(segments, hybrid_times, label='Hybrid Index', marker='o')
    plt.xlabel('Number of Segments')
    plt.ylabel('Query Time (seconds)')
    plt.title('Query Time Comparison: Piecewise vs Hybrid Index')
    plt.legend()
    plt.show()




Piecewise Linear Index (BuzzDB-inspired):
  Training Time: 0.0103 seconds
  Query Time (for 1000 queries): 0.1010 seconds
  Mean Squared Error: 26978451499519.3477

Hybrid Index (RMI + Neural Network):
  Training Time: 3.4041 seconds
  Query Time (for 1000 queries): 0.0798 seconds
  Mean Squared Error: 31821581341736.8945

Memory Usage Comparison (in MB):
  Piecewise Linear Index: 226.59 MB
  Hybrid Index: 226.59 MB
