In [8]:
from os import listdir, path
from os.path import isfile, join
import re
import csv
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit


def read_particles_file(file_path):
    """Yield ``(threads_number, average)`` pairs from one CSV results file.

    The file needs a header row with at least the columns
    ``threads_number`` and ``average``; both values are parsed with
    ``float``. Rows are produced lazily, in file order, and the file stays
    open while the generator is being consumed.

    Args:
        file_path: Path of the CSV file to read.

    Yields:
        tuple: ``(threads_number, average)`` as floats, one per data row.

    Raises:
        KeyError: If one of the two columns is missing from the header.
        ValueError: If a cell cannot be converted to ``float``.
    """
    # The csv module documentation asks for newline='' so that the reader,
    # not the text layer, decides how line endings are interpreted.
    with open(file_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            yield float(row['threads_number']), float(row['average'])


def read_data(particles_file_pattern, dir_path='stats/sieldhit/run_regression'):
    """Collect measurements from every matching file directly inside ``dir_path``.

    Each regular file whose base name matches ``particles_file_pattern``
    (via ``re.match``, i.e. anchored at the start of the name) is read with
    ``read_particles_file``. The particle count for all rows of a file is
    taken from the pattern's first capture group.

    Args:
        particles_file_pattern: Regular expression applied to each file
            name; group 1 must capture the particle count as digits.
        dir_path: Directory to scan (not recursive).

    Returns:
        tuple: Three parallel lists ``(x_threads, x_particles, y_average)``
        with one entry per CSV row across all matching files.
    """
    x_threads = []
    x_particles = []
    y_average = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # returned lists reproducible from run to run and machine to machine.
    for file_name in sorted(listdir(dir_path)):
        file_path = join(dir_path, file_name)
        if not isfile(file_path):
            continue
        match = re.match(particles_file_pattern, file_name)
        if match is None:
            continue
        particles = int(match.group(1))
        for threads_number, average in read_particles_file(file_path):
            x_threads.append(threads_number)
            x_particles.append(particles)
            y_average.append(average)
    return x_threads, x_particles, y_average


In [9]:
# Training set: file names beginning with "particles_<digits>"; the captured
# digits are used by read_data as the particle count.
particles_file_pattern = r'particles_(\d+)'

x_threads, x_particles, y_average = read_data(
    particles_file_pattern=particles_file_pattern,
)

In [10]:
def func(X, a, b):
    """Model evaluated by the fit: ``a * (1 / threads) + b * particles``.

    Args:
        X: A pair ``(threads, particles)``; unpacked into exactly two items.
        a: Coefficient of the ``1 / threads`` term.
        b: Coefficient of the ``particles`` term.

    Returns:
        The model value for the given inputs and coefficients.
    """
    n_threads, n_particles = X
    thread_term = a * (1 / n_threads)
    particle_term = b * n_particles
    return thread_term + particle_term

In [11]:
# Fit the two coefficients of func (a, b) to the measured averages.
popt, pcov = curve_fit(
    func,
    xdata=(x_threads, x_particles),
    ydata=y_average,
)
popt

array([1.48334444e+04, 3.04778764e-05])

In [12]:
def estimate(threads, particles):
    """Evaluate the fitted model for a thread count and a particle count.

    Delegates to ``func`` so the model formula lives in one place; the
    coefficients come from the module-level ``popt`` produced by the
    ``curve_fit`` cell, which therefore has to run before this is called.

    Args:
        threads: Number of threads (must be non-zero, it is used as a divisor).
        particles: Number of particles.

    Returns:
        ``popt[0] * (1 / threads) + popt[1] * particles``.
    """
    fitted_a, fitted_b = popt[0], popt[1]
    return func((threads, particles), fitted_a, fitted_b)


In [13]:
# Control set: file names beginning with "control_particles_<digits>"; the
# captured digits are used by read_data as the particle count.
particles_file_pattern_test = r'control_particles_(\d+)'

x_threads_test, x_particles_test, y_average_test = read_data(
    particles_file_pattern=particles_file_pattern_test,
)

In [14]:
# Model prediction for every control measurement, using the fitted coefficients.
estimated_result = [
    func((threads, particles), popt[0], popt[1])
    for threads, particles in zip(x_threads_test, x_particles_test)
]

plt.plot(x_threads_test, y_average_test, '+', label='data')
# A different marker keeps the estimate visible where it lands on a data point.
plt.plot(x_threads_test, estimated_result, 'x', label='estimated')

# Axis names follow the CSV column names the values were read from.
plt.xlabel('threads_number')
plt.ylabel('average')
plt.title('Control data vs. fitted model')
plt.legend()
plt.show()
