In [28]:
import numpy as np
from collections import defaultdict
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

In [29]:
def generate_data_file(filename, n_samples=1000, n_features=5, noise=2, random_state=42):
    """Generate a synthetic regression dataset and write it to a text file.

    Every line of the file has the form ``y,x1,x2,...,xn``: the target value
    first, followed by the feature values, all comma-separated.

    Parameters
    ----------
    filename : str or path-like
        File to create (an existing file is overwritten).
    n_samples : int, default 1000
        Number of rows to generate.
    n_features : int, default 5
        Number of feature columns per row.
    noise : float, default 2
        Forwarded to ``make_regression`` as ``noise``.
    random_state : int or None, default 42
        Forwarded to ``make_regression`` as ``random_state`` so that the
        generated data is reproducible.

    Returns
    -------
    tuple
        ``(n_features, coef)`` where ``coef`` is the coefficient array
        returned by ``make_regression(coef=True)``.
    """
    X, y, coef = make_regression(n_samples=n_samples,
                                 n_features=n_features,
                                 noise=noise,
                                 coef=True,
                                 random_state=random_state)

    with open(filename, 'w') as f:
        # One sample per line: target first, then its features.
        for target, features in zip(y, X):
            f.write(f"{target}," + ",".join(map(str, features)) + "\n")

    print(f"коэф: {coef}")
    return n_features, coef

In [30]:
def map_function(line_content):
    """Map step: convert one ``y,x1,...,xn`` text line into partial sums.

    The feature vector is extended with a leading constant ``1.0`` (bias
    term), and the per-sample contributions to the normal equations are
    computed from it.

    Parameters
    ----------
    line_content : str
        One line of the data file: comma-separated floats, target first.

    Returns
    -------
    list
        ``[('summary', (xt_x_partial, xt_y_partial))]`` where
        ``xt_x_partial`` is the outer product of the augmented feature vector
        with itself and ``xt_y_partial`` is the augmented vector scaled by the
        target. A blank / whitespace-only line yields ``[]``.

    Raises
    ------
    ValueError
        If a non-blank line contains a token that cannot be parsed as float.
    """
    stripped = line_content.strip()
    if not stripped:
        # A blank line (e.g. a trailing newline at the end of the file)
        # carries no sample; emit nothing instead of failing on float('').
        return []

    # The first number is the target y, the remaining ones are the features x.
    parts = [float(token) for token in stripped.split(',')]
    y_sample = parts[0]
    x_sample = np.array(parts[1:])

    # Prepend the bias term to the feature vector.
    x_augmented = np.insert(x_sample, 0, 1.0)

    # (x * x.T): this sample's share of the final matrix (X.T * X).
    xt_x_partial = np.outer(x_augmented, x_augmented)

    # (x * y): this sample's share of the final vector (X.T * y).
    xt_y_partial = x_augmented * y_sample
    return [('summary', (xt_x_partial, xt_y_partial))]

In [31]:
def shuffle_function(mapped_data):
    """Shuffle step: collect the mapped ``(key, value)`` pairs by key.

    Parameters
    ----------
    mapped_data : iterable of 2-tuples
        ``(key, value)`` pairs produced by the map step.

    Returns
    -------
    collections.defaultdict
        Maps every key to the list of its values, in the order they were
        encountered. Being a ``defaultdict(list)``, looking up a missing key
        yields (and stores) an empty list.
    """
    buckets = defaultdict(list)
    for pair in mapped_data:
        bucket_key, payload = pair
        buckets[bucket_key].append(payload)
    return buckets

In [32]:
def reduce_function(key, list_of_values):
    """Reduce step: sum the partial (X.T * X) matrices and (X.T * y) vectors.

    Parameters
    ----------
    key : hashable
        Group key. It does not take part in the computation; it is only used
        in the error message.
    list_of_values : iterable of (xt_x_partial, xt_y_partial)
        Per-sample partial results for this key. Any iterable is accepted;
        it is materialized once.

    Returns
    -------
    tuple
        ``(total_xt_x, total_xt_y)`` — element-wise sums of all partial
        matrices and of all partial vectors. The inputs are not modified.

    Raises
    ------
    ValueError
        If ``list_of_values`` is empty, since there is nothing to derive the
        result shapes from.
    """
    partials = list(list_of_values)
    if not partials:
        raise ValueError(
            f"reduce_function received no values for key {key!r}; nothing to sum"
        )

    # Fresh zero accumulators shaped like the first partial, so that the
    # caller's arrays are never mutated by the in-place additions below.
    first_xt_x, first_xt_y = partials[0]
    total_xt_x = np.zeros_like(first_xt_x)
    total_xt_y = np.zeros_like(first_xt_y)

    for xt_x_part, xt_y_part in partials:
        total_xt_x += xt_x_part
        total_xt_y += xt_y_part

    return total_xt_x, total_xt_y

In [33]:
# Write the synthetic dataset to the file 'test' using the function's default arguments.
generate_data_file('test')

коэф: [28.62798621 46.07121713 16.82365791 24.74629812 18.99347437]


(5, array([28.62798621, 46.07121713, 16.82365791, 24.74629812, 18.99347437]))

In [34]:
def map_reduce_lr(filename):
    """Fit a linear regression on a text file with a map/shuffle/reduce pipeline.

    Every line of ``filename`` is fed to ``map_function``; the emitted pairs
    are grouped by ``shuffle_function`` and the ``'summary'`` group is summed
    by ``reduce_function`` into X'X and X'y. The normal equations are then
    solved for the coefficient vector.

    Parameters
    ----------
    filename : str or path-like
        Data file with one ``y,x1,...,xn`` sample per line.

    Returns
    -------
    numpy.ndarray
        ``theta`` solving ``(X'X) theta = X'y``. Because ``map_function``
        puts the bias term first, ``theta[0]`` is the intercept.
    """
    # Map: flatten the pairs emitted for each line into a single list.
    with open(filename, 'r') as source:
        mapped_pairs = [pair for row in source for pair in map_function(row)]

    # Shuffle + reduce: accumulate the normal-equation terms.
    by_key = shuffle_function(mapped_pairs)
    gram, moment = reduce_function('summary', by_key['summary'])

    # Solve the system (X'X) * theta = (X'y), i.e. theta = inv(X'X) * (X'y).
    try:
        return np.linalg.solve(gram, moment)
    except np.linalg.LinAlgError:
        # Singular matrix: fall back to the pseudo-inverse.
        return np.linalg.pinv(gram) @ moment

In [35]:
# Fit the regression on the file 'test' with the map/reduce pipeline; the returned theta is the cell's displayed value.
map_reduce_lr('test')

array([-0.12342932, 28.51388702, 46.04528284, 16.78130221, 24.703425  ,
       19.06856614])

In [36]:
# Cross-check: fit scikit-learn's LinearRegression on the same file and print
# [intercept, coefficients...] so it lines up with the theta from map_reduce_lr.
data = np.loadtxt('test', delimiter=',')
y, X = data[:, 0], data[:, 1:]  # column 0 is the target, the rest are features

model = LinearRegression().fit(X, y)

print(np.concatenate(([model.intercept_], model.coef_)))

[-0.12342932 28.51388702 46.04528284 16.78130221 24.703425   19.06856614]
