In [2]:
import numpy as np
import pandas as pd
import tqdm as tqdm


In [3]:
def transformMatrix(A):
    return np.vstack([np.outer(r, r).flatten() for r in A])



# cache matrix-nullSpaceSolution to avoid recomputation
matrix_nullSpaceSolution = {}

def findNullSpace(A):
    # flatten tupple A
    B=  A.flatten()
    if tuple(B) in matrix_nullSpaceSolution:
        return matrix_nullSpaceSolution[tuple(B)]
    # Compute the SVD of A
    U, S, Vt = np.linalg.svd(A)

    # Find the null space of A
    null_space = Vt.T[:, S.size:]

    # Choose a non-trivial vector from the null space
    non_trivial_solution = null_space[:, 0]
    # Cache the result
    matrix_nullSpaceSolution[tuple(B)] = non_trivial_solution
    return non_trivial_solution
    # return Vt[-1]




def caratheodory(P, w):
    assert len(P) == len(w)
    n, d = P.shape
    if n <= d -5 :
        return P, w
    weigtedSumOfPoints = np.dot(w, P)
    # Compute the matrix M (each column is pi - p1)
    M = (P[1:] - P[0]).T  # shape (d, n-1)
    v= findNullSpace(M)   
    v1 = -np.sum(v)
    v = np.insert(v, 0, v1)

    alpha = np.inf
    for i in range(n):
        if v[i] > 0:
            alpha = min(alpha, w[i] / v[i])
    u = w - alpha * v
    assert np.all(u >= -0.0000001)
    S = P[u > 0]
    u = u[u > 0]
    weightedSumOfPoints2 = np.dot(u, S)
    assert np.allclose(weigtedSumOfPoints, weightedSumOfPoints2)
    if len(S) > d + 1:
        return caratheodory(S, u)  # Recursive call if needed
    return S, u


# write the function for k-streaming algorithm based caraheodory algorithm (take d+2 points and keep reducing the points)
import numpy as np

def streaming_caratheodory(P, w, d):

    n = len(P)
    selected_points = P[:d+1]
    selected_weights = w[:d+1]
    
    for i in range(d+1, n):

        selected_points = np.vstack((selected_points, P[i]))
        selected_weights = np.append(selected_weights, w[i])
        
        # Apply Caratheodory's theorem to reduce the set to d+1 points
        selected_points, selected_weights = caratheodory(np.array(selected_points), np.array(selected_weights))
    
    return selected_points, selected_weights

In [5]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
concrete_compressive_strength = fetch_ucirepo(id=165) 
  
# data (as pandas dataframes) 
X = concrete_compressive_strength.data.features 
y = concrete_compressive_strength.data.targets 
  
# metadata 
print(concrete_compressive_strength.metadata) 
  
# variable information 
print(concrete_compressive_strength.variables) 


ConnectionError: Error connecting to server

In [None]:
X

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360
...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28


In [None]:
y

Unnamed: 0,Concrete compressive strength
0,79.99
1,61.89
2,40.27
3,41.05
4,44.30
...,...
1025,44.28
1026,31.18
1027,23.70
1028,32.77


In [None]:
#  concatinate copies of X and y to itself 10 times
X=pd.DataFrame(X)
y=pd.DataFrame(y)
X = pd.concat([X]*100, ignore_index=True)
y = pd.concat([y]*100, ignore_index=True)
print(len(X))

103000


In [None]:
#  train a regularized linear regression model
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# measure the training time
#  do this 10 times  and take the average time
avg_time1 = 0
for i in range(100):

    start = time.time()

    model = Ridge(alpha=1.0)
    model.fit(X, y)
    stop=time.time()

    y_pred = model.predict(X_test)
    print(mean_squared_error(y_test, y_pred))
    #print accuracy
    print(model.score(X_test, y_test))

    print("Training time: ", stop-start)
    avg_time1 += stop-start
avg_time1 = avg_time1/100
print("Average training time: ", avg_time1)

110.68448819714968
0.6143743166828779
Training time:  0.0004444122314453125
110.68448819714968
0.6143743166828779
Training time:  0.00655055046081543
110.68448819714968
0.6143743166828779
Training time:  0.0
110.68448819714968
0.6143743166828779
Training time:  0.0
110.68448819714968
0.6143743166828779
Training time:  0.0035974979400634766
110.68448819714968
0.6143743166828779
Training time:  0.0
110.68448819714968
0.6143743166828779
Training time:  0.0
110.68448819714968
0.6143743166828779
Training time:  0.008016347885131836
110.68448819714968
0.6143743166828779
Training time:  0.00578761100769043
110.68448819714968
0.6143743166828779
Training time:  0.0
110.68448819714968
0.6143743166828779
Training time:  0.0035104751586914062
110.68448819714968
0.6143743166828779
Training time:  0.0034437179565429688
110.68448819714968
0.6143743166828779
Training time:  0.007347822189331055
110.68448819714968
0.6143743166828779
Training time:  0.0
110.68448819714968
0.6143743166828779
Training tim

In [None]:
# make a matrix by combining X and y
Xy = np.hstack((X, y))

# use transformation and then calculate the caratheodory set
# measure time
start = time.time()

Xy_transformed = transformMatrix(Xy)
caratheodory_set, weights = streaming_caratheodory(Xy_transformed, np.ones(len(Xy_transformed)), 8)

stop = time.time()

print("Caratheodory set size: ", len(caratheodory_set))
print("Time taken: ", stop-start)



Caratheodory set size:  82
Time taken:  91.59673881530762


In [None]:
print(len(caratheodory_set))

82


In [None]:
#  check which rows of Xy_transformed are in the caratheodory set by finding the index of the rows in the caratheodory set
index = []
for i in range(len(Xy_transformed)):
    if np.all(np.isin(Xy_transformed[i], caratheodory_set)):
        index.append(i)
#  use the index to get the corresponding rows of Xy
new_Xy = Xy[index]

In [None]:
# remove duplicates
new_Xy = np.unique(new_Xy, axis=0)

In [None]:
print("Number of rows in the caratheodory set: ", len(new_Xy))
print("Number of rows in the caratheodory set: ", len(Xy))

Number of rows in the caratheodory set:  67
Number of rows in the caratheodory set:  8400


In [None]:
# make a matrix by combining X and y
Xy = np.hstack((X, y))

# use transformation and then calculate the caratheodory set
# measure time
start = time.time()

Xy_transformed = transformMatrix(Xy)
caratheodory_set, weights = streaming_caratheodory(Xy_transformed, np.ones(len(Xy_transformed)), 8)

stop = time.time()

print("Caratheodory set size: ", len(caratheodory_set))
print("Time taken: ", stop-start)



ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

In [None]:
#  train a regularized linear regression model
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# measure the training time
#  do this 10 times  and take the average time
print(len(X))
avg_time1 = 0
for i in range(100):

    start = time.time()

    model = Ridge(alpha=1.0)
    model.fit(X, y)
    stop=time.time()

    y_pred = model.predict(X_test)
    # print(mean_squared_error(y_test, y_pred))
    #print accuracy
    # print(model.score(X_test, y_test))

    # print("Training time: ", stop-start)
    avg_time1 += stop-start
avg_time1 = avg_time1/100
print("Average training time: ", avg_time1)



#  train a regularized linear regression model
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# train on whole data

X = new_Xy[:, :-1]
y = new_Xy[:, -1]
print(len(X))
avg_time = 0
for i in range(100):

    start = time.time()

    model = Ridge(alpha=1.0)
    model.fit(X, y)
    stop=time.time()

    y_pred = model.predict(X_test)
    # print(mean_squared_error(y_test, y_pred))
    #print accuracy
    # print(model.score(X_test, y_test))

    # print("Training time: ", stop-start)
    avg_time += stop-start
avg_time = avg_time/100

print("Average training time: ", avg_time)
# print the ratio of average times
print("Ratio of average times: ", avg_time1/(avg_time))


103000
Average training time:  0.013695008754730224
67




Average training time:  0.0015790247917175293
Ratio of average times:  8.673080262301617


