Evaluation Point 6: Adaptive linear regression

In [11]:
import numpy as np
import numpy.linalg as ln

class StreamFeatWeight:
    """
    Streaming update of feature weights at time t
    Input  : Yt: m by nt matrix, 
             B : sketch matrix size m by l
    Output : Feature importance score
    """
   
    def __init__(self, m, k, l=0):
        """
        m : no of features initially
        k : no of singular vectors (this can be the same as the number of clusters in the dataset)
        l : sketch size for a sketched matrix B( m-by-l )
        """

        self.m = m
        self.k = k
        if l < 1: self.l = int(np.sqrt(self.m))
        else: self.l = l

    def low_rank_approximation(self, Yt):
        """
        Calculation of low rank approximation
        sketched matrix B is updated on basis of new inputs at timestep t
        
        :param Yt: m-by-nt input matrix from data stream 
        Yt is the data items introduced at time step t
        
        output: weight of each feature
        """
        #Step 1
        # combine current sketched matrix with input at time t(Yt)
       
        if hasattr(self, 'B'): #(object, name)
            C = np.hstack((self.B, Yt)) # C is m by (n+l) matrix
            n = Yt.shape[1] # it will be n_t
        else:
            # an initial sketch matrix for Y0
            self.B = Yt[:, :self.l]
            C = np.hstack((self.B, Yt[:, self.l:]))
            n = Yt.shape[1] - self.l
            
            print(C)
        
        # Step 2 :Singular value decomposition
        U, s, V = ln.svd(C, full_matrices=False)  #editied at 6 pm, 30 july, uncomment to see it work
        #U, s, V = ln.svd(C)
        U = U[:, :self.l]
        s = s[:self.l]
        V = V[:, :self.l]

        # Step 3 : shrink singular values in Frequent Directions algorithm
       
        delta = s[-1] ** 2 #shrink values on the basis of squared smallest singlar value
        s = np.sqrt(s ** 2 - delta)

        # Step 4 : update sketched matrix B
    
        self.B = np.dot(U, np.diag(s))

        # In Section 5.1, for all experiments,
        # the authors have set alpha = 2^3 * sigma_k based on the pre-experiment
        alpha = (2 ** 3) * s[self.k-1]

        # Step 5: solving the ridge regression by using the top-k singular values
       
        D = np.diag(s[:self.k] / (s[:self.k] ** 2 + alpha))
        #step 6: X: m by k matrix (k <= l)
        X = np.dot(U[:, :self.k], D)
        
        #step 7: returning maximum value of X
        return np.amax(abs(X), axis=1)


In [12]:

#code to access all files in a folder given the location of folder.
#and concatenating all features(from all files) into one result dataframe
import pandas as pd
import numpy as np
import os
#rootdir = 'C:/Users/sid/Desktop/test' 

class central():
    def __init__(self, rootdir):
        self.rootdir = rootdir

#rootdir = 'Data_isha'
    def all_files(self):
        
        #rootdir = rootdir
        file_array = []
        for subdir, dirs, files in os.walk(self.rootdir):
            for file in files:
       
                file_array.append(os.path.join(subdir, file))

        for files in file_array:
            param_list = ofs(files)      #online_reg1 for train-test split and validation;online_reg for implementing online linear regression
        return param_list        


In [16]:
v = central('Data_isha')
filearray = v.all_files()

[[-0.2748571  -0.2748571  -0.25735982 ..., -0.65979732 -0.60730548
  -0.60730548]
 [-0.23639744 -0.31177241 -0.31177241 ..., -0.31177241 -0.31177241
  -0.31177241]
 [ 1.51809222  0.27081909 -0.03620199 ...,  1.21107114  0.57784017
   1.21107114]
 ..., 
 [ 0.9689295   0.9689295   0.9689295  ..., -1.11646309 -1.11646309
  -1.11646309]
 [ 0.88741587  0.51098582 -0.05365926 ...,  1.37677495  1.74065733
   1.3140366 ]
 [ 1.88847028  1.57418338  0.31703579 ..., -0.9401118  -1.2543987
  -0.9401118 ]]
(31,)
(30984, 6)
[[  5.54852856e-01   5.54852856e-01   5.40473527e-01 ...,   3.71970242e-02
    8.43836692e-03   8.43836692e-03]
 [ -6.11708405e-01  -6.11708405e-01  -6.11708405e-01 ...,  -5.19644992e-01
   -5.19644992e-01  -5.19644992e-01]
 [ -4.47160476e-01  -6.76648450e-01  -5.48135185e-01 ...,  -4.47160476e-01
   -4.47160476e-01  -5.48135185e-01]
 ..., 
 [  5.80922618e-01   5.80922618e-01   5.80922618e-01 ...,  -7.89301688e-01
   -7.89301688e-01  -7.89301688e-01]
 [  4.73374733e-01   4.733747

In [15]:
from sklearn.preprocessing import LabelEncoder
import statsmodels.formula.api as sm
import collections
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from sklearn.preprocessing import LabelEncoder
import statsmodels.formula.api as sm
import random
import collections

def ofs(files):
    df=pd.read_csv(files)
    encoder = LabelEncoder()
    df['wdire'] = encoder.fit_transform(df['wdire'])
    df['pressurem']=df['pressurem'].abs()
    df['dewpti']=df['dewpti'].abs()
    df['wspdm']=df['wspdm'].abs()
    df['precip_ratem']=df['precip_ratem'].abs()
    df['hum']=df['hum'].abs()
    df['wdird']=df['wdird'].abs()
    df['wdire']=df['wdire'].abs()
    df['precip_totalm']=df['precip_totalm'].abs()
#when month and day were added, var came out to be around 0.61
    x1 = 'dewpti' #dewpti is okay
    x2 = 'precip_totalm' #okay
    x3 = 'wspdm' #okay
    x4 = 'wgustm' #okay
    x5 = 'precip_ratem' #okay
    x6 = 'pressurem' #presuurem is okay
    x7 = 'hum' #okay
    x8 = 'wdire' #removing this gives values till 14000 rows 
    x9='wdird'
    y = 'tempm'

    df1=df[['dewpti','precip_totalm','wspdm','wgustm','precip_ratem','pressurem','hum','wdird','wdire','tempm']]
#scaler = MinMaxScaler()
#minmax_scaled_df = scaler.fit_transform(df1)
#minmax_scaled_df = pd.DataFrame(minmax_scaled_df, columns=[x1,x2, x3,x5, x6,x7, x8,x9,y])

    scaler = StandardScaler().fit_transform(df1)
    minmax_scaled_df = pd.DataFrame(scaler, columns=[x1,x2, x3,x4, x5,x6, x7,x8,x9,y])
#rescaledX = scaler.transform(X)


#sliding_window_values_sens = []
    X_values=[]
    Y_values=[]

        #iterate over each row of the dataset as individual measurement
    for line in minmax_scaled_df.itertuples():
#for line in df.itertuples():
    
            #extracting the values needed
        values=[getattr(line,x1),getattr(line,x2),getattr(line,x3),getattr(line,x4),getattr(line,x5),getattr(line,x6),getattr(line,x7),getattr(line,x8),getattr(line,x9)]
    #,getattr(line,x9),getattr(line,x10)
        X_values.append(values)
        y_value = [getattr(line,y)]
        Y_values.append(y_value)
        X_array = (np.array(X_values))
        Y_array = (np.array(Y_values))
    #window_data=pd.DataFrame(data)
    #window_data.columns= [x1,x2,x3,x4,x5,x6,x7,x8,y]

    
    
    X=X_array.T
    Y=Y_array.T
    #print(X.shape)
    nt = 1000
    alg2 = StreamFeatWeight(X.shape[0], k=1)
    

    n = 5
    result = []
    for head in range(0, X.shape[1], nt): # each time step
    
        scores = alg2.low_rank_approximation(X[:, head:head+nt])
    #end = time.clock()
    #print(head)
        selected_index = np.argsort(scores)[::-1][:n] #this is correct! #from stackoverflow
    #print(selected_index)
    #result.append(selected_index)
    #print(X[selected_index,head:head+nt].T) #us timestep k saare selected index k values ko print karra hai 'nt elementrs ko'
    #print(X[selected_index,head:head+nt].T.shape)
        result.append(X[selected_index,head:head+nt].T)
    #print(len(result))
    #result

    Xtra = np.array(result)
#Xtra = (np.array(result[:-1])) #removing the last record because the last record wasn't giving 3D array
    print(Xtra.shape)
    xtra = np.concatenate( Xtra, axis=0 )    
    new_array=np.hstack((xtra,Y_array))
    print(new_array.shape)
    name=['X1','X2','X3','X4','X5','tempm']
    new_df=pd.DataFrame(new_array,columns=name)
    a = online_reg1(files,new_df)
#Xtra = Xtra.reshape((-1,n)) 
    
    

In [8]:
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from sklearn.preprocessing import LabelEncoder
import statsmodels.formula.api as sm
import random
import collections
#import pycast.errors.symmetricmeanabsolutepercentageerror.SymmetricMeanAbsolutePercentageError as smp
d=[]
parameters = []
def online_reg1(file,data):
    #Step 2 : Sliding window of 576 measurements over ALL features
    df = data
    files=file
    #print("filename:",files)
    #df = files
#'dewpti', 'precip_totalm', 'wspdm','wgustm','precip_ratem','pressurem','hum','wdire']]
    
    x1 = 'X1'
    x2='X2'
    x3='X3'
    x4='X4'
    x5='X5'
    y = 'tempm'
    xvalues =[]
   
        #iterate over each row of the dataset as individual measurement
    for line in df.itertuples():
        values1 =[getattr(line,x1),getattr(line,x2),getattr(line,x3),getattr(line,x4),getattr(line,x5),getattr(line,y)]
        xvalues.append(values1)
        data = list(xvalues)
        window_data=pd.DataFrame(data)
        window_data.columns= [x1,x2,x3,x4,x5,y]
       
    df2 = window_data[['X1','X2','X3','X4','X5']]
    
    c=np.arange(0.1, 1.0, 0.1)
    a = random.sample(list(c), 6)

    for i in range(1,6):
        #print("Iteration:",i)
        X = df2
        y = window_data['tempm']
        #ran = round(random.uniform(0.1,0.9), 1)
        #print("random number", ran)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=a[i], random_state=42)

# Create linear regression object
        regr = linear_model.LinearRegression()

# Train the model using the training sets
        regr.fit(X_train, y_train)

# Make predictions using the testing set
        y_pred = regr.predict(X_test)


# The coefficients
        coeff = regr.coef_
       # print('Coefficients: \n', regr.coef_)
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test,y_pred)
        rmse = sqrt(mean_squared_error(y_test,y_pred))
        r2=r2_score(y_test, y_pred)
      
        values= [files,i,mae,mse,rmse,r2,coeff]
        d.append(values)

  

In [17]:
values= ['files','i','mae','mse','rmse','r2','coeff']
#e = d.
df1 = pd.DataFrame(d, columns = values)
#print(df.coeff==5)
df1

Unnamed: 0,files,i,mae,mse,rmse,r2,coeff
0,Data_isha\IABERDEE25_final.csv,1,0.687784,0.780263,0.883325,0.224614,"[0.269257104727, -0.0101726854834, 0.053492820..."
1,Data_isha\IABERDEE25_final.csv,2,0.684957,0.773388,0.879425,0.230435,"[0.302913240598, -0.0362445226243, 0.040368614..."
2,Data_isha\IABERDEE25_final.csv,3,0.682928,0.779475,0.882879,0.229441,"[0.341482038057, -0.0615127300768, 0.047944388..."
3,Data_isha\IABERDEE25_final.csv,4,0.68594,0.779735,0.883026,0.230039,"[0.331073051901, -0.0555556362305, 0.046761396..."
4,Data_isha\IABERDEE25_final.csv,5,0.686395,0.782167,0.884402,0.228181,"[0.336318821093, -0.0592137606676, 0.045953163..."
5,Data_isha\ICORNWAL26_final.csv,1,0.504284,0.434269,0.658991,0.567826,"[0.554443202204, -0.0540231565267, 0.243931862..."
6,Data_isha\ICORNWAL26_final.csv,2,0.500015,0.430141,0.655852,0.567448,"[0.557823265867, -0.0588629970874, 0.250713286..."
7,Data_isha\ICORNWAL26_final.csv,3,0.497668,0.427442,0.65379,0.564263,"[0.559579882033, -0.0573780844374, 0.240384889..."
8,Data_isha\ICORNWAL26_final.csv,4,0.500105,0.427255,0.653648,0.566668,"[0.559444259032, -0.057437053275, 0.2461886402..."
9,Data_isha\ICORNWAL26_final.csv,5,0.504977,0.435234,0.659723,0.567743,"[0.557123448764, -0.0635172863435, 0.251802862..."


In [18]:
#iter1
print(df1[['rmse']].iloc[[0,5,10,15,20,25,30,35]].mean(axis=0))
print(df1[['r2']].iloc[[0,5,10,15,20,25,30,35]].mean(axis=0))
print(df1[['mse']].iloc[[0,5,10,15,20,25,30,35]].mean(axis=0))
print(df1[['mae']].iloc[[0,5,10,15,20,25,30,35]].mean(axis=0))

rmse    0.764978
dtype: float64
r2    0.410198
dtype: float64
mse    0.591142
dtype: float64
mae    0.587453
dtype: float64


In [19]:
#iter2
print(df1[['rmse']].iloc[[1,6,11,16,21,26,31,36]].mean(axis=0))
print(df1[['r2']].iloc[[1,6,11,16,21,26,31,36]].mean(axis=0))
print(df1[['mse']].iloc[[1,6,11,16,21,26,31,36]].mean(axis=0))
print(df1[['mae']].iloc[[1,6,11,16,21,26,31,36]].mean(axis=0))

rmse    0.760989
dtype: float64
r2    0.416194
dtype: float64
mse    0.585274
dtype: float64
mae    0.588837
dtype: float64


In [20]:
#iter3
print(df1[['rmse']].iloc[[2,7,12,17,22,27,32,37]].mean(axis=0))
print(df1[['r2']].iloc[[2,7,12,17,22,27,32,37]].mean(axis=0))
print(df1[['mse']].iloc[[2,7,12,17,22,27,32,37]].mean(axis=0))
print(df1[['mae']].iloc[[2,7,12,17,22,27,32,37]].mean(axis=0))

rmse    0.763392
dtype: float64
r2    0.413375
dtype: float64
mse    0.588688
dtype: float64
mae    0.586775
dtype: float64


In [21]:
#iter4
print(df1[['rmse']].iloc[[3,8,13,18,23,28,33,38]].mean(axis=0))
print(df1[['r2']].iloc[[3,8,13,18,23,28,33,38]].mean(axis=0))
print(df1[['mse']].iloc[[3,8,13,18,23,28,33,38]].mean(axis=0))
print(df1[['mae']].iloc[[3,8,13,18,23,28,33,38]].mean(axis=0))

rmse    0.767638
dtype: float64
r2    0.408175
dtype: float64
mse    0.594831
dtype: float64
mae    0.589364
dtype: float64


In [22]:
#iter5
print(df1[['rmse']].iloc[[4,9,14,19,24,29,34,39]].mean(axis=0))
print(df1[['r2']].iloc[[4,9,14,19,24,29,34,39]].mean(axis=0))
print(df1[['mse']].iloc[[4,9,14,19,24,29,34,39]].mean(axis=0))
print(df1[['mae']].iloc[[4,9,14,19,24,29,34,39]].mean(axis=0))

rmse    0.765548
dtype: float64
r2    0.411182
dtype: float64
mse    0.591665
dtype: float64
mae    0.587417
dtype: float64


In [14]:
#partial fit regressor
import statsmodels.formula.api as sm
from sklearn import linear_model
import collections
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from sklearn.preprocessing import LabelEncoder
import statsmodels.formula.api as sm
import random
import collections

param= []
def online_reg(file,data):
    df = data
    files=file
    x1 = 'X1'
    x2='X2'
    x3='X3'
    x4='X4'
    x5='X5'
    y = 'tempm'
    #print(files)
    for line in df.itertuples():
        values=[getattr(line,x1),getattr(line,x2),getattr(line,x3),getattr(line,x4),getattr(line,x5)]
        y_value = [getattr(line,y)]
        X = (np.array(values)).reshape(1,5)
        Y = (np.array(y_value)).reshape(1,)
    
    #model = linear_model.SGDRegressor(learning_rate = 'constant', eta0 = 0.1, shuffle = False, n_iter = 1)
        model = linear_model.SGDRegressor() #simple sgdregressor with default parameters
        model.partial_fit(X,Y)
        param_sensor= model.coef_
    #print(param_sensor)
    param.append(param_sensor)
    
    #for i
    #print(param)
    return param
    #print(model.coef_)
    #print(model.intercept_)
#print(model.coef_)
#print(model.intercept_)
    
    