In [33]:
import numpy as np
import numpy.linalg as ln

class StreamFeatWeight:
    """
    Streaming update of feature weights at time t
    Input  : Yt: m by nt matrix, 
             B : sketch matrix size m by l
    Output : Feature importance score
    """
   
    def __init__(self, m, k, l=0):
        """
        m : no of features initially
        k : no of singular vectors (this can be the same as the number of clusters in the dataset)
        l : sketch size for a sketched matrix B( m-by-l )
        """

        self.m = m
        self.k = k
        if l < 1: self.l = int(np.sqrt(self.m))
        else: self.l = l

    def low_rank_approximation(self, Yt):
        """
        Calculation of low rank approximation
        sketched matrix B is updated on basis of new inputs at timestep t
        
        :param Yt: m-by-nt input matrix from data stream 
        Yt is the data items introduced at time step t
        
        output: weight of each feature
        """
        #Step 1
        # combine current sketched matrix with input at time t(Yt)
       
        if hasattr(self, 'B'): #(object, name)
            C = np.hstack((self.B, Yt)) # C is m by (n+l) matrix
            n = Yt.shape[1] # it will be n_t
        else:
            # an initial sketch matrix for Y0
            self.B = Yt[:, :self.l]
            C = np.hstack((self.B, Yt[:, self.l:]))
            n = Yt.shape[1] - self.l
            
            print(C)
        
        # Step 2 :Singular value decomposition
        U, s, V = ln.svd(C, full_matrices=False)  #editied at 6 pm, 30 july, uncomment to see it work
        #U, s, V = ln.svd(C)
        U = U[:, :self.l]
        s = s[:self.l]
        V = V[:, :self.l]

        # Step 3 : shrink singular values in Frequent Directions algorithm
       
        delta = s[-1] ** 2 #shrink values on the basis of squared smallest singlar value
        s = np.sqrt(s ** 2 - delta)

        # Step 4 : update sketched matrix B
    
        self.B = np.dot(U, np.diag(s))

        # In Section 5.1, for all experiments,
        # the authors have set alpha = 2^3 * sigma_k based on the pre-experiment
        alpha = (2 ** 3) * s[self.k-1]

        # Step 5: solving the ridge regression by using the top-k singular values
       
        D = np.diag(s[:self.k] / (s[:self.k] ** 2 + alpha))
        #step 6: X: m by k matrix (k <= l)
        X = np.dot(U[:, :self.k], D)
        
        #step 7: returning maximum value of X
        return np.amax(abs(X), axis=1)


In [48]:

#code to access all files in a folder given the location of folder.
#and concatenating all features(from all files) into one result dataframe
import pandas as pd
import numpy as np
import os
#rootdir = 'C:/Users/sid/Desktop/test' 

class central():
    def __init__(self, rootdir):
        self.rootdir = rootdir

#rootdir = 'Data_isha'
    def all_files(self):
        
        #rootdir = rootdir
        file_array = []
        for subdir, dirs, files in os.walk(self.rootdir):
            for file in files:
        #print(os.path.join(subdir, file)) to print names of all the csv files
                file_array.append(os.path.join(subdir, file))
#print(file_array)
        for files in file_array:
            df = ofs(files)
            #return df
            #Commented below 2 lines
            #param_list = sliding_win(df) 
            #return param_list
#files = [f for f in os.listdir('Data_isha') if os.path.isfile(f)]
        '''for files in file_array:
            param_list = slidin_win(files)
        return param_list '''          #print(abc)
#print(result.columns.values)


In [52]:
v = central('Data_isha')
filearray = v.all_files()

[[-0.2748571  -0.2748571  -0.25735982 ..., -0.65979732 -0.60730548
  -0.60730548]
 [-0.23639744 -0.31177241 -0.31177241 ..., -0.31177241 -0.31177241
  -0.31177241]
 [ 1.51809222  0.27081909 -0.03620199 ...,  1.21107114  0.57784017
   1.21107114]
 ..., 
 [ 0.9689295   0.9689295   0.9689295  ..., -1.11646309 -1.11646309
  -1.11646309]
 [ 0.88741587  0.51098582 -0.05365926 ...,  1.37677495  1.74065733
   1.3140366 ]
 [ 1.88847028  1.57418338  0.31703579 ..., -0.9401118  -1.2543987
  -0.9401118 ]]
(31,)
(30984, 6)
[[  5.54852856e-01   5.54852856e-01   5.40473527e-01 ...,   3.71970242e-02
    8.43836692e-03   8.43836692e-03]
 [ -6.11708405e-01  -6.11708405e-01  -6.11708405e-01 ...,  -5.19644992e-01
   -5.19644992e-01  -5.19644992e-01]
 [ -4.47160476e-01  -6.76648450e-01  -5.48135185e-01 ...,  -4.47160476e-01
   -4.47160476e-01  -5.48135185e-01]
 ..., 
 [  5.80922618e-01   5.80922618e-01   5.80922618e-01 ...,  -7.89301688e-01
   -7.89301688e-01  -7.89301688e-01]
 [  4.73374733e-01   4.733747

In [53]:
print(filearray)

None


In [51]:
from sklearn.preprocessing import LabelEncoder
import statsmodels.formula.api as sm
import collections
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from sklearn.preprocessing import LabelEncoder
import statsmodels.formula.api as sm
import random
import collections

def ofs(files):
    df=pd.read_csv(files)
    encoder = LabelEncoder()
    df['wdire'] = encoder.fit_transform(df['wdire'])
    df['pressurem']=df['pressurem'].abs()
    df['dewpti']=df['dewpti'].abs()
    df['wspdm']=df['wspdm'].abs()
    df['precip_ratem']=df['precip_ratem'].abs()
    df['hum']=df['hum'].abs()
    df['wdird']=df['wdird'].abs()
    df['wdire']=df['wdire'].abs()
    df['precip_totalm']=df['precip_totalm'].abs()
#when month and day were added, var came out to be around 0.61
    x1 = 'dewpti' #dewpti is okay
    x2 = 'precip_totalm' #okay
    x3 = 'wspdm' #okay
    x4 = 'wgustm' #okay
    x5 = 'precip_ratem' #okay
    x6 = 'pressurem' #presuurem is okay
    x7 = 'hum' #okay
    x8 = 'wdire' #removing this gives values till 14000 rows 
    x9='wdird'
    y = 'tempm'

    df1=df[['dewpti','precip_totalm','wspdm','wgustm','precip_ratem','pressurem','hum','wdird','wdire','tempm']]
#scaler = MinMaxScaler()
#minmax_scaled_df = scaler.fit_transform(df1)
#minmax_scaled_df = pd.DataFrame(minmax_scaled_df, columns=[x1,x2, x3,x5, x6,x7, x8,x9,y])

    scaler = StandardScaler().fit_transform(df1)
    minmax_scaled_df = pd.DataFrame(scaler, columns=[x1,x2, x3,x4, x5,x6, x7,x8,x9,y])
#rescaledX = scaler.transform(X)


#sliding_window_values_sens = []
    X_values=[]
    Y_values=[]

        #iterate over each row of the dataset as individual measurement
    for line in minmax_scaled_df.itertuples():
#for line in df.itertuples():
    
            #extracting the values needed
        values=[getattr(line,x1),getattr(line,x2),getattr(line,x3),getattr(line,x4),getattr(line,x5),getattr(line,x6),getattr(line,x7),getattr(line,x8),getattr(line,x9)]
    #,getattr(line,x9),getattr(line,x10)
        X_values.append(values)
        y_value = [getattr(line,y)]
        Y_values.append(y_value)
        X_array = (np.array(X_values))
        Y_array = (np.array(Y_values))
    #window_data=pd.DataFrame(data)
    #window_data.columns= [x1,x2,x3,x4,x5,x6,x7,x8,y]

    
    
    X=X_array.T
    Y=Y_array.T
    #print(X.shape)
    nt = 1000
    alg2 = StreamFeatWeight(X.shape[0], k=1)
    

    n = 5
    result = []
    for head in range(0, X.shape[1], nt): # each time step
    
        scores = alg2.low_rank_approximation(X[:, head:head+nt])
    #end = time.clock()
    #print(head)
        selected_index = np.argsort(scores)[::-1][:n] #this is correct! #from stackoverflow
    #print(selected_index)
    #result.append(selected_index)
    #print(X[selected_index,head:head+nt].T) #us timestep k saare selected index k values ko print karra hai 'nt elementrs ko'
    #print(X[selected_index,head:head+nt].T.shape)
        result.append(X[selected_index,head:head+nt].T)
    #print(len(result))
    #result

    Xtra = (np.array(result))
#Xtra = (np.array(result[:-1])) #removing the last record because the last record wasn't giving 3D array
    print(Xtra.shape)
    xtra = np.concatenate( Xtra, axis=0 )    
    new_array=np.hstack((xtra,Y_array))
    print(new_array.shape)
    name=['X1','X2','X3','X4','X5','tempm']
    new_df=pd.DataFrame(new_array,columns=name)
    a = slidin_win(files,new_df)
#Xtra = Xtra.reshape((-1,n)) 
    

In [50]:
#sliding window code 
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from sklearn.preprocessing import LabelEncoder
import statsmodels.formula.api as sm
import random
import collections
#import pycast.errors.symmetricmeanabsolutepercentageerror.SymmetricMeanAbsolutePercentageError as smp
d=[]
parameters = []
def slidin_win(file,data):
    #Step 2 : Sliding window of 576 measurements over ALL features
    #df = pd.read_csv(filesa)
    #print("filename:",files)
    #df = files
#'dewpti', 'precip_totalm', 'wspdm','wgustm','precip_ratem','pressurem','hum','wdire']]
    files=file
    df2 = data
    x1='X1'
    x2='X2'
    x3='X3'
    x4='X4'
    x5='X5'
    y='tempm'
    

    sliding_window_values_sens = collections.deque(maxlen=576)
        #iterate over each row of the dataset as individual measurement
    for line in df2.itertuples():
    
            #extracting the values needed
        values=[getattr(line,x1),getattr(line,x2),getattr(line,x3),getattr(line,x4),getattr(line,x5),getattr(line,y)]  #extracting the values needed
            #appending window
        sliding_window_values_sens.append(values)
        data = list(sliding_window_values_sens)
        window_data=pd.DataFrame(data)
        window_data.columns= [x1,x2,x3,x4,x5,y]
        query = "window_data['tempm'] ~ window_data[x1] + window_data[x2] + window_data[x3] + window_data[x4] + window_data[x5]" #+x3+x4+x5+x6+x7+x8
        result = sm.ols(formula=query, data=window_data).fit()
        param_sensor=list(result.params)
    parameters.append(param_sensor)
        
        #here take parameters and append them to array later return all parameters when instance of central class will be called
        #from 
        
    y = window_data['tempm']
    #x1 = window_data['dewpti']
    #x2 = window_data['precip_totalm']
    #x3 = window_data['wspdm']
    #x4 = window_data['wgustm']
    #x5 = window_data['precip_ratem']
    #x6 = window_data['pressurem']
    #x7 = window_data['hum']
    #x8 = window_data['wdire']
    df3= window_data[['X1','X2','X3','X4','X5']]
    
    c=np.arange(0.1, 1.0, 0.1)
    a = random.sample(list(c), 6)

    for i in range(1,6):
        #print("Iteration:",i)
        X = df3
        y = window_data['tempm']
        #ran = round(random.uniform(0.1,0.9), 1)
        #print("random number", ran)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=a[i], random_state=42)

# Create linear regression object
        regr = linear_model.LinearRegression()

# Train the model using the training sets
        regr.fit(X_train, y_train)

# Make predictions using the testing set
        y_pred = regr.predict(X_test)


# The coefficients
        coeff_array=[] #trial
        coeff = regr.coef_
        coeff_array.append(regr.coef_) #trial
       # print('Coefficients: \n', regr.coef_)
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test,y_pred)
        rmse = sqrt(mean_squared_error(y_test,y_pred))
        r2=r2_score(y_test, y_pred)
        #smap = smp.local_error(y_test,y_pred)
        
# Explained variance score: 1 is perfect prediction
        #print('Variance score: %.2f' % r2_score(y_test, y_pred))
        values= [files,i,a[i],mae,mse,rmse,r2,coeff]
        d.append(values)
        param.append(coeff_array[4])

#print(d)
#print("Parameters",parameters)

    #return coeff
    return param #not yet tested

In [35]:
values= ['file','i','testsize','mae','mse','rmse','r2','coeff'] #nt=2000
#e = d.
d_f1 = pd.DataFrame(d, columns = values)
#print(df.coeff==5)
d_f1

Unnamed: 0,file,i,testsize,mae,mse,rmse,r2,coeff
0,Data_isha\IABERDEE25_final.csv,1,0.8,0.405347,0.207136,0.455122,0.735903,"[6.00921216652, 0.0, -0.670138867587]"
1,Data_isha\IABERDEE25_final.csv,2,0.4,0.441249,0.239161,0.489041,0.690266,"[5.94672083742, 0.0, -0.671430910557]"
2,Data_isha\IABERDEE25_final.csv,3,0.6,0.413558,0.218017,0.466923,0.718438,"[5.65376574059, 0.0, -0.667550727769]"
3,Data_isha\IABERDEE25_final.csv,4,0.3,0.445269,0.24489,0.494864,0.708066,"[5.79582462803, 0.0, -0.671275264458]"
4,Data_isha\IABERDEE25_final.csv,5,0.1,0.45236,0.250073,0.500073,0.660387,"[6.08527809779, 0.0, -0.69580029565]"
5,Data_isha\ICORNWAL26_final.csv,1,0.4,0.304799,0.185381,0.430558,0.704641,"[0.718839091482, 0.0457919641727, 0.13641653411]"
6,Data_isha\ICORNWAL26_final.csv,2,0.6,0.297581,0.178576,0.422583,0.710541,"[0.705445923457, 0.113000884814, 0.0766465809332]"
7,Data_isha\ICORNWAL26_final.csv,3,0.5,0.31173,0.190646,0.43663,0.699997,"[0.711028315733, 0.0915261481433, 0.102214949906]"
8,Data_isha\ICORNWAL26_final.csv,4,0.2,0.301975,0.181824,0.426408,0.729625,"[0.736053009556, 0.0608000675632, 0.112148372672]"
9,Data_isha\ICORNWAL26_final.csv,5,0.7,0.288514,0.168218,0.410144,0.716147,"[0.726948363635, 0.177896482182, 0.0158351601495]"


In [54]:
#nt=1000
values= ['file','i','testsize','mae','mse','rmse','r2','coeff'] #nt=2000
#e = d.
df_1 = pd.DataFrame(d, columns = values)
#print(df.coeff==5)
df_1

Unnamed: 0,file,i,testsize,mae,mse,rmse,r2,coeff
0,Data_isha\IABERDEE25_final.csv,1,0.7,0.024818,0.000917,0.030283,0.998813,"[0.00609735614708, -8.881784197e-16, -0.496988..."
1,Data_isha\IABERDEE25_final.csv,2,0.6,0.024762,0.000909,0.030152,0.998826,"[-0.0261119315788, 3.33066907388e-16, -0.49611..."
2,Data_isha\IABERDEE25_final.csv,3,0.1,0.024098,0.000871,0.029513,0.998817,"[-0.028312080722, -2.77555756156e-16, -0.49388..."
3,Data_isha\IABERDEE25_final.csv,4,0.5,0.024583,0.000913,0.030215,0.998807,"[-0.0214629677563, -5.55111512313e-17, -0.4960..."
4,Data_isha\IABERDEE25_final.csv,5,0.9,0.026052,0.000989,0.031444,0.998717,"[-0.0924693760419, 0.0, -0.50066679614, -0.074..."
5,Data_isha\ICORNWAL26_final.csv,1,0.4,0.01668,0.000449,0.021185,0.999285,"[1.13758187648, -0.0046971931955, 0.0172610621..."
6,Data_isha\ICORNWAL26_final.csv,2,0.3,0.016888,0.000455,0.021326,0.999298,"[1.13882366201, -0.0035344329393, 0.0147936813..."
7,Data_isha\ICORNWAL26_final.csv,3,0.9,0.016034,0.00044,0.020976,0.999266,"[1.135800379, 0.00694188052926, 0.007819907863..."
8,Data_isha\ICORNWAL26_final.csv,4,0.8,0.016214,0.000425,0.020615,0.999293,"[1.13672871791, -0.00522083002946, 0.013409336..."
9,Data_isha\ICORNWAL26_final.csv,5,0.2,0.016804,0.000446,0.021107,0.999338,"[1.13850357826, -0.00396023717073, 0.014435484..."


In [55]:
#iter1
print(df_1[['rmse']].iloc[[0,5,10,15,20,25,30,35]].mean(axis=0))
print(df_1[['r2']].iloc[[0,5,10,15,20,25,30,35]].mean(axis=0))
print(df_1[['mse']].iloc[[0,5,10,15,20,25,30,35]].mean(axis=0))
print(df_1[['mae']].iloc[[0,5,10,15,20,25,30,35]].mean(axis=0))

rmse    0.151043
dtype: float64
r2    0.833015
dtype: float64
mse    0.052152
dtype: float64
mae    0.120805
dtype: float64


In [56]:
#iter2
print(df_1[['rmse']].iloc[[1,6,11,16,21,26,31,36]].mean(axis=0))
print(df_1[['r2']].iloc[[1,6,11,16,21,26,31,36]].mean(axis=0))
print(df_1[['mse']].iloc[[1,6,11,16,21,26,31,36]].mean(axis=0))
print(df_1[['mae']].iloc[[1,6,11,16,21,26,31,36]].mean(axis=0))

rmse    0.149377
dtype: float64
r2    0.838512
dtype: float64
mse    0.0498
dtype: float64
mae    0.118383
dtype: float64


In [57]:
#iter3
print(df_1[['rmse']].iloc[[2,7,12,17,22,27,32,37]].mean(axis=0))
print(df_1[['r2']].iloc[[2,7,12,17,22,27,32,37]].mean(axis=0))
print(df_1[['mse']].iloc[[2,7,12,17,22,27,32,37]].mean(axis=0))
print(df_1[['mae']].iloc[[2,7,12,17,22,27,32,37]].mean(axis=0))

rmse    0.149193
dtype: float64
r2    0.825297
dtype: float64
mse    0.049309
dtype: float64
mae    0.118535
dtype: float64


In [58]:
#iter4
print(df_1[['rmse']].iloc[[3,8,13,18,23,28,33,38]].mean(axis=0))
print(df_1[['r2']].iloc[[3,8,13,18,23,28,33,38]].mean(axis=0))
print(df_1[['mse']].iloc[[3,8,13,18,23,28,33,38]].mean(axis=0))
print(df_1[['mae']].iloc[[3,8,13,18,23,28,33,38]].mean(axis=0))

rmse    0.149537
dtype: float64
r2    0.824821
dtype: float64
mse    0.048766
dtype: float64
mae    0.11885
dtype: float64


In [59]:
#iter5
print(df_1[['rmse']].iloc[[4,9,14,19,24,29,34,39]].mean(axis=0))
print(df_1[['r2']].iloc[[4,9,14,19,24,29,34,39]].mean(axis=0))
print(df_1[['mse']].iloc[[4,9,14,19,24,29,34,39]].mean(axis=0))
print(df_1[['mae']].iloc[[4,9,14,19,24,29,34,39]].mean(axis=0))

rmse    0.147524
dtype: float64
r2    0.840631
dtype: float64
mse    0.048373
dtype: float64
mae    0.117332
dtype: float64
