# [Team 117] Proj-C: Terrain Identification from Time Series Data

### Kartik Rawool (khrawool)
### Kartiki Bhandakkar (kbhanda3)
### Subodh Gujar (sgujar)

## Connecting to Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime; every CSV path used by the
# later cells lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


## Checking Configuration

In [None]:
!nvidia-smi

Mon Apr 24 02:03:39 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Importing libraries

In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats as st
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix

In [None]:
# Path templates for the processed training sessions; "{}" is filled in with
# the session number through str.format by the loading cells.
# data_path -> sensor file: upsample_data reads columns 0-5 as the features
#              and compares column 6 against the label file's column 1.
# y_path    -> label file: upsample_data reads column 0 as the label and
#              column 1 as the value matched against the sensor column 6.
data_path = "/content/drive/MyDrive/ECE542_sp2022_Project_TerrainRecognition/new_data/TrainingData/session_0{}_x.csv"
y_path = "/content/drive/MyDrive/ECE542_sp2022_Project_TerrainRecognition/new_data/TrainingData/session_0{}_y.csv"


### Loading all train data and validation data

In [None]:
# Sessions 0-26 form the training set; every file is read without a header row.
train_x = [pd.read_csv(data_path.format(session), header=None) for session in range(27)]
train_y = [pd.read_csv(y_path.format(session), header=None) for session in range(27)]

# No separate validation split is loaded. Sessions 27 and 28 are held out here
# and are the frames later windowed into X_test / y_test.
test_x = [pd.read_csv(data_path.format(session), header=None) for session in range(27, 29)]
test_y = [pd.read_csv(y_path.format(session), header=None) for session in range(27, 29)]


In [None]:
# Session 0 on its own, wrapped in one-element lists so it has the same
# list-of-frames shape as train_x / train_y.
sub1_x = [pd.read_csv(data_path.format(0), header=None)]
sub1_y = [pd.read_csv(y_path.format(0), header=None)]

### Loading test data

In [None]:
# Path templates for the unlabeled test sessions; "{}" is filled with one of
# the ids in `user`.
# NOTE(review): the "_y" test files are loaded into TEST_TIME and their column 0
# is later passed to down_sample as the times to predict for — presumably they
# hold timestamps only; confirm against the data.
test_x_path = "/content/drive/MyDrive/ECE542_sp2022_Project_TerrainRecognition/new_data/TestData/session_{}_x.csv"
test_y_path = "/content/drive/MyDrive/ECE542_sp2022_Project_TerrainRecognition/new_data/TestData/session_{}_y.csv"
# Ids substituted into the test path templates (and into the prediction file names).
user = ['009', '010', '011', '012']

In [None]:
# One frame per test id, in the same order as `user`: sensor rows go into
# TEST_DATA and the matching "_y" files into TEST_TIME.
TEST_DATA = [pd.read_csv(test_x_path.format(user[idx]), header=None) for idx in range(4)]
TEST_TIME = [pd.read_csv(test_y_path.format(user[idx]), header=None) for idx in range(4)]

In [None]:
#Test rows
# Print the number of rows in each of the four test sensor frames.
for i in range(0,4):
  print(len(TEST_DATA[i].index))

37991
49081
51761
45319


In [None]:
# for i in range(29):
#   print(len(data_x[i].index))

## Upsampling of data

X is sampled at 40 Hz while the labels Y are produced at 10 Hz, so we need to upsample Y until every X data point has a matching label.

In [None]:
def upsample_data(data_x, data_y):
  """Assign a label to every X row by walking X and Y times in step.

  For each (x, y) session pair the rows of `x` are visited in order while a
  cursor moves through the rows of `y`. The cursor advances by at most one row
  per X row: it stays put while the X time (column 6) is strictly closer to the
  current Y time (column 1) than to the next one, and moves on otherwise (ties
  move on). Once the cursor reaches the last Y row it stays there.

  Args:
    data_x: iterable of DataFrames; columns 0-5 are returned as features and
      column 6 is the value compared against the Y times.
    data_y: iterable of DataFrames paired with `data_x`; column 0 is the label
      and column 1 the time.

  Returns:
    (combined_x_t, combined_y_t): a list with one 6-element Series per X row
    and a list with the label chosen for that row, concatenated over all
    sessions in input order.

  Raises:
    IndexError: if a session has X rows but no Y rows.
  """
  combined_x_t = []
  combined_y_t = []
  for x, y in zip(data_x, data_y):
      if len(x) == 0:
          # Nothing to label in this session.
          continue
      # Pull the columns used for matching out once per session; scalar
      # .iloc lookups inside the row loop are far slower than array indexing.
      x_time = x.iloc[:, 6].to_numpy()
      y_label = y.iloc[:, 0].to_numpy()
      y_time = y.iloc[:, 1].to_numpy()
      last = len(y) - 1
      counter = 0
      for i in range(len(x)):
          # Advance unless we are on the last label or the current label time
          # is strictly closer. `not (a < b)` keeps the original tie/NaN
          # behavior (both advance the cursor).
          if counter != last and not (
              abs(x_time[i] - y_time[counter]) < abs(x_time[i] - y_time[counter + 1])
          ):
              counter += 1
          combined_x_t.append(x.iloc[i, :6])
          combined_y_t.append(y_label[counter])

  return combined_x_t, combined_y_t


### Normalizing data

In [None]:
def scale_data(data):
  """Standardize every column of `data` with a freshly fitted StandardScaler.

  The scaler is fitted on the frame it is given, so each call uses the
  statistics of its own input (data scaled in separate calls does not share a
  mean/scale).

  Args:
    data: DataFrame of numeric columns.

  Returns:
    A new DataFrame with the same index and columns holding the scaled values.
    The input frame is left untouched: writing the result back through
    `data.loc[:, :]` warns when `data` is a slice of another frame and when
    float results are stored into integer columns.
  """
  scalar = StandardScaler()
  scaled = scalar.fit_transform(data.to_numpy())
  return pd.DataFrame(scaled, index=data.index, columns=data.columns)

## Windowing data
As this is time series data, each prediction needs the values of the preceding data points as context. We use windows of 80 samples, i.e. 2 seconds of data at 40 Hz, and label each window with its most frequent class.

In [None]:
def get_windowed_data(data_x, data_y, sequence_len, step_size):
  """Build fixed-length feature windows and one label per window.

  The sessions are upsampled with `upsample_data`, stacked into one frame,
  scaled with `scale_data`, and cut into windows of `sequence_len` consecutive
  rows starting every `step_size` rows. Each window is labelled with the most
  frequent label among its rows (the smallest label wins a tie).

  Because all sessions are stacked before windowing, windows near a session
  boundary contain rows from two sessions.

  Args:
    data_x: list of sensor DataFrames, as accepted by `upsample_data`.
    data_y: list of label DataFrames paired with `data_x`.
    sequence_len: number of rows per window.
    step_size: distance in rows between consecutive window starts.

  Returns:
    (windowed_X, windowed_y) as NumPy arrays; one entry per window start in
    range(0, n_rows - sequence_len, step_size).
  """
  X, y = upsample_data(data_x, data_y)
  X = pd.DataFrame(X).reset_index(drop=True)
  X = scale_data(X)
  # Slice plain arrays inside the loop instead of going through .iloc.
  features = X.to_numpy()
  labels = np.asarray(y)
  windowed_X = []
  windowed_y = []
  for start in range(0, len(features) - sequence_len, step_size):
    stop = start + sequence_len
    windowed_X.append(features[start:stop])
    # Mode via np.unique: the shape of scipy.stats.mode's result changed across
    # SciPy releases, so indexing it with [0][0][0] is not portable.
    # np.unique returns sorted values and argmax picks the first maximum,
    # which matches scipy's smallest-value tie-breaking.
    classes, counts = np.unique(labels[start:stop], return_counts=True)
    windowed_y.append(classes[np.argmax(counts)])

  return np.array(windowed_X), np.array(windowed_y)


## One Hot Encoding for Y

In [None]:
def one_hot_encode_data(label, categories='auto'):
  """One-hot encode a 1-D collection of class labels.

  Args:
    label: array-like of labels; it is reshaped to a single column before
      encoding.
    categories: forwarded to `OneHotEncoder(categories=...)`. With the default
      'auto' the columns are derived from the labels passed in, so two
      separate calls (e.g. train and test) produce the same column layout
      only when both inputs contain the same set of classes. Pass an explicit
      list such as `[[0, 1, 2, 3]]` to pin the layout.

  Returns:
    Dense 2-D NumPy array with one row per label and one column per category.
  """
  encoder = OneHotEncoder(categories=categories)
  one_hot_data = encoder.fit_transform(np.array(label).reshape(-1, 1)).toarray()
  return one_hot_data


In [None]:
#final_X, final_y = get_windowed_data(sub1_x, sub1_y, 10, 1)

In [None]:
# Training windows: 80 rows per window, a new window starting at every row.
X_train, y_train = get_windowed_data(train_x, train_y, 80, 1)

  windowed_y.append(st.mode(labels)[0][0][0])


In [None]:
# Same windowing for the held-out sessions. get_windowed_data scales each call's
# data independently, so these windows are scaled with their own statistics.
X_test, y_test = get_windowed_data(test_x, test_y, 80, 1)

  windowed_y.append(st.mode(labels)[0][0][0])


In [None]:
# Number of entries in column 0 of the first held-out label frame.
len(test_y[0][0])

9860

In [None]:
# One-hot encode the window labels for the softmax / categorical_crossentropy
# models defined in the later cells.
# NOTE(review): each call fits its own encoder, so the two matrices share a
# column layout only if train and test contain the same set of classes.
y_train = one_hot_encode_data(y_train)
y_test = one_hot_encode_data(y_test)

## Plotting graphs of loss and accuracy

In [None]:
import matplotlib.pyplot as plt
def plot_history(history):
    """Show the loss curves and then the accuracy curves, one figure each.

    `history` must expose a `history` mapping with the keys 'loss',
    'val_loss', 'accuracy' and 'val_accuracy'. In both figures the training
    series is drawn in blue and the validation series in red.
    """
    # (title and y-axis text, training key, validation key)
    panels = (
        ('Loss', 'loss', 'val_loss'),
        ('Accuracy', 'accuracy', 'val_accuracy'),
    )
    for heading, train_key, val_key in panels:
        plt.title(heading)
        plt.plot(history.history[train_key], color='blue', label='train')
        plt.plot(history.history[val_key], color='red', label='validation')
        plt.ylabel(heading)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'])
        plt.show()

## Importing keras libraries

In [None]:
import keras
from keras.layers import LSTM, Dropout, Dense, Bidirectional
from keras.optimizers import Adam
import tensorflow as tf

In [None]:
# Window length in rows; used as the time dimension of the model input shapes.
sequence_len = 80

In [None]:
# Recover the integer class index of every training window from its one-hot row.
y_train_label = [np.argmax(y_train[row]) for row in range(0, len(y_train))]

## Class weight
Since the classes in the given dataset are imbalanced, we compute balanced class weights to use when training the model.

In [None]:
from sklearn.utils import class_weight
import numpy as np

# 'balanced' class weights computed over the classes present in y_train_label.
class_weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train_label), y = y_train_label)

# Map position -> weight. The positions follow the sorted order of
# np.unique(y_train_label), which equals the class index when the labels are
# 0..k-1 with no gaps.
# NOTE(review): the only use of this dict in the notebook is a commented-out
# model.fit call; the active fit call does not pass class_weight.
class_weights_dict = dict(enumerate(class_weights))

## Model Architecture

In [None]:
# Creating default model
model = keras.Sequential()
# adding Bidirectional LSTM layer: 100 units per direction, returning the full
# sequence so the next LSTM layer receives one vector per time step
model.add(Bidirectional(LSTM(100,return_sequences=True, input_shape = (sequence_len,6))))
#adding dropout with 0.2
model.add(Dropout(0.2))
# Second LSTM layer (50 units, relu activation) followed by dropout of 0.1
model.add(LSTM(50, activation="relu"))
model.add(Dropout(0.1))
# Dense head: 25 relu units, then a 4-way softmax (one output per class)
model.add(Dense(25, activation="relu"))
model.add(Dense(4, activation="softmax"))
model.compile(optimizer=Adam(learning_rate=0.01),  loss='categorical_crossentropy', metrics = ['accuracy'])
# Build explicitly with (batch, sequence_len, 6) so summary() can print shapes
model.build(input_shape = (None,sequence_len,6))
model.summary()
# The held-out windows (X_test, y_test) serve as validation data during training.
# The return value of fit() is not stored, and class_weights_dict is not passed.
model.fit(X_train, y_train, batch_size=64, epochs=25, verbose=1, validation_data=(X_test, y_test))

In [None]:
# plot_history reads `history.history[...]`; passing the model itself relies on
# the model exposing the History of the last fit() as `model.history`.
# NOTE(review): confirm for the installed Keras version, or pass the value
# returned by fit() instead.
plot_history(model)

In [None]:
# Alternative, smaller architecture; this rebinds `model`, replacing the model
# built in the previous architecture cell.
model = keras.Sequential()
# Single LSTM layer (125 units) that returns only its final output
model.add(LSTM(125, input_shape = (sequence_len, 6)))
model.add(Dropout(0.5))
model.add(Dense(units = 125, activation = 'relu'))
# 4-way softmax output, one unit per class
model.add(Dense(4, activation = 'softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[keras.metrics.categorical_accuracy])
model.summary()

In [None]:
#model.fit(X_train, y_train, batch_size=64,class_weight=class_weights_dict, epochs=1)

In [None]:
# model.save('/content/drive/MyDrive/ECE542_sp2022_Project_TerrainRecognition/lstm_c1')



In [None]:
#saving model
# Written to the current working directory under the name 'lstm_v4.keras'.
# NOTE(review): the only active fit() call in the notebook trains the earlier
# BiLSTM model; the model bound to `model` here was rebuilt afterwards and its
# fit call is commented out — confirm this saves the intended weights.
model.save('lstm_v4.keras')

In [None]:
#loading keras model
# NOTE(review): this loads 'lstm_80.keras', which is not the file name written
# by the save cell ('lstm_v4.keras') — confirm which checkpoint is intended.
model = keras.models.load_model('lstm_80.keras')

In [None]:
# Evaluate the loaded model on the held-out windows; the result lists the loss
# followed by the compiled metric.
model.evaluate(X_test, y_test)



[0.32887038588523865, 0.9222258925437927]

In [None]:
# Per-window class scores from the softmax output (one row per window).
y_test_prob = model.predict(X_test, verbose=1)



In [None]:
# (number of windows, number of classes)
y_test_prob.shape

(87497, 4)

In [None]:
# Scores of the first window, one value per class.
y_test_prob[0]

array([9.9982619e-01, 6.2934383e-08, 7.5122807e-06, 1.6609704e-04],
      dtype=float32)

In [None]:
# Index of the highest-scoring class for window 40.
np.argmax(y_test_prob[40])

0

In [None]:
# Collapse each row of class scores to the index of its largest entry.
end = len(y_test_prob)
y_test_pred = [np.argmax(y_test_prob[row]) for row in range(0, end)]


## Making Predictions

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats as st
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
import keras
from keras.layers import LSTM, Dropout, Dense
import tensorflow as tf

In [None]:
# Print the number of rows in each of the four TEST_TIME frames, i.e. how many
# predictions have to be produced per test id.
for i in range(0,4):
  print(len(TEST_TIME[i].index))

## Scaling testing Data 

In [None]:
def scale_data(data):
  """Standardize every column of `data` with a freshly fitted StandardScaler.

  The scaler is fitted on the frame passed in, so test data scaled here uses
  its own statistics rather than those of any other data set.

  Args:
    data: DataFrame of numeric columns.

  Returns:
    A new DataFrame with the same index and columns holding the scaled values.
    The input frame is not modified: assigning the result through
    `data.loc[:, :]` warns when `data` is a slice of another frame and when
    float results are stored into integer columns.
  """
  scalar = StandardScaler()
  scaled = scalar.fit_transform(data.to_numpy())
  return pd.DataFrame(scaled, index=data.index, columns=data.columns)

## Creating window for testing data

In [None]:
def get_windowed_data_test(data_x, sequence_len, step_size):
  """Cut one unlabeled sensor frame into scaled, fixed-length windows.

  Args:
    data_x: DataFrame whose first six columns are the features.
    sequence_len: number of rows per window.
    step_size: distance in rows between consecutive window starts.

  Returns:
    NumPy array with one window per start in
    range(0, n_rows - sequence_len, step_size). With step_size 1 this is
    n_rows - sequence_len windows, so the final full window is not emitted.
    NOTE(review): match_to_x_time returns len(y_pred) + sequence_len values,
    which equals n_rows for this count — keep the two in step if this bound
    is ever changed.
  """
  # Copy the feature columns so scaling never writes into the caller's frame
  # (scaling a bare .iloc slice in place triggers chained-assignment warnings).
  X = data_x.iloc[:, :6].copy()
  X = scale_data(X)
  features = X.to_numpy()
  windowed_X = [
      features[start:start + sequence_len]
      for start in range(0, len(features) - sequence_len, step_size)
  ]
  return np.array(windowed_X)


## Converting probability prediction to label 

In [None]:
def get_class_labels(y_test_prob):
  """Return, for each row of class scores, the index of its largest entry.

  Rows are accessed positionally as `y_test_prob[0]`, `y_test_prob[1]`, ...;
  ties resolve to the lowest index (np.argmax). The result is a plain list
  with one entry per row.
  """
  return [np.argmax(y_test_prob[row]) for row in range(0, len(y_test_prob))]

## Expanding window predictions to one prediction per X time step

In [None]:
def match_to_x_time(y_pred, sequence_len):
  """Expand per-window predictions to len(y_pred) + sequence_len predictions.

  Output position i is the most frequent value in a slice of `y_pred`
  (the smallest value wins a tie):

  * i < sequence_len:        y_pred[0 : i + 1]
  * i > len(y_pred) + 1:     y_pred[i - sequence_len : len(y_pred) + 1]
  * otherwise:               y_pred[i - sequence_len + 1 : i + 1]

  Args:
    y_pred: sequence of predicted class labels, one per window.
    sequence_len: window length that was used to build the windows.

  Returns:
    List of len(y_pred) + sequence_len labels.

  Raises:
    ValueError: if one of the slices is empty (for example when `y_pred` is
      empty, or for a sequence_len below 3).
  """
  n_windows = len(y_pred)
  y_prediction = []
  for i in range(n_windows + sequence_len):
    if i < sequence_len:
      window = y_pred[0:i+1]
    elif i > (n_windows + 1):
      window = y_pred[i-sequence_len:n_windows+1]
    else:
      window = y_pred[i-sequence_len+1:i+1]
    # Mode via np.unique: indexing scipy.stats.mode's result with [0][0]
    # depends on the SciPy version (the result shape changed). np.unique sorts
    # the values and argmax takes the first maximum, so ties resolve to the
    # smallest label exactly as scipy's mode does.
    classes, counts = np.unique(window, return_counts=True)
    y_prediction.append(classes[np.argmax(counts)])
  return y_prediction

## Downsampling to get prediction for each Y

In [None]:
def down_sample(y_time, x_time, y_prediction):
  """Pick one prediction for every requested time.

  For each value in `y_time` the left insertion position in `x_time` is looked
  up with `searchsorted` (the first X entry that is not smaller than the
  value). Positions past the end are clamped to the last X entry, and the
  prediction stored at that position is returned.

  Args:
    y_time: array-like of times to produce predictions for. Values are taken
      positionally, so a Series with a non-default index works as well.
    x_time: sorted Series/array of X times offering `searchsorted`.
    y_prediction: sequence with one prediction per entry of `x_time`.

  Returns:
    List with one prediction per entry of `y_time`.
  """
  # One vectorized lookup instead of a searchsorted call per timestamp.
  positions = np.asarray(x_time.searchsorted(np.asarray(y_time)))
  last = len(x_time) - 1
  return [y_prediction[min(int(pos), last)] for pos in positions]


## Saving Predictions to CSV

In [None]:
def save_to_csv(y_pred, name):
    """Write `y_pred` to the CSV file `name`, without index or header row."""
    pd.DataFrame(y_pred).to_csv(name, index=False, header=False)

### Loading Test data from path

In [None]:
# Test ids (same list as in the test-loading cell) and the template of the
# prediction file written for each of them; "{}" is replaced by the id.
user = ['009', '010', '011', '012']
prediction_path = "/content/drive/MyDrive/NN_Comp_Pred_C3/subject_{}_01__y.csv"

In [None]:
# Show the output path that results for the first test id.
prediction_path.format(user[0])

'/content/drive/MyDrive/NN_Comp_Pred_C3/subject_009_01__y.csv'

In [None]:
# Load the model used for the final predictions; this rebinds `model`.
# NOTE(review): no visible cell writes 'bi_lstm_v1.keras' — presumably it was
# saved in an earlier run; confirm the file exists in the working directory.
model = keras.models.load_model('bi_lstm_v1.keras')

## Final predictions for Test Data and saving to drive

In [None]:
# For every test id: window the sensor data, predict a class per window, expand
# the window predictions to one per sensor row, pick the prediction for each
# requested time, and write the result to that id's prediction file.
# Note: this loop rebinds the globals X_test, y_test_prob and y_test_pred, and
# the inspection cells that follow reuse the final value of `i`.
for i in range(len(TEST_DATA)):
  X_test = get_windowed_data_test(TEST_DATA[i], 80, 1)
  y_test_prob = model.predict(X_test, verbose=1)
  #y_test_prob.shape
  y_pred = get_class_labels(y_test_prob)
  # 80 must match the window length passed to get_windowed_data_test above
  y_matched = match_to_x_time(y_pred, 80)
  #print((y_matched))
  #print(len(TEST_DATA[i]))
  # Column 0 of TEST_TIME holds the times to predict for; column 6 of TEST_DATA
  # holds the sensor times that y_matched is aligned with.
  y_test_pred = down_sample(TEST_TIME[i][0], TEST_DATA[i][6], y_matched)
  save_to_csv(y_test_pred, prediction_path.format(user[i]))

## Loading prediction to verify output 

In [None]:
# save_to_csv writes the predictions without a header row, so read the file
# back with header=None; otherwise the first prediction is consumed as the
# column name and the frame comes out one row short.
df1 = pd.read_csv("/content/drive/MyDrive/NN_Comp_Pred_C3/subject_009_01__y.csv", header=None)
df1

In [None]:
# Column 6 (the sensor time column) of the last test frame processed; `i` is
# left over from the prediction loop.
TEST_DATA[i][6]

In [None]:
# Column 0 of the matching TEST_TIME frame: the times predictions were made for.
TEST_TIME[i][0]

In [None]:
# Insertion position of the last requested time within the sensor times; a
# value equal to len(TEST_DATA[i][6]) is the case down_sample clamps to the
# final row.
index = TEST_DATA[i][6].searchsorted(TEST_TIME[i][0][len(TEST_TIME[i][0])-1])
index

In [None]:
# The six feature columns of the first test frame, for inspection.
temp = TEST_DATA[0].iloc[:,:6]
temp