In [None]:
# Install the ipython_autotime package and load its `autotime` extension.
# NOTE(review): presumably this is what produces the "time: ..." line after
# each cell's output - confirm.
!pip install ipython_autotime
%load_ext autotime



In [None]:
# Select matplotlib's "nbagg" backend for the figures drawn in this notebook.
%matplotlib nbagg
 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce

from functools import partial
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.losses import CategoricalCrossentropy
from keras.layers import Embedding
from keras.utils import to_categorical
 
# dtype=str
 
# Read the training/validation workbook: it has no header row, so its single
# column is named 'Mem_Acc' and every cell is parsed as a base-16 integer.
mem_df = pd.read_excel(
    'memrefs_train_andor_validate.xlsx',
    header=None,
    names=['Mem_Acc'],
    converters={"Mem_Acc": partial(int, base=16)},
)

  import pandas.util.testing as tm


time: 10.3 s


In [None]:
# For every row, store how many rows of the trace share its 'Mem_Acc' value.
mem_df['Freq'] = mem_df['Mem_Acc'].groupby(mem_df['Mem_Acc']).transform('count')

time: 41.8 ms


In [None]:
# Plot the 0th, 5th, ..., 95th percentiles of the per-row frequency so that an
# elbow (candidate cutoff frequency) can be read off the curve.
sns.set(style='darkgrid')
plt.figure(figsize=(8,6))
percentile_ranks = list(range(0, 100, 5))
# One vectorised call computes every percentile; kept as a list like before.
percentiles = np.percentile(mem_df['Freq'], percentile_ranks).tolist()
# x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y,
# while the keyword form also works on older releases.
sns.lineplot(x=percentile_ranks, y=percentiles)
plt.xlabel('Percentile')
plt.ylabel('Maximum Frequency for given percentile')
plt.show()

<IPython.core.display.Javascript object>

time: 141 ms


We can observe an elbow at the $60^{th}$ percentile, so let's check its value and use it as a cutoff frequency.

In [None]:
# Frequency value at the 60th percentile (the elbow seen in the plot above).
print(np.percentile(mem_df['Freq'], q=60))

2283.0
time: 9.27 ms


Apply normalization to the memory references

In [None]:
# Scatter the first 100000 accesses against their row position in the trace.
n_points_shown = 100000
plt.figure(figsize=(10,6))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y,
# while the keyword form also works on older releases.
sns.scatterplot(
    x=mem_df.index[:n_points_shown],
    y=mem_df['Mem_Acc'].iloc[:n_points_shown],
)
plt.xlabel('Timestamp')
plt.ylabel('Memory Access')
plt.show()

<IPython.core.display.Javascript object>

time: 104 ms


We could also get a histogram or a kdeplot for our frequencies

In [None]:
# Histogram of the per-row frequency values (matplotlib's default binning).
frequencies = mem_df['Freq'].to_numpy(copy=True)
plt.figure(figsize=(10, 6))
plt.hist(x=frequencies)
plt.xlabel('Frequencies')
plt.ylabel('# of such frequencies')
plt.show()

<IPython.core.display.Javascript object>

time: 69 ms


Also, memory accesses above 0.85 will be considered "high memory", whilst all other accesses (seemingly those close to 0) will be "low memory".

So we can thus create a label feature.

In [None]:
# Delta of row i = (address at row i+1) - (address at row i); the last row has
# no successor and therefore gets NaN.
mem_df['Deltas'] = mem_df['Mem_Acc'].shift(-1) - mem_df['Mem_Acc']

time: 9.6 ms


In [None]:
# Preview the first five rows with the new 'Freq' and 'Deltas' columns.
mem_df.head(n=5)

Unnamed: 0,Mem_Acc,Freq,Deltas
0,3216124696,160,-4.0
1,3216124692,220,-4.0
2,3216124688,224,-4.0
3,3216124684,230,12.0
4,3216124696,160,-4.0


time: 32.8 ms


In [None]:
# Number of distinct delta values (the trailing NaN counts as one value).
len(mem_df['Deltas'].unique())

30472

time: 109 ms


So, 30472 unique deltas. Now we keep only those that occur at least 100 times.

In [None]:
# Occurrence count of every delta, most frequent first.
input_vocab = mem_df['Deltas'].value_counts()
input_vocab_df = pd.DataFrame(
    data={
        'delta_val': input_vocab.index,
        'delta_count': input_vocab,
    }
)
# Display only: the renumbered copy is shown but not assigned, so
# input_vocab_df itself keeps input_vocab's index (the delta values).
input_vocab_df.reset_index(drop=True)

Unnamed: 0,delta_val,delta_count
0,4.000000e+00,62694
1,-4.000000e+00,55455
2,0.000000e+00,31370
3,-8.000000e+00,13008
4,1.200000e+01,7041
...,...,...
30466,3.079551e+09,1
30467,-5.178400e+04,1
30468,-4.265600e+04,1
30469,3.079610e+09,1


time: 118 ms


Now we prune all entries with delta_count $\lt$ 100

In [None]:
# Keep only the deltas that occur at least 100 times.
is_frequent = input_vocab_df['delta_count'] >= 100
input_vocab_df = input_vocab_df[is_frequent]
# Display only: the renumbered copy is not assigned back.
input_vocab_df.reset_index(drop=True)

Unnamed: 0,delta_val,delta_count
0,4.0,62694
1,-4.0,55455
2,0.0,31370
3,-8.0,13008
4,12.0,7041
...,...,...
293,1263471.0,113
294,-64.0,110
295,368.0,105
296,280.0,105


time: 24.8 ms


In [None]:
# One-column frame of the deltas with the NaN row(s) removed.
deltas_df_unprocessed = mem_df[['Deltas']].dropna()
deltas_df_unprocessed.head(n=20)

Unnamed: 0,Deltas
0,-4.0
1,-4.0
2,-4.0
3,12.0
4,-4.0
5,-4.0
6,-4.0
7,12.0
8,-4.0
9,-4.0


time: 25.7 ms


In [None]:
# Keep deltas that belong to the pruned vocabulary; every other row becomes NaN.
deltas_df_processed = deltas_df_unprocessed.where(
    deltas_df_unprocessed['Deltas'].isin(input_vocab_df['delta_val'])
)

time: 93.9 ms


In [None]:
# Drop the out-of-vocabulary (NaN) rows and renumber the remaining rows from 0.
deltas_final = deltas_df_processed.dropna().reset_index(drop=True)

time: 20.3 ms


In [None]:
# Positional 80/20 split: the first 80% of rows train, the remainder validates.
n_deltas = len(deltas_final.index)
train_size = int(0.8 * n_deltas)
# Take the validation size as the remainder. int(0.8*n) + int(0.2*n) can be
# smaller than n, which would make valid_size disagree with len(valid_dataset).
valid_size = n_deltas - train_size

train_dataset = deltas_final.iloc[:train_size]
valid_dataset = deltas_final.iloc[train_size:]

time: 6.32 ms


In [None]:
def _windows_from_deltas(deltas, n_rows, delta_to_code, period=998):
    """Cut the first ``n_rows`` deltas into (encoded window, raw target) pairs.

    Every row whose position is a positive multiple of ``period`` is a target.
    The rows since the previous target (or since row 0 for the first window)
    form its input window, so the first window holds ``period`` values and
    each later one ``period - 1``. Rows after the last target are discarded.

    Args:
        deltas: DataFrame whose first column holds the delta values.
        n_rows: number of leading rows of ``deltas`` to consume.
        delta_to_code: mapping from a raw delta to its ordinal code; a value
            missing from the mapping is kept unchanged.
        period: spacing between consecutive target rows.

    Returns:
        ``(X, y)``: the list of encoded input windows and the list of the
        corresponding targets, which are left as raw (un-encoded) deltas.
    """
    values = deltas.iloc[:n_rows, 0].to_numpy()
    # One dictionary lookup per value. A raw delta can never be re-encoded
    # because the lookup always starts from the original value.
    codes = [delta_to_code.get(value, value) for value in values]
    X, y = [], []
    for target_pos in range(period, len(values), period):
        window_start = 0 if target_pos == period else target_pos - period + 1
        X.append(codes[window_start:target_pos])
        # The target is read at the window boundary itself.
        y.append(values[target_pos])
    return X, y


# First encode vocab: each retained delta gets an ordinal code.
ce_ord = ce.OrdinalEncoder(verbose=1, cols=['delta_val'])
encoded_codes = ce_ord.fit_transform(input_vocab_df['delta_val'])
# 'orig_val' is the raw delta, 'delta_val' its ordinal code. The frame is built
# from the column values so it does not depend on how the index is named.
vocab_encoded = pd.DataFrame({
    'orig_val': input_vocab_df['delta_val'].to_numpy(),
    'delta_val': encoded_codes['delta_val'].to_numpy(),
})
delta_to_code = dict(zip(vocab_encoded['orig_val'], vocab_encoded['delta_val']))

# Inputs are encoded here; the targets stay raw and are encoded in a later cell.
X_train, y_train = _windows_from_deltas(train_dataset, train_size, delta_to_code)
X_val, y_val = _windows_from_deltas(valid_dataset, valid_size, delta_to_code)


time: 16min 56s


In [None]:
# SEQUENCE PADDING SECTION
# Each split's windows are padded (pad_sequences defaults) into one 2-D array.
X_train, X_val = pad_sequences(X_train), pad_sequences(X_val)


time: 72.1 ms


In [None]:
# ENCODING THE TARGET VECTORS
# Map every raw target delta to its ordinal code with a single dictionary
# lookup. The lookup always starts from the raw value, so a target can never
# be encoded twice within one run, and there is no vocabulary-by-targets scan.
# A target missing from the vocabulary is left unchanged.
# NOTE: run this cell once per window-building run; re-running it would treat
# the codes as raw deltas.
delta_to_code = dict(zip(vocab_encoded['orig_val'], vocab_encoded['delta_val']))

y_train = [delta_to_code.get(delta, delta) for delta in y_train]
y_val = [delta_to_code.get(delta, delta) for delta in y_val]

time: 1.05 s


In [None]:
# RESHAPING THE TARGET VECTORS AND SCALING FEATURES/TARGETS

# Targets become float32 column vectors of shape (n_samples, 1).
y_train = np.array(y_train, dtype='float32').reshape(-1,1)
y_val = np.array(y_val, dtype='float32').reshape(-1,1)
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

from sklearn.preprocessing import MinMaxScaler

# Features and targets get separate scalers, each fitted on the training split
# only. The validation split is transformed with the training statistics
# instead of being rescaled by its own min/max.
feature_scaler = MinMaxScaler()
X_train_scaled = feature_scaler.fit_transform(X_train)
X_val_scaled = feature_scaler.transform(X_val)

# The target scaler keeps the name `scaler` because later cells use that name.
scaler = MinMaxScaler()
y_train_scaled = scaler.fit_transform(y_train)
y_val_scaled = scaler.transform(y_val)

(263, 998) (263, 1)
(65, 998) (65, 1)
time: 30.8 ms


In [None]:
# MODEL STARTS HERE
# Hyper-parameters. `embedding_size` is used below as the LSTM's unit count;
# the Embedding layer itself has output_dim=1.
embedding_size = 128
training_epochs = 4
training_batch_size = 32

# Embedding -> LSTM -> single-unit Dense head.
# NOTE(review): the Embedding layer is fed X_train_scaled (MinMax-scaled
# values) rather than the integer codes in X_train - confirm this is intended.
model = Sequential([
    Embedding(len(vocab_encoded.index)+1, output_dim=1, input_length=X_train.shape[1]),
    LSTM(embedding_size, recurrent_dropout=0.1),
    Dense(1),
])
# NOTE(review): 'accuracy' is tracked although the loss is MAE on a scaled
# numeric target - confirm the metric is meaningful here.
model.compile(loss='mae', optimizer='adam', metrics=['accuracy'])
model.fit(
    X_train_scaled,
    y_train_scaled,
    epochs=training_epochs,
    batch_size=training_batch_size,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7fadb3a00ef0>

time: 1min 18s


In [None]:
# Show the architecture, then evaluate on the scaled validation split.
print(model.summary())
scores = model.evaluate(x=X_val_scaled, y=y_val_scaled)
# scores[1] is the first compiled metric; report it as a percentage.
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 998, 1)            299       
_________________________________________________________________
lstm (LSTM)                  (None, 128)               66560     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 66,988
Trainable params: 66,988
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 100.00%
time: 956 ms


In [None]:
# Read the test workbook the same way as the training one: no header row, a
# single 'Mem_Acc' column, every cell parsed as a base-16 integer.
test_df = pd.read_excel(
    'memrefs_testing_Sept.xlsx',
    header=None,
    names=['Mem_Acc'],
    converters={'Mem_Acc': partial(int, base=16)},
)

time: 1.33 s


In [None]:
# Delta of row i = (address at row i+1) - (address at row i); last row is NaN.
test_df['Deltas'] = test_df['Mem_Acc'].shift(-1) - test_df['Mem_Acc']
# One-column frame of deltas without the NaN row(s).
test_df_unprocessed = test_df[['Deltas']].dropna()
# Deltas outside the training vocabulary become NaN ...
test_df_processed = test_df_unprocessed.where(
    test_df_unprocessed['Deltas'].isin(input_vocab_df['delta_val'])
)
# ... and are then dropped, with the remaining rows renumbered from 0.
test_final = test_df_processed.dropna().reset_index(drop=True)

time: 36 ms


In [None]:
# Build the encoded test windows and their encoded targets.
# Every row whose position is a positive multiple of 998 is a target; the rows
# since the previous target (or since row 0 for the first window) are its
# input window. Rows after the last target are discarded.
X_test = []
y_test = []

# Raw delta -> ordinal code. A dictionary lookup always starts from the raw
# value, so nothing is encoded twice, and it avoids scanning the whole
# vocabulary for every element.
delta_to_code = dict(zip(vocab_encoded['orig_val'], vocab_encoded['delta_val']))

test_values = test_final.iloc[:, 0].to_numpy()
test_codes = [delta_to_code.get(value, value) for value in test_values]

for target_pos in range(998, len(test_values), 998):
    window_start = 0 if target_pos == 998 else target_pos - 997
    X_test.append(test_codes[window_start:target_pos])
    # The target is read at the window boundary itself and stored encoded.
    y_test.append(test_codes[target_pos])

time: 2min 8s


In [None]:
# Pad the test windows to the same width as the training inputs and turn the
# targets into a float32 column vector.
X_test = pad_sequences(X_test, maxlen=X_train.shape[1])
y_test = np.array(y_test, dtype='float32').reshape(-1, 1)
print(X_test.shape)

# Scale the test features with statistics learned from the training features
# instead of refitting on the test set. The array stays 2-D
# (n_samples, n_steps), the same layout the model was fitted on.
X_test_scaled = MinMaxScaler().fit(X_train).transform(X_test).astype('float32')

# `scaler` is used afterwards to inverse-transform the predictions, so it is
# fitted on the training targets rather than on the test targets.
scaler = MinMaxScaler().fit(y_train)
y_test_scaled = scaler.transform(y_test)


(41, 998)
time: 23.7 ms


In [None]:
# Predict the (scaled) target for every test window.
predictions = model.predict(x=X_test_scaled)

[[1.        ]
 [0.02380952]
 [0.36444443]
 [0.24892704]
 [0.24892704]
 [0.19727892]
 [0.19727892]
 [0.20422535]
 [0.21481481]
 [0.01792115]]
time: 726 ms


In [None]:
# Undo the scaling using whatever `scaler` was most recently fitted on.
predictions_orig = scaler.inverse_transform(X=predictions)

[[23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000168]
 [23.000164]
 [23.000168]
 [23.000166]
 [23.000168]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000164]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000181]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000166]
 [23.000168]
 [23.000166]]
time: 5.6 ms


In [None]:
# Translate each predicted code back to a raw delta.
predictions_orig = np.round(predictions_orig, 2)

vocab_codes = vocab_encoded['delta_val'].to_numpy()
vocab_deltas = vocab_encoded['orig_val'].to_numpy()

# Pick the vocabulary entry whose code is nearest to each prediction. An exact
# float == comparison would silently drop any prediction that is not almost
# exactly an integer code and leave fewer outputs than inputs.
code_distance = np.abs(predictions_orig[:, [0]] - vocab_codes[None, :])
nearest_row = code_distance.argmin(axis=1)
delta_predictions = vocab_deltas[nearest_row].tolist()

print(delta_predictions)

[-3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0, -3079682104.0]
time: 172 ms
