In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [2]:
import os
import pandas as pd
import numpy as np
import sqlite3
from collections import defaultdict
from sklearn.model_selection import train_test_split


def join_tables(tables):
    '''
    Helper function to join tables across different .db files

    tables: sequence of DataFrames, one per source file.

    Returns a single DataFrame with the rows of every table stacked in the
    order given and a fresh 0..n-1 index. An empty sequence yields an empty
    DataFrame.
    '''
    if len(tables) == 0:
        return pd.DataFrame()

    # One concat over the whole list; concatenating inside a loop re-copies
    # the accumulated rows on every iteration.
    return pd.concat(tables).reset_index(drop=True)


def get_all_databases(folder):
    '''
    Joins all files in a data folder and returns relevant dataframes

    Every regular file directly inside `folder` is opened as a SQLite database
    and its COUNTERS_STRING_TIME_DATA table is read. Files are visited in
    sorted name order so the row order of the result is the same on every run
    (os.listdir gives no ordering guarantee). Sub-directories are skipped.

    Returns one DataFrame holding the rows of all files with a fresh index and
    with the 'VALUE' column lower-cased.

    Raises FileNotFoundError when the folder contains no files to read.
    '''
    string_tables = []

    for file in sorted(os.listdir(folder)):
        path = os.path.join(folder, file)
        if not os.path.isfile(path):
            continue

        con = sqlite3.connect(path)
        try:
            string_tables.append(pd.read_sql_query('SELECT * FROM COUNTERS_STRING_TIME_DATA', con))
        finally:
            # release the file handle even when the query fails
            con.close()

    if not string_tables:
        raise FileNotFoundError('no database files found in {!r}'.format(folder))

    # concatenating tables into single dataframe
    string_df = join_tables(string_tables)
    string_df['VALUE'] = string_df['VALUE'].str.lower()

    return string_df

In [3]:
# Folder with the collected SQLite files; the path is resolved relative to the
# notebook's working directory.
data_folder = 'OneDrive_2022-03-01/Semester #2 -- Intel collected data/'
string_df = get_all_databases(data_folder)

In [4]:
# Keep only the rows recorded for input 1 and give them a fresh 0..n-1 index.
df_0 = (
    string_df[string_df['ID_INPUT'] == 1]
    .drop(['ID_INPUT', 'PRIVATE_DATA'], axis=1)
    .reset_index(drop=True)
)

# converting 'MEASUREMENT_TIME' column to datetime
measurement_time = pd.to_datetime(df_0['MEASUREMENT_TIME'])

# using converted datetime column to get usage per application ('TIME_DELTA'):
# minutes between each row and the row that follows it (NaN on the last row).
# NOTE(review): string_df stacks several source files, so a delta can span the
# boundary between two files -- confirm that is acceptable.
time_delta = (measurement_time.shift(-1) - measurement_time).dt.total_seconds() / 60

# getting usage per future application ('TIME_DELTA_1')
time_delta_1 = time_delta.shift(-1)

# adding 'TIME_DELTA' and 'TIME_DELTA_1' to DataFrame, dropping the last two
# rows (they have no complete pair of deltas), and capping outliers (large
# numbers) at 60 minutes
df_0 = (
    df_0.assign(TIME_DELTA=time_delta, TIME_DELTA_1=time_delta_1)
    .iloc[:-2]
    .drop(['MEASUREMENT_TIME', 'VALUE'], axis=1)
    .clip(upper=60)
)
df_0.head()

Unnamed: 0,TIME_DELTA,TIME_DELTA_1
0,0.541967,0.099883
1,0.099883,0.008133
2,0.008133,0.041767
3,0.041767,0.0167
4,0.0167,0.06665


In [5]:
# df_0 was built from the ID_INPUT == 1 rows of string_df with a reset index,
# so the timestamps must be filtered and re-indexed the same way; assigning
# straight from string_df would pair rows by index label, not by record.
input_1_time = string_df.loc[string_df['ID_INPUT'] == 1, 'MEASUREMENT_TIME'].reset_index(drop=True)

df_0['MEASUREMENT_TIME_HR'] = pd.to_datetime(input_1_time).dt.hour
df_0['MEASUREMENT_TIME'] = input_1_time

In [6]:
# Take VALUE from the same ID_INPUT == 1 rows (re-indexed 0..n-1) that df_0 was
# built from, so each label lines up with its own time delta.
df_0['VALUE'] = string_df.loc[string_df['ID_INPUT'] == 1, 'VALUE'].reset_index(drop=True)

In [7]:
# 80/20 split of the numeric features against the next time delta.
# shuffle=False keeps the rows in their existing order.
X_train, X_test, y_train, y_test = train_test_split(
    df_0[['TIME_DELTA', 'MEASUREMENT_TIME_HR']],
    df_0['TIME_DELTA_1'],
    test_size=0.2,
    shuffle=False,
)

In [8]:
# One-hot encode the application label and split it with the same settings as
# the numeric features so the row indexes match. dtype is pinned because newer
# pandas releases default get_dummies to bool, which would turn the combined
# feature matrix into an object array. null1 / null2 are unused target halves.
X_train1, X_test1, null1, null2 = train_test_split(
    pd.get_dummies(df_0['VALUE'], dtype='uint8'),
    df_0['TIME_DELTA_1'],
    test_size=0.2,
    shuffle=False,
)

In [9]:
# Attach the one-hot columns to the numeric features. Selecting the two base
# columns first keeps this cell re-runnable: joining onto a frame that already
# carries the one-hot columns would fail on the overlapping names.
base_columns = ['TIME_DELTA', 'MEASUREMENT_TIME_HR']
X_train = X_train[base_columns].join(X_train1)
X_test = X_test[base_columns].join(X_test1)

In [11]:
# Show the training features (numeric columns plus the one-hot columns).
X_train

Unnamed: 0,TIME_DELTA,MEASUREMENT_TIME_HR,besclientui.exe::wc1,chrome.exe::fc0,chrome.exe::wc1,cleanmgr.exe::wc1,cmd.exe::wc1,db browser for sqlcipher.exe::wc1,db browser for sqlite.exe::wc1,desktop,...,teams.exe::wc1,vpndownloader.exe::wc1,vpnui.exe::wc1,werfault.exe::wc1,winword.exe::fc0,winword.exe::wc1,wlrmdr.exe::fc0,workpace.exe::fc0,zoom.exe::fc0,zoom.exe::wc1
0,0.541967,17,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.099883,17,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.008133,17,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.041767,17,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.016700,17,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5233,0.016700,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5234,0.008350,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5235,0.241833,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5236,0.049767,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Seed TensorFlow's global random generator so that weight initialisation (and
# with it the training run) is repeatable between executions of the notebook.
tf.random.set_seed(42)

# LSTM that returns its output for every timestep, followed by a single ReLU
# unit that produces the predicted value; trained on mean absolute error.
model = keras.Sequential([
    layers.LSTM(256, return_sequences=True),
    layers.Dense(units=1, activation='relu'),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanAbsoluteError(),
)

In [35]:
# Each row becomes a one-step sequence: (samples, 1, features). The feature
# count is read from the frame so the cell keeps working when the set of
# one-hot columns changes, and the explicit float32 cast guarantees a numeric
# array even when the columns have mixed dtypes.
n_features = X_train.shape[1]
train_sequences = X_train.to_numpy(dtype='float32').reshape(-1, 1, n_features)
test_sequences = X_test.to_numpy(dtype='float32').reshape(-1, 1, n_features)

model.fit(
    x=train_sequences,
    y=y_train,
    epochs=100,
    validation_data=(test_sequences, y_test),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x27ca19c9340>

# Logistic regression: 86% accuracy (measured on the same data the model was fit on)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier with scikit-learn's default settings.
lr = LogisticRegression()

In [None]:
# One-hot view of the labels as a NumPy array; the result is only displayed,
# it is not stored.
# NOTE(review): `data` is not defined in the visible cells -- confirm where it
# is created before relying on this section.
pd.get_dummies(data['VALUE']).to_numpy()

In [None]:
# Fit the classifier: hour of day and time delta as inputs, the VALUE label as
# the class to predict.
lr.fit(
    data[['MEASUREMENT_TIME_HOUR_ONLY', 'TIME_DELTA']],
    data['VALUE'],
)

In [None]:
# Share of rows whose predicted label equals the actual label. The comparison
# is done on plain arrays: comparing two Series requires identical indexes,
# and a fresh Series of predictions always starts at 0.
# NOTE(review): this scores the rows the model was fit on.
(lr.predict(data[['MEASUREMENT_TIME_HOUR_ONLY', 'TIME_DELTA']]) == data['VALUE'].to_numpy()).mean()

In [None]:
# Predicted label for every row of `data`, stored as a Series with a default
# 0..n-1 index.
pred = pd.Series(lr.predict(data[['MEASUREMENT_TIME_HOUR_ONLY','TIME_DELTA']]))

In [None]:
# Count the positions whose predicted label appears among the 10 actual labels
# starting at that same position.
labels = data['VALUE']
i = sum(
    1
    for x in range(len(labels) - 10)
    if pred[x] in labels[x:x + 10].values
)

In [None]:
# Hit rate over the windows that were actually examined: the counting loop
# stops 10 rows before the end, so those rows must not be in the denominator.
n_windows = len(data['VALUE']) - 10
i / n_windows if n_windows > 0 else float('nan')