In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:  # the colon was missing here, which made the whole cell a SyntaxError
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow
import keras
import time
from sklearn.preprocessing import MinMaxScaler
from keras.utils import np_utils
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore")

In [2]:
## Reproducibility: seed the global NumPy and TensorFlow random generators
## and define a named random-state constant.
RS = 42
tensorflow.random.set_seed(1)
np.random.seed(1)

In [3]:
## Read the original IoT device dataset from the Kaggle input folder.
## Dataset page: https://www.kaggle.com/datasets/fanbyprinciple/iot-device-identification?select=iot_device_train.csv
original_dataset = pd.read_csv(filepath_or_buffer='../input/iot-device/iot_device_data.csv')
original_dataset

In [4]:
## Dimensions of the loaded dataset: (number of records, number of columns).
## The column count includes the 'device_category' label column.
original_dataset.shape

In [5]:
## Summary statistics for every column (include='all' covers all column types, not only numeric ones).
## Author's observation: the features are on different scales, so normalization is needed.
original_dataset.describe(include='all')

In [6]:
## Names of all columns available in the dataset
original_dataset.columns.tolist()

In [7]:
## Number of records per IoT device category
original_dataset['device_category'].value_counts()

In [8]:
## Sorted array of the distinct device category names
np.unique(original_dataset['device_category'].to_numpy())

In [9]:
## Bar chart of the number of records in each device category
plt.figure(figsize=(18, 8))
sns.countplot(data=original_dataset, x='device_category')
plt.title('Category Distributions', fontsize=14)
plt.show()

In [10]:
## Encode each device category name as an integer class id (0-9).
category_to_id = {
    'TV': 0,
    'baby_monitor': 1,
    'smoke_detector': 2,
    'socket': 3,
    'watch': 4,
    'water_sensor': 5,
    'lights': 6,
    'thermostat': 7,
    'motion_sensor': 8,
    'security_camera': 9,
}
IoT_device_dataset = original_dataset.replace({'device_category': category_to_id})

## Shuffle the rows and rebuild a clean 0..n-1 index.
## random_state=RS pins the permutation, so re-running this cell always gives the same
## order instead of depending on how much of the global NumPy RNG has been consumed.
shuffled_iot_device_dataset = IoT_device_dataset.sample(frac=1, random_state=RS).reset_index(drop=True)
shuffled_iot_device_dataset

In [11]:
## Separate the feature columns from the output category labels.
## output_labels is kept as a one-column DataFrame.
output_labels = shuffled_iot_device_dataset[['device_category']]
data = shuffled_iot_device_dataset.drop(columns=['device_category'])

In [12]:
# from imblearn.over_sampling import SMOTE

# balancer = SMOTE(random_state=42, sampling_strategy='minority')
# oversampled_data, oversampled_labels = balancer.fit_resample(data, output_labels)

In [13]:
# oversampled_full_dataset = pd.concat([oversampled_data, oversampled_labels], axis = 1)
# oversampled_full_dataset

In [15]:
# plt.figure(figsize=(18,8))
# sns.countplot(x='device_category', data=oversampled_full_dataset)
# plt.title('Category Distributions', fontsize=14)
# plt.show()

In [16]:
# ## Splitting Dataset into Train and Test
# train_data, test_data, train_labels, test_labels = train_test_split(oversampled_data, oversampled_labels, test_size= 0.2, random_state=42)

In [14]:
## Split the dataset into train (80%) and test (20%) parts.
## Uses the notebook-wide RS constant (42) instead of repeating the literal, so the seed is configured in one place.
train_data, test_data, train_labels, test_labels = train_test_split(
    data, output_labels, test_size=0.2, random_state=RS
)

In [15]:
## Shapes of the training features and training labels (one per line)
print(train_data.shape, train_labels.shape, sep='\n')

In [16]:
## Shapes of the testing features and testing labels (one per line)
print(test_data.shape, test_labels.shape, sep='\n')

## Data Preprocessing

### Normalisation

In [17]:
## Min-max normalisation: fit the scaler on the training split only, then scale that split.
scaler = MinMaxScaler()
scaler.fit(train_data)
scaled_train_data = scaler.transform(train_data)

## Wrap the scaled array back into a DataFrame with the original feature names.
normalized_train_data = pd.DataFrame(data=scaled_train_data, columns=train_data.columns)
normalized_train_data

In [18]:
## Scale the test split with the scaler already fitted on the training split (no refit).
scaled_test_data = scaler.transform(test_data)
normalized_test_data = pd.DataFrame(data=scaled_test_data, columns=test_data.columns)
normalized_test_data

### Missing Value Check

In [19]:
## Train data: tally the per-column 'count' statistic from describe().
## A single distinct value means every column has the same number of non-null records.
## (Author's note: no missing values, as all 297 columns have 1520 records.)
normalized_train_data.describe().loc['count'].value_counts()

In [20]:
## Test data: tally the per-column 'count' statistic from describe().
## A single distinct value means every column has the same number of non-null records.
## (Author's note: no missing values, as all 297 columns have 380 records.)
normalized_test_data.describe().loc['count'].value_counts()

### Check for NaN's

In [21]:
## Distinct per-column NaN counts in the train data; a result of {0} means no NaNs anywhere.
## (Author's note: no NaNs are present in the data.)
set(normalized_train_data.isna().sum())

In [22]:
## Distinct per-column NaN counts in the test data; a result of {0} means no NaNs anywhere.
## (Author's note: no NaNs are present in the data.)
set(normalized_test_data.isna().sum())

## Dimensionality Reduction using PCA

In [29]:
## PCA on the normalised train data, keeping 100 principal components.
## random_state=RS pins any randomised SVD solver scikit-learn may select, so the
## projection does not depend on the current state of the global NumPy RNG.
pca = PCA(n_components=100, random_state=RS)
pca_train_data = pca.fit_transform(normalized_train_data)
pca_train_data_df = pd.DataFrame(data = pca_train_data)
pca_train_data_df

In [31]:
## Cumulative explained variance against the number of principal components kept.
## The ratios are non-negative, so their cumulative sum is already non-decreasing and needs no sorting.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
## 1-based x values: the k-th point is the variance explained by the first k components.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize= (10,8))
plt.plot(component_counts, cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');
plt.title('PCA PLOT : Number of Principal Components v/s Cummulative Explained Variance');

In [51]:
## Project the normalised test data onto the principal components fitted on the train data (no refit).
## (pca.explained_variance_ratio_ holds the explained variance ratio of each component, if needed.)
pca_test_data = pca.transform(normalized_test_data)
pca_test_data_df = pd.DataFrame(pca_test_data)
pca_test_data_df

### Binarizing Labels

In [52]:
## One-hot encode the integer class ids (10 device categories) for training and testing.
## keras.utils.to_categorical is the public API; the np_utils module it replaces here is
## deprecated and absent from newer Keras releases.
binarized_train_labels = keras.utils.to_categorical(train_labels, num_classes = 10)
binarized_test_labels = keras.utils.to_categorical(test_labels, num_classes = 10)

## Only the last expression of a cell is displayed, so the earlier mid-cell bare expression was dropped.
binarized_test_labels

## Model 1: NN Model without PCA

In [53]:
## Build Model 1: fully connected classifier on the normalised features (no PCA).
## The input width is read from the training frame instead of hard-coding 297,
## so the model stays in sync with the data it is fitted on.
model = keras.Sequential()
model.add(Dense(512, activation='relu', input_shape=(normalized_train_data.shape[1],)))
model.add(Dropout(rate = 0.1))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dropout(rate = 0.1))
model.add(Dense(64, activation='relu'))
model.add(Dense(10, activation='softmax')) ## Number of Neurons in Last layer == Number of Categories present in Dataset
model.summary()

In [54]:
## Compile Model 1: Adam optimizer, categorical cross-entropy loss, accuracy metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [55]:
## Train Model 1 on the normalised features and record the wall-clock training time.
## validation_split=0.2 holds out 20% of the training data for validation.
start_time_1 = time.time()
model_hist = model.fit(
    x=normalized_train_data,
    y=binarized_train_labels,
    batch_size=8,
    epochs=50,
    validation_split=0.2,
    verbose=2,
)
computational_time_1 = time.time() - start_time_1

In [56]:
## Validation accuracy and loss per epoch for Model 1
val_acc_list_1 = model_hist.history['val_accuracy']
val_loss_list_1 = model_hist.history['val_loss']
## One epoch number per recorded value (1..N). The former range(0, 51) produced 51
## entries for 50 trained epochs, so it could not be plotted against the lists above.
epochs_1 = list(range(1, len(val_acc_list_1) + 1))

In [57]:
## Model 1 softmax outputs for the test set (one row per sample, one column per class)
pred_labels_1 = model.predict(x=normalized_test_data)
pred_labels_1

In [58]:
## Predicted class id per test sample = index of the largest softmax output in its row
pred_labels_list_1 = [np.argmax(class_probs) for class_probs in pred_labels_1]

In [59]:
## Predicted class ids of Model 1 as a one-column DataFrame
pred_labels_df_1 = pd.DataFrame({'pred_label_1': pred_labels_list_1})
pred_labels_df_1

In [60]:
## Confusion matrix for Model 1 (rows: true class ids, columns: predicted class ids).
## labels=0..9 forces a 10x10 matrix even if some class is missing from this test split,
## so it always lines up with the ten class names used as heatmap tick labels.
model_1_cm = confusion_matrix(test_labels, pred_labels_df_1, labels=list(range(10)))
model_1_cm

In [61]:
## Heatmap of the Model 1 confusion matrix with device category names on both axes.
## class_names is ordered by the integer class id (0-9) assigned to each category.
class_names = [
    'TV', 'baby_monitor', 'smoke_detector', 'socket', 'watch',
    'water_sensor', 'lights', 'thermostat', 'motion_sensor', 'security_camera',
]

plt.figure(figsize=(10, 5))
c1 = sns.heatmap(
    model_1_cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names
).set_title("Confusion Matrix (NN Model without PCA)")


In [62]:
## Classification report (precision / recall / f1 per category) and training time for Model 1.
## labels is passed explicitly so target_names always matches the ten class ids,
## even if a category is absent from both the test labels and the predictions.
print(classification_report(test_labels, pred_labels_df_1,
                            labels=list(range(len(class_names))),
                            target_names=class_names))
print(f"Computational Time by Model 1: {computational_time_1} seconds")

In [63]:
## Score Model 1 on the test set; scores[0] is reported as the loss and scores[1] as the accuracy.
scores = model.evaluate(x=normalized_test_data, y=binarized_test_labels)
print(f'Test Accuracy: {scores[1]*100} % && Test Loss: {scores[0]}')

## Model 2: NN Model with PCA

In [75]:
## Build Model 2: fully connected classifier on the PCA-extracted features.
## The input width is read from the PCA training frame instead of hard-coding 100,
## so it follows the number of components chosen for PCA.
model2 = keras.Sequential()
model2.add(Dense(512, activation='relu', input_shape=(pca_train_data_df.shape[1],))) ## input shape is same as PCA extracted features
model2.add(Dropout(rate = 0.25))
model2.add(Dense(256, activation='relu'))
model2.add(Dense(128, activation='relu'))
model2.add(Dropout(rate = 0.25))
model2.add(Dense(64, activation='relu'))
model2.add(Dense(10, activation='softmax')) ## Number of Neurons in Last layer == Number of Categories present in Dataset
model2.summary()

In [76]:
## Compile Model 2: Adam optimizer, categorical cross-entropy loss, accuracy metric
model2.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [77]:
## Train Model 2 on the PCA features and record the wall-clock training time.
## validation_split=0.2 holds out 20% of the training data for validation.
start_time_2 = time.time()
model2_hist = model2.fit(
    x=pca_train_data_df,
    y=binarized_train_labels,
    batch_size=8,
    epochs=50,
    validation_split=0.2,
    verbose=2,
)
computational_time_2 = time.time() - start_time_2

In [78]:
## Validation accuracy and loss per epoch for Model 2
val_acc2_list = model2_hist.history['val_accuracy']
val_loss2_list = model2_hist.history['val_loss']
## One epoch number per recorded value (1..N). The former range(0, 51) produced 51
## entries for 50 trained epochs, so it could not be plotted against the lists above.
epochs = list(range(1, len(val_acc2_list) + 1))

In [79]:
## Model 2 softmax outputs for the PCA-transformed test set (one row per sample, one column per class)
pred_labels_2 = model2.predict(x=pca_test_data_df)
pred_labels_2

In [80]:
## Predicted class id per test sample = index of the largest softmax output in its row
pred_labels_list_2 = [np.argmax(class_probs) for class_probs in pred_labels_2]

In [81]:
## Predicted class ids of Model 2 as a one-column DataFrame
pred2_labels_df = pd.DataFrame({'pred_label_2': pred_labels_list_2})

In [82]:
## Confusion matrix for Model 2 (rows: true class ids, columns: predicted class ids).
## labels=0..9 forces a 10x10 matrix even if some class is missing from this test split,
## so it always lines up with the ten class names used as heatmap tick labels.
model_2_cm = confusion_matrix(test_labels, pred2_labels_df, labels=list(range(10)))
model_2_cm

In [83]:
## Heatmap of the Model 2 confusion matrix with device category names on both axes.
## class_names is ordered by the integer class id (0-9) assigned to each category.
class_names = [
    'TV', 'baby_monitor', 'smoke_detector', 'socket', 'watch',
    'water_sensor', 'lights', 'thermostat', 'motion_sensor', 'security_camera',
]

plt.figure(figsize=(10, 5))
s2 = sns.heatmap(
    model_2_cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names
).set_title("Confusion Matrix (NN Model with PCA)")

In [84]:
## Classification report (precision / recall / f1 per category) and training time for Model 2.
## labels is passed explicitly so target_names always matches the ten class ids,
## even if a category is absent from both the test labels and the predictions.
print(classification_report(test_labels, pred2_labels_df,
                            labels=list(range(len(class_names))),
                            target_names=class_names))
print(f"Computational Time by Model 2: {computational_time_2} seconds")

In [85]:
## Score Model 2 on the PCA test set; scores2[0] is reported as the loss and scores2[1] as the accuracy.
scores2 = model2.evaluate(x=pca_test_data_df, y=binarized_test_labels)
print(f'Test Accuracy: {scores2[1]*100} % && Test Loss: {scores2[0]}')

## Model 3: NN Model with SGD Optimizer

In [86]:
## Build Model 3: fully connected classifier on the PCA-extracted features (compiled with SGD in the next cell).
## The input width is read from the PCA training frame instead of hard-coding 100,
## so it follows the number of components chosen for PCA.
model3 = keras.Sequential()
model3.add(Dense(512, activation='relu', input_shape=(pca_train_data_df.shape[1],))) ## input shape is same as PCA extracted features
model3.add(Dropout(rate = 0.25))
model3.add(Dense(256, activation='relu'))
model3.add(Dense(128, activation='relu'))
model3.add(Dropout(rate = 0.25))
model3.add(Dense(64, activation='relu'))
model3.add(Dense(10, activation='softmax')) ## Number of Neurons in Last layer == Number of Categories present in Dataset
model3.summary()

In [87]:
## Compile Model 3: SGD optimizer, categorical cross-entropy loss, accuracy metric
model3.compile(
    loss='categorical_crossentropy',
    optimizer='SGD',
    metrics=['accuracy'],
)

In [88]:
## Train Model 3 on the PCA features and record the wall-clock training time.
## validation_split=0.2 holds out 20% of the training data for validation.
start_time_3 = time.time()
model3_hist = model3.fit(
    x=pca_train_data_df,
    y=binarized_train_labels,
    batch_size=8,
    epochs=50,
    validation_split=0.2,
    verbose=2,
)
computational_time_3 = time.time() - start_time_3

In [89]:
## Validation accuracy and loss per epoch for Model 3
val_acc3_list = model3_hist.history['val_accuracy']
val_loss3_list = model3_hist.history['val_loss']
## One epoch number per recorded value (1..N). The former range(0, 51) produced 51
## entries for 50 trained epochs, so it could not be plotted against the lists above.
epochs = list(range(1, len(val_acc3_list) + 1))

In [90]:
## Model 3 predictions on the PCA test set:
## softmax outputs -> class id with the largest output per row -> one-column DataFrame
pred_labels_3 = model3.predict(x=pca_test_data_df)
pred_labels_list_3 = [np.argmax(class_probs) for class_probs in pred_labels_3]
pred3_labels_df = pd.DataFrame({'pred_label_3': pred_labels_list_3})

In [91]:
## Confusion matrix for Model 3 (rows: true class ids, columns: predicted class ids).
## labels=0..9 forces a 10x10 matrix even if some class is missing from this test split,
## so it always lines up with the ten class names used as heatmap tick labels.
model_3_cm = confusion_matrix(test_labels, pred3_labels_df, labels=list(range(10)))
model_3_cm

In [92]:
## Heatmap of the Model 3 confusion matrix with device category names on both axes.
## class_names is ordered by the integer class id (0-9) assigned to each category.
class_names = [
    'TV', 'baby_monitor', 'smoke_detector', 'socket', 'watch',
    'water_sensor', 'lights', 'thermostat', 'motion_sensor', 'security_camera',
]

plt.figure(figsize=(10, 5))
s3 = sns.heatmap(
    model_3_cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names
).set_title("Confusion Matrix (NN Model with SGD Optimizer)")

In [93]:
## Classification report (precision / recall / f1 per category) and training time for Model 3.
## labels is passed explicitly so target_names always matches the ten class ids,
## even if a category is absent from both the test labels and the predictions.
print(classification_report(test_labels, pred3_labels_df,
                            labels=list(range(len(class_names))),
                            target_names=class_names))
print(f"Computational Time by Model 3: {computational_time_3} seconds")

In [94]:
## Score Model 3 on the PCA test set; scores3[0] is reported as the loss and scores3[1] as the accuracy.
scores3 = model3.evaluate(x=pca_test_data_df, y=binarized_test_labels)
print(f'Test Accuracy: {scores3[1]*100} % && Test Loss: {scores3[0]}')

## Model 4: NN Model with ADAM Optimizer and PCA extracted features - Our Framework Model

In [95]:
## Build Model 4: fully connected classifier on the PCA-extracted features.
## NOTE(review): the layer stack is identical to Model 2's (same widths, same 0.25 dropout) - confirm this is intended.
## The input width is read from the PCA training frame instead of hard-coding 100,
## so it follows the number of components chosen for PCA.
model4 = keras.Sequential()
model4.add(Dense(512, activation='relu', input_shape=(pca_train_data_df.shape[1],))) ## input shape is same as PCA extracted features
model4.add(Dropout(rate = 0.25))
model4.add(Dense(256, activation='relu'))
model4.add(Dense(128, activation='relu'))
model4.add(Dropout(rate = 0.25))
model4.add(Dense(64, activation='relu'))
model4.add(Dense(10, activation='softmax')) ## Number of Neurons in Last layer == Number of Categories present in Dataset
model4.summary()

In [96]:
## Compile Model 4: Adam optimizer, categorical cross-entropy loss, accuracy metric
model4.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [97]:
## Train Model 4 on the PCA features and record the wall-clock training time.
## validation_split=0.2 holds out 20% of the training data for validation.
start_time_4 = time.time()
model4_hist = model4.fit(
    x=pca_train_data_df,
    y=binarized_train_labels,
    batch_size=8,
    epochs=50,
    validation_split=0.2,
    verbose=2,
)
computational_time_4 = time.time() - start_time_4

In [98]:
## Validation accuracy and loss per epoch for Model 4
val_acc4_list = model4_hist.history['val_accuracy']
val_loss4_list = model4_hist.history['val_loss']
## One epoch number per recorded value (1..N). The former range(0, 51) produced 51
## entries for 50 trained epochs, so it could not be plotted against the lists above.
epochs = list(range(1, len(val_acc4_list) + 1))

In [99]:
## Model 4 predictions on the PCA test set:
## softmax outputs -> class id with the largest output per row -> one-column DataFrame
pred_labels_4 = model4.predict(x=pca_test_data_df)
pred_labels_list_4 = [np.argmax(class_probs) for class_probs in pred_labels_4]
pred4_labels_df = pd.DataFrame({'pred_label_4': pred_labels_list_4})

In [100]:
## Confusion matrix for Model 4 (rows: true class ids, columns: predicted class ids).
## labels=0..9 forces a 10x10 matrix even if some class is missing from this test split,
## so it always lines up with the ten class names used as heatmap tick labels.
model_4_cm = confusion_matrix(test_labels, pred4_labels_df, labels=list(range(10)))
model_4_cm

In [101]:
## Heatmap of the Model 4 confusion matrix with device category names on both axes.
## class_names is ordered by the integer class id (0-9) assigned to each category.
class_names = [
    'TV', 'baby_monitor', 'smoke_detector', 'socket', 'watch',
    'water_sensor', 'lights', 'thermostat', 'motion_sensor', 'security_camera',
]

plt.figure(figsize=(10, 5))
s4 = sns.heatmap(
    model_4_cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names
).set_title("Confusion Matrix (NN Model with ADAM Optimizer)")

In [102]:
## Classification report (precision / recall / f1 per category) and training time for Model 4.
## labels is passed explicitly so target_names always matches the ten class ids,
## even if a category is absent from both the test labels and the predictions.
print(classification_report(test_labels, pred4_labels_df,
                            labels=list(range(len(class_names))),
                            target_names=class_names))
print(f"Computational Time by Model 4: {computational_time_4} seconds")

In [103]:
## Score Model 4 on the PCA test set; scores4[0] is reported as the loss and scores4[1] as the accuracy.
scores4 = model4.evaluate(x=pca_test_data_df, y=binarized_test_labels)
print(f'Test Accuracy: {scores4[1]*100} % && Test Loss: {scores4[0]}')