In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import math, os
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
# from tensorflow.keras.models import Sequential
# from keras.layers.convolutional import Conv1D
# from keras.layers.convolutional import MaxPooling1D
# from keras.utils import to_categorical
# from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, Conv2D, MaxPooling1D,MaxPooling2D, LSTM


In [2]:
def dataframe_of_CSI(directory):
    # Create empty DataFrames for walk, up, and jog
    df_walk = pd.DataFrame()
    df_up = pd.DataFrame()
    df_jog = pd.DataFrame()

    # Loop through each file in the directory
    for file in os.listdir(directory):
        # Check if the file is a CSV file and contains "walk", "up", or "jog" in the name
        if file.endswith(".csv") and ("walk" in file or "up" in file or "jog" in file):
            # Read the CSV file and extract the CSI_DATA column
            file_path = os.path.join(directory, file)
            df = pd.read_csv(file_path)
            csi_rows_raw = []

            ## Filtering can be done using
            df = df[(df["bandwidth"]==0)]# & (df["secondary_channel"]==1)]

            ## Ignore first few and last few seconds data
            for one_row in df['CSI_DATA'].iloc[40:-40]:
                one_row = one_row.strip("[]")
                csi_row_raw = [int(x) for x in one_row.split(" ") if x != '']
                csi_rows_raw.append(csi_row_raw)
        
            # Convert the list of lists to a DataFrame and append it to the appropriate DataFrame based on the file name
            csi_df = pd.DataFrame(csi_rows_raw)

            # Check which dataframe we are working on and concat the data
            if "walk" in file:
                df_walk = pd.concat([df_walk, csi_df], axis=0)
            elif "up" in file:
                df_up = pd.concat([df_up, csi_df], axis=0)
            else:
                df_jog = pd.concat([df_jog, csi_df], axis=0)
    return df_walk, df_up, df_jog

In [3]:
# Define the directory path where the CSV files are located
directory = "C:\\Users\\Dell\\Documents\\Wifi-Sensing-HAR\\data\\our_data"

walk_df, up_df, jog_df = dataframe_of_CSI(directory)

In [4]:
print("The number of entries found is: ")
print(len(walk_df),len(up_df),len(jog_df))

The number of entries found is: 
29093 3387 63015


In [5]:
## Extract Amplitude and Phase from the dataframe
def convert_csi_to_amplitude_phase(df):
    total_amplitudes = []
    total_phases = []

    for i, value in enumerate(df.values):
        imaginary = []
        real = []
        amplitudes = [] 
        phases = []

        csi_one_row_lst = value.tolist()

         # Create list of imaginary and real numbers from CSI
        [imaginary.append(csi_one_row_lst[item]) if item%2==0 else real.append(csi_one_row_lst[item]) for item in range(len(csi_one_row_lst))]

        # Transform imaginary and real into amplitude and phase
        val = int(len(csi_one_row_lst)//2)
        for k in range(val):
            amplitudes.append(round(math.sqrt(float(imaginary[k])** 2 + float(real[k])** 2),4))
            phases.append(round(math.atan2(float(imaginary[k]), float(real[k])),4))
        total_amplitudes.append(np.array(amplitudes))
        total_phases.append(np.array(phases))
    
    total_amplitudes_df = pd.DataFrame(total_amplitudes)
    total_phases_df = pd.DataFrame(total_phases)

        
    return total_amplitudes_df, total_phases_df

#### Amplitude and Phase of Walking

In [6]:
## Extract walk amplitude and phase
walk_amplitudes_df, walk_phases_df = convert_csi_to_amplitude_phase(walk_df)

## Here, based on sig_mode, 802.11a/g/n received. Here we receive both 802.11a/g and 802.11n
## So, either 52 or 56 total sub-carrier would be useful. The first 4 and the last 4 are rejected as null guard.


## Amplitude
walk_df1_amps = walk_amplitudes_df.iloc[:,5:32]  # 6:32 for 802.11ag 4:32 for 802.11n
walk_df2_amps = walk_amplitudes_df.iloc[:,33:60] # 33:59 for 802.11ag 33:61 for 802.11n

walk_df_amps_final = pd.concat([walk_df1_amps, walk_df2_amps],axis=1)


## Phase
walk_df1_phase = walk_amplitudes_df.iloc[:,5:32]  # 6:32 for 802.11ag 4:32 for 802.11n
walk_df2_phase = walk_amplitudes_df.iloc[:,33:60] # 33:59 for 802.11ag 33:61 for 802.11n

walk_df_phase_final = pd.concat([walk_df1_phase, walk_df2_phase],axis=1)

### Amplitude and Phase of Jogging

In [7]:
## Extract jog amplitude and phase
jog_amplitudes_df, jog_phases_df = convert_csi_to_amplitude_phase(jog_df)

## Amplitude
jog_df1_amps = jog_amplitudes_df.iloc[:,5:32]  # 6:32 for 802.11ag 4:32 for 802.11n
jog_df2_amps = jog_amplitudes_df.iloc[:,33:60] # 33:59 for 802.11ag 33:61 for 802.11n

jog_df_amps_final = pd.concat([jog_df1_amps, jog_df2_amps],axis=1)


## Phase
jog_df1_phase = jog_amplitudes_df.iloc[:,5:32]  # 6:32 for 802.11ag 4:32 for 802.11n
jog_df2_phase = jog_amplitudes_df.iloc[:,33:60] # 33:59 for 802.11ag 33:61 for 802.11n

jog_df_phase_final = pd.concat([jog_df1_phase, jog_df2_phase],axis=1)

### Amplitude and Phase of Stand Up Down

In [8]:
## Extract up amplitude and phase
up_amplitudes_df, up_phases_df = convert_csi_to_amplitude_phase(up_df)

## Amplitude
up_df1_amps = up_amplitudes_df.iloc[:,5:32]  # 6:32 for 802.11ag 4:32 for 802.11n
up_df2_amps = up_amplitudes_df.iloc[:,33:60] # 33:59 for 802.11ag 33:61 for 802.11n

up_df_amps_final = pd.concat([up_df1_amps, up_df2_amps],axis=1)


## Phase
up_df1_phase = up_amplitudes_df.iloc[:,5:32]  # 6:32 for 802.11ag 4:32 for 802.11n
up_df2_phase = up_amplitudes_df.iloc[:,33:60] # 33:59 for 802.11ag 33:61 for 802.11n

up_df_phase_final = pd.concat([up_df1_phase, up_df2_phase],axis=1)

#### Moving Average of the data

In [12]:
# Moving average of the data
def moving_average(df, window_size):
    """"
    Compute the moving average with a window of size specified
    """

    rolling_mean = df.rolling(window=window_size).mean()
    downsampled = rolling_mean.iloc[window_size::window_size, :]
    return downsampled


## Set moving average window of desired size 
window_size = 1
mov_avg_walk_amps_df = moving_average(walk_df_amps_final,window_size)
mov_avg_walk_phase_df = moving_average(walk_df_phase_final,window_size)
mov_avg_jog_amps_df = moving_average(jog_df_amps_final,window_size)
mov_avg_jog_phase_df = moving_average(jog_df_phase_final,window_size)
mov_avg_up_amps_df = moving_average(up_df_amps_final,window_size)
mov_avg_up_phase_df = moving_average(up_df_phase_final,window_size)

### Select n samples of data for input to the system as a flattened matrix

In [10]:
def select_data_portion(dataFrm,select_size):
    selected_df_list = []
    for item in range(0,len(dataFrm)-select_size, select_size):
        selected_df = dataFrm.iloc[item:item+select_size].to_numpy().flatten()
        selected_df_list.append(selected_df)
    selected_df = pd.DataFrame(selected_df_list)
    return selected_df

In [22]:
X_walk = select_data_portion(mov_avg_walk_amps_df, 50)
X_jog = select_data_portion(mov_avg_jog_amps_df, 50)
X_up = select_data_portion(mov_avg_up_amps_df, 50)
# X_walk = select_data_portion(walk_df_amps_final, 50)
# X_jog = select_data_portion(jog_df_amps_final, 50)
# X_up = select_data_portion(up_df_amps_final, 50)

X_training = pd.concat([X_walk,X_jog,X_up],axis=0,ignore_index=True)
# X_training

NameError: name 'mov_avg_walk_amps_df' is not defined

In [None]:
y_walk = np.zeros(len(X_walk))
y_jog = np.ones(len(X_jog))
y_up = np.ones(len(X_up))+1

y_training = np.concatenate([y_walk, y_jog, y_up],axis=0)
y_training

array([0., 0., 0., ..., 2., 2., 2.])

### Scaling of the dataframe

In [None]:
from sklearn.preprocessing import StandardScaler

def perform_scaling(df):
    scaler = StandardScaler()
    scaler = scaler.fit(df)
    scaled_data = scaler.transform(df)
    return scaled_data

## PCA for Visualization

Here we have 52 to 54 usable columns. Not all columns/subcarriers are useful. So, we need to select only the useful ones. This can be done by PCA.

### PCA with arbitary n_components or variance

In [None]:
def perform_pca(X, n_components):
    """
    Perform PCA on the data.
    """
    pca = PCA(n_components=n_components)
    pca.fit(X)
    new_sample = pca.transform(X)
    return pca, new_sample

In [None]:
scaled_X = perform_scaling(X_training)
pca_obj, pca_X = perform_pca(scaled_X, 0.95)

In [None]:
np.array(pca_X).shape

(1908, 1033)

In [None]:
print(f'Total number of components used after PCA : {pca_obj.n_components_}')

Total number of components used after PCA : 1033


------

### Data Splitting for Model

In [None]:
def train_test_split_data(X, y, test_size=0.2, random_state=42):
    """
    Split data into training and testing sets.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify = y, random_state=random_state)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(pca_X, y_training)

### Model Creation

In [None]:
def train_svm(X_train, y_train):
    """
    Train SVM model using the training data.
    """
    # svm = SVC(C=10, gamma=0.001)
    svm = SVC()
    svm.fit(X_train, y_train)
    return svm


def train_knn(X_train, y_train):
    """
    Train KNN model using the training data.
    """
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    return knn

def train_model(model_type, X_train, y_train):
    """
    Train model of given type using the training data.
    """
    if model_type == 'svm':
        model = train_svm(X_train, y_train)
    elif model_type == 'knn':
        model = train_knn(X_train, y_train)
    # elif model_type == 'cnn':
    #     model = train_cnn(X_train, y_train)
    # elif model_type == 'lstm':
    #     model = train_lstm(X_train, y_train)
    else:
        raise ValueError('Invalid model type.')
    return model


def test_model(model, X_test):
    """
    Evaluate the trained model on the testing data.
    """ 
    label = model.predict(X_test)
    return label

def evaluate_model(model, X_test, y_test):
    """
    Evaluate the trained model on the testing data.
    """ 
    score = model.score(X_test, y_test)
    return score

In [23]:
svm = train_model('svm', X_train, y_train)
y_pred = test_model(svm, X_test)
print(evaluate_model(svm, X_test, y_test))
confusion_matrix(y_test, y_pred)

0.9014675052410901


array([[119,  23,   0],
       [  8, 308,   0],
       [  7,   9,   3]], dtype=int64)

In [24]:
svm = train_model('knn', X_train, y_train)
y_pred = test_model(svm, X_test)
print(evaluate_model(svm, X_test, y_test))
confusion_matrix(y_test, y_pred)

0.7987421383647799


array([[ 64,  84,   5],
       [  1, 311,   0],
       [  2,   4,   6]], dtype=int64)

In [25]:
from sklearn.model_selection import GridSearchCV
  
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(X_train, y_train)

# print best parameter after tuning
print(grid.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)


grid_predictions = grid.predict(X_test)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.662 total time=   3.3s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.664 total time=   2.7s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.664 total time=   2.6s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.661 total time=   2.6s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.661 total time=   2.3s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.662 total time=   2.3s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.664 total time=   2.2s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.664 total time=   2.2s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.661 total time=   2.3s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.661 total time=   2.2s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.662 total time=   2.2s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

NameError: name 'classification_report' is not defined

In [26]:
from sklearn.metrics import classification_report


# print classification report
print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

         0.0       0.87      0.88      0.87       153
         1.0       0.94      0.96      0.95       312
         2.0       1.00      0.17      0.29        12

    accuracy                           0.92       477
   macro avg       0.94      0.67      0.70       477
weighted avg       0.92      0.92      0.91       477



In [27]:
print(grid.best_params_)


{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}


In [28]:
print(grid.best_estimator_)


SVC(C=10, gamma=0.001)
