#### **Install these in command prompt before execution**

In [None]:
!pip install pandas
!pip install numpy 
!pip install scikit-learn
!pip install matplotlib
!pip install seaborn 
!pip install missingno

#### **Import and combine data**

In [1]:
import pandas as pd

# List of file paths for the 7 parts
file_paths = [f'dataset/uwb_dataset_part{i}.csv' for i in range(1, 8)]

# Load each part into a list of DataFrames
data_parts = [pd.read_csv(file_path, header=0) for file_path in file_paths]

In [2]:
# Combine all parts into a single DataFrame
combined_data = pd.concat(data_parts, ignore_index=True)

# Display the combined dataset
print(combined_data.head())
print(f"Combined dataset shape: {combined_data.shape}")

   NLOS  RANGE  FP_IDX  FP_AMP1  FP_AMP2  FP_AMP3  STDEV_NOISE  CIR_PWR  \
0   0.0   3.90   745.0  18712.0  10250.0  11576.0         64.0  11855.0   
1   0.0   0.66   749.0  11239.0   6313.0   4712.0         64.0  18968.0   
2   1.0   7.86   746.0   4355.0   5240.0   3478.0         60.0  14699.0   
3   1.0   3.48   750.0   8502.0   8416.0   5890.0         76.0   8748.0   
4   0.0   1.19   746.0  17845.0  18095.0  12058.0         68.0  11380.0   

   MAX_NOISE  RXPACC  ...  CIR1006  CIR1007  CIR1008  CIR1009  CIR1010  \
0      967.0   611.0  ...    279.0    458.0    183.0    158.0    198.0   
1     1133.0   447.0  ...    144.0    334.0    290.0    228.0    187.0   
2      894.0   723.0  ...     32.0    373.0    224.0    174.0    124.0   
3     1127.0  1024.0  ...    252.0    173.0    198.0    160.0    434.0   
4     1744.0   276.0  ...    154.0    209.0    242.0    296.0     87.0   

   CIR1011  CIR1012  CIR1013  CIR1014  CIR1015  
0     87.0    296.0    505.0    307.0      0.0  
1    2

In [3]:
# Check the number of rows in each part
for i, part in enumerate(data_parts, start=1):
    print(f"Part {i} shape: {part.shape}")

# Check the combined dataset
print(f"Combined dataset shape: {combined_data.shape}")
print(combined_data.head())

Part 1 shape: (6000, 1031)
Part 2 shape: (6000, 1031)
Part 3 shape: (6000, 1031)
Part 4 shape: (6000, 1031)
Part 5 shape: (6000, 1031)
Part 6 shape: (6000, 1031)
Part 7 shape: (6000, 1031)
Combined dataset shape: (42000, 1031)
   NLOS  RANGE  FP_IDX  FP_AMP1  FP_AMP2  FP_AMP3  STDEV_NOISE  CIR_PWR  \
0   0.0   3.90   745.0  18712.0  10250.0  11576.0         64.0  11855.0   
1   0.0   0.66   749.0  11239.0   6313.0   4712.0         64.0  18968.0   
2   1.0   7.86   746.0   4355.0   5240.0   3478.0         60.0  14699.0   
3   1.0   3.48   750.0   8502.0   8416.0   5890.0         76.0   8748.0   
4   0.0   1.19   746.0  17845.0  18095.0  12058.0         68.0  11380.0   

   MAX_NOISE  RXPACC  ...  CIR1006  CIR1007  CIR1008  CIR1009  CIR1010  \
0      967.0   611.0  ...    279.0    458.0    183.0    158.0    198.0   
1     1133.0   447.0  ...    144.0    334.0    290.0    228.0    187.0   
2      894.0   723.0  ...     32.0    373.0    224.0    174.0    124.0   
3     1127.0  1024.0  ... 

#### **Data Cleaning**

In [4]:
# Find columns with missing values
missing_values = combined_data.isnull().sum()
missing_cols = missing_values[missing_values > 0]
print("Columns with missing values:")
if not missing_cols.empty:
    print(missing_cols)
else:
    print("No missing values found.")

Columns with missing values:
No missing values found.


In [5]:
# duplicate rows count
duplicate_rows = combined_data.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

# print duplicate rows if they exist
if duplicate_rows > 0:
    print("Sample duplicate rows:")
    print(combined_data[combined_data.duplicated()].head())

Number of duplicate rows: 0


In [6]:
# find columns with only 1 unique value
unique_counts = combined_data.nunique()
single_value_cols = unique_counts[unique_counts == 1]
print("Columns with only one unique value:")
if not single_value_cols.empty:
    print(single_value_cols)
else:
    print("No columns with a single unique value.")

Columns with only one unique value:
CH         1
BITRATE    1
PRFR       1
dtype: int64


In [7]:
# drop columns with only one unique value
unique_counts = combined_data.nunique()
cols_to_drop = unique_counts[unique_counts == 1].index
combined_data = combined_data.drop(columns=cols_to_drop)

- CH: only has **ONE** value which is '2'
- BITRATE: only has **ONE** value which is '110'
- PRFR: only has **ONE** value which is '64'

In [8]:
# check current combined dataset
print(f"Current Combined dataset shape: {combined_data.shape}")
print(combined_data.head())

Current Combined dataset shape: (42000, 1028)
   NLOS  RANGE  FP_IDX  FP_AMP1  FP_AMP2  FP_AMP3  STDEV_NOISE  CIR_PWR  \
0   0.0   3.90   745.0  18712.0  10250.0  11576.0         64.0  11855.0   
1   0.0   0.66   749.0  11239.0   6313.0   4712.0         64.0  18968.0   
2   1.0   7.86   746.0   4355.0   5240.0   3478.0         60.0  14699.0   
3   1.0   3.48   750.0   8502.0   8416.0   5890.0         76.0   8748.0   
4   0.0   1.19   746.0  17845.0  18095.0  12058.0         68.0  11380.0   

   MAX_NOISE  RXPACC  ...  CIR1006  CIR1007  CIR1008  CIR1009  CIR1010  \
0      967.0   611.0  ...    279.0    458.0    183.0    158.0    198.0   
1     1133.0   447.0  ...    144.0    334.0    290.0    228.0    187.0   
2      894.0   723.0  ...     32.0    373.0    224.0    174.0    124.0   
3     1127.0  1024.0  ...    252.0    173.0    198.0    160.0    434.0   
4     1744.0   276.0  ...    154.0    209.0    242.0    296.0     87.0   

   CIR1011  CIR1012  CIR1013  CIR1014  CIR1015  
0     87.

In [9]:
# Identify non-numeric columns
non_numeric_cols = combined_data.select_dtypes(exclude=['number']).columns

# Print non-numeric columns
print("Non-numeric columns:")
print(non_numeric_cols.tolist())

Non-numeric columns:
[]


In [10]:
# Define special characters
special_chars = r"[!@#$%^&*()_\-?/<>]"

# Find columns with special characters
cols_with_special_chars = [col for col in combined_data.columns if combined_data[col].astype(str).str.contains(special_chars, regex=True).any()]

# Print columns with special characters
print("Columns containing special characters in values:")
print(cols_with_special_chars)

Columns containing special characters in values:
[]


#### **PCA for feature reduction(lien's):**

In [11]:
# generate a copy for PCA 
pca_data = combined_data.copy()

# Step 1: Compute the mean (𝛍) for each column and center the data
mean_vals = pca_data.mean(axis=0)   # Compute mean for each feature
centered_data = pca_data - mean_vals    # Center the data by subtracting the mean
print(centered_data.head())

   NLOS     RANGE    FP_IDX       FP_AMP1      FP_AMP2      FP_AMP3  \
0  -0.5  0.068481 -0.654167  10584.478095 -1175.259524  1837.893952   
1  -0.5 -3.171519  3.345833   3111.478095 -5112.259524 -5026.106048   
2   0.5  4.028481  0.345833  -3772.521905 -6185.259524 -6260.106048   
3   0.5 -0.351519  4.345833    374.478095 -3009.259524 -3848.106048   
4  -0.5 -2.641519  0.345833   9717.478095  6669.740476  2319.893952   

   STDEV_NOISE      CIR_PWR   MAX_NOISE    RXPACC  ...     CIR1006  \
0    -8.284571  2065.309786 -349.096524   -5.2725  ...   21.544643   
1    -8.284571  9178.309786 -183.096524 -169.2725  ... -113.455357   
2   -12.284571  4909.309786 -422.096524  106.7275  ... -225.455357   
3     3.715429 -1041.690214 -189.096524  407.7275  ...   -5.455357   
4    -4.284571  1590.309786  427.903476 -340.2725  ... -103.455357   

      CIR1007    CIR1008    CIR1009     CIR1010     CIR1011    CIR1012  \
0  190.769167 -66.123929 -67.917143  -41.445476 -153.034286  41.611905   
1   

In [12]:
import numpy as np

# Step 2: Calculate the covariance matrix
cov_matrix = np.cov(centered_data.T)  # Covariance of features

In [13]:
# Step 3: Find eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

In [14]:
# Step 4: Sort eigenvectors based on eigenvalues (descending order)
sorted_indices = np.argsort(eigenvalues)[::-1]  # Get indices of sorted eigenvalues
eigenvalues = eigenvalues[sorted_indices]  # Sort eigenvalues
eigenvectors = eigenvectors[:, sorted_indices]  # Sort eigenvectors accordingly

In [15]:
# Step 5: Select the top k principal components for dimensionality reduction
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)  # Compute variance ratio
cumulative_variance = np.cumsum(explained_variance_ratio)  # Compute cumulative variance
k = np.argmax(cumulative_variance >= 0.95) + 1  # Select k PCs that retain 95% variance
top_k_eigenvectors = eigenvectors[:, :k]  # Extract top k eigenvectors

In [16]:
# Step 6: Project the data onto the selected principal components
pca_data = np.dot(centered_data, top_k_eigenvectors)  # New dataset with reduced dimensions

In [19]:
# Assign feature names to the principal components
pca_feature_names = [combined_data.columns[np.argmax(abs(top_k_eigenvectors[:, i]))] for i in range(k)]

In [20]:
# Convert the transformed data into a DataFrame with feature-based names
pca_df = pd.DataFrame(pca_data, columns=pca_feature_names)

In [21]:
# Display results
print(f"Original Data Shape: {pca_data.shape}")
print(f"Reduced Data Shape: {pca_df.shape}")
print(f"Explained Variance by {k} PCs: {cumulative_variance[k-1]:.4f}")

Original Data Shape: (42000, 37)
Reduced Data Shape: (42000, 37)
Explained Variance by 37 PCs: 0.9507


In [22]:
print(pca_df.head())

        CIR_PWR       FP_AMP2        CIR746       CIR749       CIR753  \
0   3386.432268 -14010.810445   6696.255997 -2994.301662  5013.270382   
1 -25211.032846  15485.323901   7584.883629  7528.293646  9002.227168   
2 -11518.997733  15024.755373  12090.141478  -620.407129  4393.560546   
3  -6011.290735  14757.546733     65.982138  5330.611182  2077.111037   
4   3016.688967 -19785.758466  -3611.328990 -6632.572121  2564.297356   

        CIR751       CIR752       CIR751      FP_AMP1       CIR753  ...  \
0 -4346.186792  1063.425598   -24.123417 -8588.983555  3716.470863  ...   
1 -2270.167968  2291.911975  -148.339989 -3963.637766  8548.169521  ...   
2 -1786.441455 -1354.733395  1358.080263 -1101.967653 -4335.936147  ...   
3 -4919.782495   890.070749  1982.166910 -3083.771847  2289.133716  ...   
4 -2358.173939   428.188749  4589.031078 -5464.550770 -1328.727111  ...   

        CIR765       CIR743       CIR770       CIR769       CIR770  \
0  -386.815902 -2450.568632  -513.582577

In [23]:
for col in pca_df.columns:
    print(col)

CIR_PWR
FP_AMP2
CIR746
CIR749
CIR753
CIR751
CIR752
CIR751
FP_AMP1
CIR753
CIR754
CIR755
CIR756
CIR757
CIR756
CIR757
CIR759
CIR760
CIR761
CIR748
CIR761
CIR762
CIR762
CIR763
FP_AMP3
CIR764
CIR765
CIR765
CIR743
CIR770
CIR769
CIR770
CIR771
CIR_PWR
CIR772
CIR776
CIR775


#### **PCA for feature reduction(jf's):**

In [None]:
from pandas.api.types import is_numeric_dtype

for col in combined_data.columns:
    if is_numeric_dtype(combined_data[col]):
        print('%s:' % (col))
        print('\t Mean = %.2f' % combined_data[col].mean())
        print('\t Standard deviation = %.2f' % combined_data[col].std())
        print('\t Minimum = %.2f' % combined_data[col].min())
        print('\t Maximum = %.2f' % combined_data[col].max())    

In [None]:
print(combined_data.head())

In [None]:
print('Covariance:')
combined_data.cov(numeric_only=True)

In [None]:
print('Correlation:') 
combined_data.corr(numeric_only=True) #range -1 to +1

In [None]:
combined_data.describe()

In [None]:
import matplotlib.pyplot as plt

# Count the occurrences of NLOS and LOS
nlos_count = combined_data['NLOS'].value_counts()[1]
los_count = combined_data['NLOS'].value_counts()[0]

# Print the counts
print(f'NLOS count: {nlos_count}')
print(f'LOS count: {los_count}')

# Plot the distribution of the target variable (NLOS)
plt.figure(figsize=(6, 4))
combined_data['NLOS'].value_counts().plot(kind='bar', color=['blue', 'orange'])
plt.title('Distribution of NLOS (0 = LOS, 1 = NLOS)')
plt.xlabel('NLOS')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()


In [None]:
# Define all numerical features
numerical_features = [
    'RANGE', 'FP_IDX', 'FP_AMP1', 'FP_AMP2', 'FP_AMP3', 
    'STDEV_NOISE', 'CIR_PWR', 'MAX_NOISE', 'RXPACC', 
    'CH', 'FRAME_LEN', 'PREAM_LEN', 'BITRATE', 'PRFR'
]

# Set up the plot
plt.figure(figsize=(18, 14))
for i, feature in enumerate(numerical_features, 1):
    plt.subplot(4, 4, i)
    plt.hist(combined_data[feature], bins=30, color='blue', alpha=0.7)
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')

# Adjust layout for readability
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns

# Compute the correlation matrix
corr_matrix = combined_data[numerical_features].corr()

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of Numerical Features')
plt.show()

In [None]:
# Plot boxplots for numerical features
# Set up the plot
plt.figure(figsize=(18, 14))
for i, feature in enumerate(numerical_features, 1):
    plt.subplot(4, 4, i)
    sns.boxplot(y=combined_data[feature], color='orange')
    plt.title(f'Boxplot of {feature}')

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# Plot CIR values for a few samples
plt.figure(figsize=(12, 6))
for i in range(5):  # Plot first 5 samples
    plt.plot(combined_data.loc[i, 'CIR1':'CIR1015'], label=f'Sample {i+1}')
plt.title('CIR Values for First 5 Samples')
plt.xlabel('CIR Index')
plt.ylabel('CIR Value')
plt.legend()
plt.show()

In [None]:
# Check for missing values in each column
missing_data = combined_data.isnull().sum()

# Display columns with missing values
if missing_data.sum() > 0:
    print("Missing Data:")
    print(missing_data[missing_data > 0])
else:
    print("No missing data.")

In [None]:
# Check for duplicate rows
duplicate_rows = combined_data.duplicated()

# Display the number of duplicate rows
print(f"Number of duplicate rows: {duplicate_rows.sum()}")

# Display duplicate rows
print("Duplicate Data:")
print(combined_data[duplicate_rows])

In [None]:
# Calculate the variance of each column
variances = combined_data.var()

# Identify columns with zero or very low variance
low_variance_columns = variances[variances < 1e-10].index.tolist()

print("Columns with low variance:")
print(low_variance_columns)

In [None]:
# Drop columns with low variance
data_cleaned = combined_data.drop(columns=low_variance_columns)

print(f"Shape after dropping low-variance columns: {data_cleaned.shape}")

In [None]:
from scipy.stats import zscore
numerical_features = [
    'RANGE', 'FP_IDX', 'FP_AMP1', 'FP_AMP2', 'FP_AMP3', 
    'STDEV_NOISE', 'CIR_PWR', 'MAX_NOISE', 'RXPACC', 
    'FRAME_LEN', 'PREAM_LEN'
]

z_scores = data_cleaned[numerical_features].apply(zscore)

# Identify outliers (absolute Z-score > 3)
outliers = (abs(z_scores) > 3).any(axis=1)

# Display rows with outliers
print(data_cleaned[outliers])

In [None]:
# Remove rows with outliers
data_cleaned = data_cleaned[~outliers]
print(data_cleaned.shape)

In [None]:
# Drop the class attribute (NLOS)
data_without_class = combined_data.drop(columns=['NLOS'])

# Display the dataset without the class attribute
print(data_without_class.head())

In [None]:
import matplotlib.pyplot as plt

# Select non-CIR numerical columns for visualization
non_cir_columns = ['RANGE', 'FP_AMP1', 'FP_AMP2', 'FP_AMP3', 'STDEV_NOISE', 'CIR_PWR', 'MAX_NOISE', 'RXPACC', 'CH', 'FRAME_LEN', 'PREAM_LEN', 'BITRATE', 'PRFR']

# Plot boxplots for non-CIR numerical columns
plt.figure(figsize=(14, 8))
data_without_class[non_cir_columns].boxplot()
plt.title('Boxplot of Non-CIR Numerical Attributes')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Calculate the variance of each column
variances = data_without_class.var()

# Identify columns with zero or very low variance
low_variance_columns = variances[variances < 1e-10].index.tolist()

print("Columns with low variance:")
print(low_variance_columns)

In [None]:
# Drop columns with low variance
data_without_class = data_without_class.drop(columns=low_variance_columns)

print(f"Shape after dropping low-variance columns: {data_without_class.shape}")

In [None]:
# 1. Convert data to numeric
data_numeric = data_without_class.select_dtypes(include=['number'])

# 2. Calculate Z-scores
Z = (data_numeric - data_numeric.mean()) / data_numeric.std()

In [None]:
# 3. Identify and remove outliers
print('Number of rows before discarding outliers = %d' % (Z.shape[0]))

# Remove rows with any Z-score outside the range [-3, 3]
Z2 = Z.loc[((Z > -3).sum(axis=1) == Z.shape[1]) & ((Z <= 3).sum(axis=1) == Z.shape[1]), :]

print('Number of rows after discarding outliers = %d' % (Z2.shape[0]))

In [None]:
data_cleaned = data_without_class.loc[Z2.index]
# Display the cleaned dataset
print("Cleaned dataset:")
print(data_cleaned)

In [None]:
import pandas as pd
# Aggregate CIR columns (CIR1 to CIR1015)
# Ensure we are working with a copy of the DataFrame
data_cleaned = data_cleaned.copy()

# 1. Aggregation
# Aggregate CIR columns (CIR1 to CIR1015)
cir_columns = [f'CIR{i}' for i in range(1, 1016)]
data_cleaned.loc[:, 'CIR_MEAN'] = data_cleaned[cir_columns].mean(axis=1)
data_cleaned.loc[:, 'CIR_STD'] = data_cleaned[cir_columns].std(axis=1)
data_cleaned.loc[:, 'CIR_MAX'] = data_cleaned[cir_columns].max(axis=1)
data_cleaned.loc[:, 'CIR_MIN'] = data_cleaned[cir_columns].min(axis=1)

# Drop the original CIR columns (optional)
#data_cleaned = data_cleaned.drop(columns=cir_columns)



In [None]:
# 2. Sampling
# Randomly sample 10% of the dataset
sampled_data = data_cleaned.sample(frac=0.1, random_state=42)
print(f"Shape of sampled dataset: {sampled_data.shape}")

In [None]:
import pandas as pd
# 3. Discretization
# Discretize the RANGE feature into 5 bins
data_cleaned.loc[:, 'RANGE_BIN'] = pd.cut(data_cleaned['RANGE'], bins=5, labels=["Very Low", "Low", "Medium", "High", "Very High"])

# Discretize the CIR_PWR feature into quartiles
data_cleaned.loc[:, 'CIR_PWR_BIN'] = pd.qcut(data_cleaned['CIR_PWR'], q=4, labels=["Q1", "Q2", "Q3", "Q4"])

# Display the final dataset
print(data_cleaned.head())

In [None]:
data_cleaned.head()

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Extract CIR columns (CIR1 to CIR1015)
cir_columns = [f'CIR{i}' for i in range(1, 1016)]
cir_data = data[cir_columns]

# Apply PCA
pca = PCA()
pca.fit(cir_data)

# Plot explained variance ratio
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_.cumsum(), marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by PCA Components')
plt.grid(True)
plt.show()

# Choose the number of components that explain >95% of the variance
n_components = len(pca.explained_variance_ratio_[pca.explained_variance_ratio_.cumsum() <= 0.95])
print(f"Number of components to retain: {n_components}")

# Apply PCA with selected number of components
pca = PCA(n_components=n_components)
cir_reduced = pca.fit_transform(cir_data)

# Add reduced CIR features back to the dataset
for i in range(n_components):
    data[f'CIR_PC{i+1}'] = cir_reduced[:, i]

# Drop original CIR columns (optional, to save memory)
data.drop(columns=cir_columns, inplace=True)