# Case Study 1 (Working file)

## Imports

In [143]:
# imports
import pandas as pd
import matplotlib.pyplot as plt

## Create DataFrame

In [144]:
column_names = ['time', 'scanMac', 'posX', 'posY', 'posZ', 'orientation', 'mac', 'signal', 'channel', 'type']

lines = None
with open('../Data/offline.final.trace.txt', 'r') as file:
    lines = file.readlines()
    file.close()

instances = [] # will hold final data for dataframe

for line in lines:
    
    # strip "\n" from line
    line = line.rstrip('\n')
    
    # skip if comment
    if line[0] == '#':
        continue

    base = [] # [t, id, x, y, z, degree]
    rows = [] # base + [mac, signal, channel, type]

    for keyvalue in line.split(';'):
        key, value = keyvalue.split('=')

        if key in ['t', 'id', 'degree']:
            base.append(value)
        elif key == 'pos':
            # pos (x, y, z)
            base += value.split(',')
        else:
            # mac addresses and metrics (signal, channel, type)
            row = base.copy()
            row.append(key)
            row += value.split(',')
            rows.append(row)
    
    instances += rows

df = pd.DataFrame(instances, columns = column_names)

print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1181628 entries, 0 to 1181627
Data columns (total 10 columns):
time           1181628 non-null object
scanMac        1181628 non-null object
posX           1181628 non-null object
posY           1181628 non-null object
posZ           1181628 non-null object
orientation    1181628 non-null object
mac            1181628 non-null object
signal         1181628 non-null object
channel        1181628 non-null object
type           1181628 non-null object
dtypes: object(10)
memory usage: 90.2+ MB
None


Unnamed: 0,time,scanMac,posX,posY,posZ,orientation,mac,signal,channel,type
0,1139643118358,00:02:2D:21:0F:33,0.0,0.0,0.0,0.0,00:14:bf:b1:97:8a,-38,2437000000,3
1,1139643118358,00:02:2D:21:0F:33,0.0,0.0,0.0,0.0,00:14:bf:b1:97:90,-56,2427000000,3
2,1139643118358,00:02:2D:21:0F:33,0.0,0.0,0.0,0.0,00:0f:a3:39:e1:c0,-53,2462000000,3
3,1139643118358,00:02:2D:21:0F:33,0.0,0.0,0.0,0.0,00:14:bf:b1:97:8d,-65,2442000000,3
4,1139643118358,00:02:2D:21:0F:33,0.0,0.0,0.0,0.0,00:14:bf:b1:97:81,-65,2422000000,3


## Exploratory Data Analysis (EDA)

In [145]:
df['mac'].value_counts()

00:0f:a3:39:e1:c0    145862
00:0f:a3:39:dd:cd    145619
00:14:bf:b1:97:8a    132962
00:14:bf:3b:c7:c6    126529
00:14:bf:b1:97:90    122315
00:14:bf:b1:97:8d    121325
00:14:bf:b1:97:81    120339
02:00:42:55:31:00    103887
02:64:fb:68:52:e6     50852
00:0f:a3:39:e0:4b     43508
02:2e:58:22:f1:ac     25112
00:0f:a3:39:e2:10     19162
02:37:fd:3b:54:b5      8732
02:b7:00:bb:a9:35      7602
02:5c:e0:50:49:de      6997
00:04:0e:5c:23:fc       418
00:30:bd:f8:7f:c5       301
00:e0:63:82:8b:a9       103
02:0a:3d:06:94:88         1
02:4f:99:43:30:cd         1
02:42:1c:4e:b5:c0         1
Name: mac, dtype: int64

In [146]:
df.describe()

Unnamed: 0,time,scanMac,posX,posY,posZ,orientation,mac,signal,channel,type
count,1181628,1181628,1181628.0,1181628.0,1181628.0,1181628.0,1181628,1181628,1181628,1181628
unique,146074,1,34.0,14.0,1.0,203.0,21,75,10,2
top,1139648662194,00:02:2D:21:0F:33,1.0,3.0,0.0,90.3,00:0f:a3:39:e1:c0,-59,2462000000,3
freq,21,1181628,109816.0,241614.0,1181628.0,26847.0,145862,40878,189774,978443


In [147]:
macs_to_keep = ['00:0f:a3:39:e1:c0',    
            '00:0f:a3:39:dd:cd', 
            '00:14:bf:b1:97:8a',
            '00:14:bf:3b:c7:c6',    
            '00:14:bf:b1:97:90',  
            '00:14:bf:b1:97:8d',
            '00:14:bf:b1:97:81',
            '02:00:42:55:31:00']
df = df[df.mac.isin(macs_to_keep)].copy()

In [148]:
df.posX.unique()

array(['0.0', '1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0',
       '9.0', '10.0', '11.0', '12.0', '13.0', '14.0', '15.0', '16.0',
       '17.0', '18.0', '19.0', '20.0', '21.0', '22.0', '23.0', '24.0',
       '25.0', '26.0', '27.0', '28.0', '29.0', '30.0', '31.0', '32.0',
       '33.0'], dtype=object)

In [149]:
df.posY.unique()

array(['0.0', '1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0',
       '9.0', '10.0', '11.0', '12.0', '13.0'], dtype=object)

In [150]:
df.orientation.unique()

array(['0.0', '45.2', '90.3', '135.9', '180.3', '225.6', '270.1', '315.0',
       '45.1', '90.7', '135.2', '180.1', '225.7', '269.7', '315.3', '0.3',
       '45.5', '90.5', '135.7', '180.2', '225.0', '270.9', '315.2', '0.7',
       '45.7', '89.8', '225.2', '269.9', '0.4', '90.4', '134.8', '180.4',
       '0.2', '45.6', '135.0', '179.6', '225.4', '269.5', '179.2',
       '224.5', '89.9', '134.4', '269.4', '359.9', '90.1', '135.1',
       '270.2', '314.8', '313.8', '45.8', '90.8', '134.9', '225.9',
       '270.5', '315.9', '45.4', '134.7', '180.0', '224.9', '269.8',
       '314.4', '0.1', '44.3', '225.3', '315.7', '0.5', '44.8', '89.4',
       '226.0', '315.1', '0.8', '89.2', '180.8', '224.8', '315.8', '44.4',
       '180.5', '314.9', '44.9', '179.8', '315.5', '135.6', '0.6',
       '135.4', '179.9', '225.1', '270.0', '45.3', '270.4', '45.0',
       '90.6', '46.0', '0.9', '44.6', '135.3', '90.2', '134.2', '44.5',
       '134.5', '226.1', '269.6', '270.3', '315.4', '1.5', '135.8',
       

In [151]:
def round_angle(angle):
    buckets = [angle for angle in angle_buckets()]
    angle = angle if angle <= 360 else angle-360
    angles = [abs(a - angle) for a in buckets]
    sorted_angles = angles.copy()
    sorted_angles.sort()
    min_val = sorted_angles[0]
    min_index = angles.index(min_val)
    return buckets[min_index]
    

In [152]:
def angle_buckets(start = 0, end = 360, step = 45):
     for i in range(0, int(round(end/step, 0)) + 1):
            val = start + i*step
            yield val

In [153]:
df.orientation = pd.to_numeric(df['orientation'])
df['mapped_orientation'] = df['orientation'].apply(lambda x: round_angle(x))

In [154]:
df.signal = pd.to_numeric(df.signal)

In [155]:
df['xy-loc'] = df['posX'] + '-' + df['posY']
len(df['xy-loc'].unique())

166

In [156]:
df.drop(columns=['scanMac'], inplace=True)

In [157]:
grouped_df = df.groupby(['mac', 'xy-loc'])['signal'].agg(['mean']).reset_index()
pd.pivot_table(grouped_df, values='mean', columns='mac', index=['xy-loc'])

mac,00:0f:a3:39:dd:cd,00:0f:a3:39:e1:c0,00:14:bf:3b:c7:c6,00:14:bf:b1:97:81,00:14:bf:b1:97:8a,00:14:bf:b1:97:8d,00:14:bf:b1:97:90,02:00:42:55:31:00
xy-loc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0-0.0,-69.795249,-53.466063,-66.283920,-64.481534,-38.769627,-65.925656,-56.286822,-83.453469
0.0-1.0,-70.963636,-52.905575,-66.176697,-65.727273,-39.730000,-65.495640,-58.031169,-82.350669
0.0-10.0,-68.944820,-55.259681,-65.058583,-66.400564,-45.434293,-65.752941,-52.087742,-85.358044
0.0-11.0,-70.609040,-54.170648,-67.967568,-69.018717,-47.936446,-66.568282,-55.207908,-85.418312
0.0-12.0,-68.615819,-54.450000,-68.135027,-70.879947,-45.597744,-68.276946,-53.444727,-86.868009
0.0-13.0,-72.922034,-54.841090,-70.825255,-72.200269,-45.421594,-68.527820,-55.123410,-87.102908
0.0-2.0,-70.230596,-55.886621,-62.138639,-62.127941,-43.657500,-62.763456,-55.169492,-83.052469
0.0-3.0,-69.584081,-55.543182,-63.276680,-55.994413,-41.111524,-62.122020,-54.367454,-79.275000
0.0-4.0,-67.906040,-52.046644,-64.260377,-59.326027,-42.558018,-62.668056,-54.070221,-81.008052
0.0-7.0,-69.041714,-56.084091,-64.365309,-64.240713,-45.809927,-54.542254,-57.990728,-81.053279


In [162]:
from math import floor
def get_train_data_by_angle(ref_angle=225, data=df, angles=3):
    ref_angle = round_angle(ref_angle)
    assert angles <= 7
    buckets = [angle for angle in angle_buckets()]
    start_index = buckets.index(ref_angle)- int(floor(angles/2))
    keep_angles = []
    for x in range(start_index, start_index + angles):
        keep_angles.append(buckets[x])
    print(keep_angles)
    data = data[data.orientation.isin(keep_angles)].copy()
    grouped_df = data.groupby(['mac', 'xy-loc'])['signal'].agg(['mean']).reset_index()
    grouped_df = pd.pivot_table(grouped_df, values='mean', columns='mac', index=['xy-loc'])
    return grouped_df

In [167]:
get_train_data_by_angle(ref_angle=130).head().sort_values(['xy-loc'])

[90, 135, 180]


mac,00:0f:a3:39:dd:cd,00:0f:a3:39:e1:c0,00:14:bf:3b:c7:c6,00:14:bf:b1:97:81,00:14:bf:b1:97:8a,00:14:bf:b1:97:8d,00:14:bf:b1:97:90,02:00:42:55:31:00
xy-loc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0-11.0,-68.981818,-56.754545,-71.123711,-69.086957,-50.836735,-67.044944,-55.441176,-85.978022
0.0-12.0,-68.572727,-52.527273,-68.908163,-70.322222,-43.110092,-63.73494,-48.99,-83.826667
0.0-4.0,-65.535714,-52.763636,-64.817391,-61.898876,-42.61,-61.534091,-53.854167,-77.597938
1.0-3.0,-73.783784,-54.563636,-62.955556,-53.97619,-41.666667,-56.846154,-58.967742,-80.927711
1.0-9.0,-68.261261,-50.290909,-64.509615,-63.277778,-51.210526,-63.060241,-49.54,-87.918367


In [160]:
get_train_data_by_angle().head()

[180, 225, 270]


mac,00:0f:a3:39:dd:cd,00:0f:a3:39:e1:c0,00:14:bf:3b:c7:c6,00:14:bf:b1:97:81,00:14:bf:b1:97:8a,00:14:bf:b1:97:8d,00:14:bf:b1:97:90,02:00:42:55:31:00
xy-loc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0-12.0,-71.62037,-50.136364,-62.929293,-65.409091,-44.92233,-73.012346,-52.20202,-87.744681
0.0-13.0,-73.657658,-56.163636,-71.085106,-70.879121,-44.855769,-69.202247,-57.287129,-87.984615
0.0-4.0,-65.535714,-52.763636,-64.817391,-61.898876,-42.61,-61.534091,-53.854167,-77.597938
0.0-8.0,-63.857143,-47.409091,-65.645833,-58.0375,-47.528302,-47.476744,-57.938776,-79.217391
1.0-10.0,-66.318182,-51.8,-66.72449,-68.292929,-53.366337,-66.494253,-58.029703,-81.567901


In [168]:
from sklearn.cluster import SpectralClustering
def run_spectral_clustering(train
                           , cols=['longitude', 'latitude']
                           , clusters=[5]
                           , affinity_types=['nearest_neighbors']
                           , eigen_solvers=['arpack']
                           , show_map=True):

    assert(len(cols) == 2)
    results = {}
    for n_clusters in clusters:
            for affinity_type in affinity_types:
                for solver in eigen_solvers:
                    X = train[cols].copy()
                    spc = SpectralClustering(n_clusters=n_clusters
                                             , affinity = affinity_type
                                             , random_state=random_state
                                            , n_jobs=-1
                                            , eigen_solver=solver)
                    spec_fit = spc.fit(X)
                    new_feature = spc.labels_
                    y = y_train.values.ravel() # target 
                    X = X_train.drop(columns=cols) #drop the cols that we are using for clustering  
                    X = np.column_stack((X, pd.get_dummies(new_feature))) # new feature set
                    acc = cross_val_score(classifier
                                          ,X 
                                          ,y=y
                                          ,cv=cv)
                    results.update({(affinity_type, n_clusters, solver):{
                                        'cluster': spc,
                                        'solver': solver,
                                        'spec_fit':spec_fit,
                                        'xform_data':X,
                                        'target': y,
                                        'results':acc,
                                        'clusters':n_clusters}
                                   })
                    if show_map:
                        plt.scatter(X_train[cols[0]]
                                    , X_train[cols[1]]
                                    , c=new_feature
                                    , cmap=plt.cm.rainbow
                                    , s=5
                                    , linewidths=0)
                        print (f'Average accuracy = {acc.mean()*100}+-{acc.std()*100}')
                        plt.title(f'Affinity Matrix Type: {affinity_type} Clusters:{n_clusters} Cols:{cols}')
                        plt.show()
    return results 

In [170]:
SpectralClustering?