In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

In [2]:
class Attendance:
    """
    Extracts data given a directory containing reference and transient datasets as .txt files.

    On construction the reference file is parsed first, then every other .txt file found
    under the directory is parsed and aligned to the reference MAC addresses, and finally
    the Euclidean distance between each of those datasets and the reference is computed.

    :param string directory:
        The directory in which all of the recorded datasets are stored.
        Default is "streams".
    :param string reference_file:
        The name of the file intended to be used as the reference dataset.
        Default is "podium.txt".
    """

    # RSSI substituted for zero readings and for reference MAC addresses a dataset never saw
    SENSITIVITY = -95
    # number of strongest MAC addresses kept per dataset
    TOP_N = 20

    def __init__(self, directory="streams", reference_file="podium.txt"):
        self.directory = directory
        self.reference_file = reference_file
        # order matters: operational data is aligned to the reference data,
        # and the scores need both
        self.reference_data = self.dictify_data(self.reference_file)
        self.operational_data = self.dictify_op_data()
        self.location_scores = self.calc_loc_scores()


    def dictify_data(self, file):
        """
        Parses one dataset file into a {mac_address: average_rssi} dictionary.
        Called by the constructor for the reference file and, through dictify_op_data(),
        for every operational file.
        Pandas Documentation: https://pandas.pydata.org/pandas-docs/stable/reference/index.html

        Once the reference data attribute exists (and is non-empty), the result is
        restricted to the reference MAC addresses, and any reference MAC address absent
        from the file is added with the sensitivity value, so that every operational
        dictionary carries exactly the reference keys.

        :param string file:
            The name of the file that contains data to be cleaned.
            Must be inside of the object's directory
            e.g. "podium.txt" or "first_row.txt".

        :return:
            A dictionary containing MAC addresses as keys and their respective received signal
            strength indicators as values. {"a1:b2:c3:d4:e5:f6": 0, ... "f6:e5:d4:c3:b2:a1": -95}
        :rtype: dict
        """

        file_path = os.path.join(self.directory, file)
        # header=None keeps the first row as data and labels the columns 0..n-1
        raw = pd.read_csv(file_path, header=None)
        # drop the three leading columns and the trailing column (device id, latitude,
        # longitude & date); what remains is alternating mac address / rssi columns
        pairs = raw.iloc[:, 3:-1]
        # stack every (mac address, rssi) column pair into one long two-column frame;
        # the labels surviving the iloc above start at 3, hence col+3 / col+4
        df = pd.concat([
            pairs.iloc[:, col:col + 2].rename(columns={
                col + 3: "mac_address",
                col + 4: "average_rssi",
            })
            for col in range(0, len(pairs.columns), 2)
        ], ignore_index=True)

        # a reading of 0 is replaced with the sensitivity value
        df["average_rssi"] = df["average_rssi"].replace(0, self.SENSITIVITY)
        # average rssi for each unique mac address
        df = df.groupby("mac_address", as_index=False)["average_rssi"].mean()

        # the attribute does not exist yet while the reference file itself is being parsed
        reference = getattr(self, "reference_data", None)
        if reference:
            # keep only the mac addresses that are part of the reference data
            df = df[df["mac_address"].isin(list(reference))]
            present = set(df["mac_address"])
            # exact membership test (not a regex/substring match)
            missing = [mac for mac in reference if mac not in present]
            if missing:
                filler = pd.DataFrame({
                    "mac_address": missing,
                    "average_rssi": [self.SENSITIVITY] * len(missing),
                })
                # DataFrame.append was removed in pandas 2.0; concat works on all versions
                df = pd.concat([df, filler], ignore_index=True)

        # strongest average rssi first; a stable sort keeps ties in a deterministic order
        df = df.sort_values(
            by="average_rssi", ascending=False, kind="mergesort", ignore_index=True
        ).head(self.TOP_N)

        # index by mac address so to_dict() returns {mac_address: average_rssi}
        return df.set_index("mac_address")["average_rssi"].to_dict()


    def dictify_op_data(self):
        """
        Called by constructor to create the operational data attribute.
        Walks the object's directory recursively and parses every .txt file
        except the reference file.
        OS Documentation: https://docs.python.org/3/library/os.html

        :return:
            A 2-D dictionary containing file paths (relative to the directory) as keys with
            dictionaries containing MAC addresses and their respective received signal
            strength indicators as values.
            {"file_1.txt": {"a1:b2:c3:d4:e5:f6": 0,  ... "f6:e5:d4:c3:b2:a1": -95}, ...
             "file_n.txt": {"b2:c3:d4:e5:f6:a1": -45,  ... "c3:d4:e5:f6:a1:b2": -90}}
        :rtype: dict
        """

        dictified = {}
        reference_path = os.path.normpath(self.reference_file)

        for root, _subdirs, files in os.walk(self.directory):
            for file in files:
                if not file.endswith(".txt"):
                    continue
                # path relative to the data directory; doubles as the dictionary key
                rel_path = os.path.relpath(os.path.join(root, file), self.directory)
                # skip the reference file, whether it was given as a bare name
                # or as a path relative to the directory
                if file == self.reference_file or os.path.normpath(rel_path) == reference_path:
                    continue
                dictified[rel_path] = self.dictify_data(rel_path)

        return dictified


    def calc_loc_scores(self):
        """
        Called by constructor to create the location scores attribute.
        Calculates the Euclidean distance between reference and operational data.
        NumPy Documentation: https://numpy.org/doc/stable/reference/index.html

        Both vectors are built over the reference MAC addresses in the same sorted order,
        so each component compares the same MAC address. A MAC address missing from an
        operational dictionary counts as the sensitivity value.

        :return:
            A dictionary containing file names as keys and their respective calculated
            Euclidean distances as values. {"file_1.txt": 50.125, ... "file_n": 93.725}
        :rtype: dict
        """

        location_scores = {}

        macs = sorted(self.reference_data)
        ref = np.array([self.reference_data[mac] for mac in macs], dtype=float)
        for file, ms_pair in self.operational_data.items():
            # index by the reference keys rather than the dataset's own keys so the
            # vectors can never be misaligned or of different length
            op = np.array([ms_pair.get(mac, self.SENSITIVITY) for mac in macs], dtype=float)
            location_scores[file] = np.linalg.norm(ref - op)

        return location_scores

In [3]:
# Parse everything under "input", using "instructor.txt" as the reference dataset.
# The constructor reads the files and computes all three attributes shown below.
att = Attendance("input", "instructor.txt")

In [4]:
# Reference dataset: MAC address -> mean RSSI, strongest first (at most the top 20).
att.reference_data

{'5c:5a:c7:6e:d9:24': -58.84705882352941,
 '5c:5a:c7:6e:d9:23': -59.15294117647059,
 '5c:5a:c7:6e:d9:22': -59.21176470588235,
 '5c:5a:c7:6e:d9:21': -59.305882352941175,
 '5c:5a:c7:6e:d9:20': -61.23529411764706,
 '5c:5a:c7:6e:d9:2f': -61.55294117647059,
 '5c:5a:c7:6e:d9:2d': -61.65882352941176,
 '5c:5a:c7:6e:d9:2b': -61.67058823529412,
 '5c:5a:c7:6e:d9:2c': -61.68235294117647,
 '5c:5a:c7:6e:d9:2e': -61.694117647058825,
 '5c:5a:c7:6b:76:cf': -66.67058823529412,
 '5c:5a:c7:6b:76:cc': -66.82352941176471,
 '5c:5a:c7:6b:76:cb': -66.85882352941177,
 '5c:5a:c7:6b:76:cd': -66.88235294117646,
 '5c:5a:c7:6b:76:ce': -66.89411764705882,
 'b0:39:56:f8:10:7a': -67.8,
 'b0:0c:d1:07:31:41': -69.68235294117648,
 '5c:5a:c7:6b:76:c3': -70.27058823529411,
 '62:6d:c7:29:d1:a5': -70.32941176470588,
 '5c:5a:c7:6b:76:c4': -70.41176470588235}

In [5]:
# One {MAC address: mean RSSI} dictionary per recorded file, keyed by the file's
# path relative to the data directory.
att.operational_data

{'inside/second_row.txt': {'5c:5a:c7:6e:d9:2f': -43.285714285714285,
  '5c:5a:c7:6e:d9:2b': -43.642857142857146,
  '5c:5a:c7:6e:d9:2c': -43.714285714285715,
  '5c:5a:c7:6e:d9:2d': -43.714285714285715,
  '5c:5a:c7:6e:d9:2e': -44.0,
  '5c:5a:c7:6e:d9:24': -45.0,
  '5c:5a:c7:6e:d9:23': -45.07142857142857,
  '5c:5a:c7:6e:d9:21': -45.07142857142857,
  '5c:5a:c7:6e:d9:22': -45.07142857142857,
  '5c:5a:c7:6e:d9:20': -45.785714285714285,
  'b0:0c:d1:07:31:41': -61.357142857142854,
  '5c:5a:c7:6b:76:cd': -76.92857142857143,
  '5c:5a:c7:6b:76:cb': -77.0,
  '5c:5a:c7:6b:76:cc': -77.0,
  '5c:5a:c7:6b:76:cf': -77.0,
  '5c:5a:c7:6b:76:ce': -77.07142857142857,
  '5c:5a:c7:6b:76:c3': -78.42857142857143,
  '5c:5a:c7:6b:76:c4': -78.78571428571429,
  '62:6d:c7:29:d1:a5': -78.85714285714286,
  'b0:39:56:f8:10:7a': -79.21428571428571},
 'inside/third_row_right.txt': {'5c:5a:c7:6e:d9:2f': -48.7,
  '5c:5a:c7:6e:d9:2b': -49.4,
  '5c:5a:c7:6e:d9:2c': -49.4,
  '5c:5a:c7:6e:d9:2d': -49.4,
  '5c:5a:c7:6e:d9:2e': 

In [6]:
# Euclidean distance between each file's RSSI values and the reference dataset's;
# a smaller distance means the RSSI values are closer to the reference.
att.location_scores

{'inside/second_row.txt': 59.7872608586255,
 'inside/third_row_right.txt': 39.023606997741595,
 'inside/fourth_row_right.txt': 38.11172097482883,
 'inside/fourth_row_left.txt': 47.18995981695632,
 'inside/room_center.txt': 39.533510426271945,
 'inside/third_row_left.txt': 57.234478377727704,
 'inside/first_row_right.txt': 32.92776681214981,
 'inside/first_row_left.txt': 41.49335483229382,
 'outside/at_the_door.txt': 19.935480272680216,
 'outside/classroom_2044.txt': 105.12219876644109,
 'outside/cs_dept.txt': 99.72432634948848,
 'outside/classroom_2003.txt': 120.61572546305005,
 'outside/study_room.txt': 108.33825361907877,
 'outside/red_chairs.txt': 53.84855970187953}