### Author: `Winston Menzies`
> #### Imperial College Business School
> #### Professional Certificate in Machine Learning and Artifical Intelligence
#### Date: `18 January 2024`
#### Student ID: `484 (Class of Sep-2023)`
#### Function: `Load Capstone Results`
#### Usage: \<TODO\>

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd
from tabulate import tabulate

In [2]:
DATA_PATH = './data/'
DATA_DIR_PREFIX = 'week_'
DATA_STUDENT_ID_SEPARATOR1 = '-'
DATA_STUDENT_ID_SEPARATOR2 = '_'
DATA_TYPE_OBSERVATIONS = 'observations'
DATA_TYPE_QUERIES = 'queries'
DATA_FILENAME = 'capstone_results.csv'
INDEX_STUDENT_FILE = 1
INDEX_WEEK_NUM = 0
INDEX_STUDENT_ID = 0
INDEX_DATA_TYPE = 1
INDEX_DATA_FILENAME = 2
INDEX_FUNCTION_NUMBER = 2
INDEX_FUNCTION_VALUE = 3
TOTAL_FUNCTIONS = 8
INDEX_INPUT_VALUE = 2
INDEX_OUTPUT_VALUE = TOTAL_FUNCTIONS + INDEX_FUNCTION_VALUE
TOTAL_COLUMNS = INDEX_OUTPUT_VALUE + 1
BLANK_VALUE = None
BLANK_SPACE = ' '*6

In [30]:
class CapstoneResultLoader(object):
    """
    Load Capstone Results
    Collate students queries and observations
    """
    def __init__(self):
        self._file_list = [] #store result files
        self._data_observations = [] #store observations
        self._data_queries = [] #store queries
        self._data_results = [] #store queries + observations 
        self._data_header = ["Week", "Student", "Fn", 
                             "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", 
                             "Y"]
        self.initialisation()
        
    def initialisation(self):
        #This method is responsible for initializing the object by calling other methods
        self._load_data()
        self._load_data_result()
        self._format_data_result()
        
    def get_data_header(self):
        return self._data_header
        
    def get_data_observations(self):
        return self._data_observations
        
    def get_data_queries(self):
        return self._data_queries
        
    def get_data_results(self):
        return self._data_results
        
    def get_data_size(self):
        dim = self._data_results.shape
        print(f'Rows = {dim[0]} Cols = {dim[1]}')
        
    def get_file_list(self):
        return self._file_list
        
    def get_student_list(self):
        list = []
        for i in range(len(self._data_queries)):
            student = self._data_queries[i][INDEX_STUDENT_FILE]
            if list.count(student) == 0:
                list.append(student)
        return list
        
    def get_week_list(self):
        list = []
        for i in range(len(self._data_queries)):
            week = self._data_queries[i][INDEX_WEEK_NUM]
            if list.count(week) == 0:
                list.append(week)
        return list
        
    def convert_to_pandas(self):
        df = pd.DataFrame(self._data_results, columns=self._data_header)
        return df
        
    def print_results(self):
        self._print_table_results(self._data_results)
        
    def print_function_results(self, num):
        list = []
        for i in range(len(self._data_results)):
            result = self._data_results[i]
            if result[INDEX_FUNCTION_NUMBER] == num:
                list.append(self._data_results[i])
        self._print_table_results(list)
        
    def print_student_results(self, num):
        list = []
        for i in range(len(self._data_results)):
            result = self._data_results[i]
            if result[INDEX_STUDENT_FILE] == num:
                list.append(self._data_results[i])
        self._print_table_results(list)
        
    def save_results(self):
        df = self.convert_to_pandas()
        df.to_csv(DATA_FILENAME, index=False)
        print(f'The capstone results {DATA_FILENAME} has successfully saved.') 


    def _load_data(self):
        #loads data files from a specified directory, and it checks if the week number is 1 before processing

        for path in pathlib.Path(DATA_PATH).iterdir():
            if path.is_dir():
                pos = path.name.find(DATA_STUDENT_ID_SEPARATOR2)
                if pos == -1:
                   data_dir = path.name
                else:
                    data_dir = path.name[pos+1:pos+3]
                
                print(f'data_dir= {data_dir}')
                # Check if the week number is 0 as the working week
                if data_dir == DATA_DIR_PREFIX + '00':
                    data_file = self._process_directory(path)
                    self._file_list.append([data_dir, data_file])
                    
        # Print out the file names
        print(self._file_list)
"""       
        # Load data files only from the "week_00" folder
        week_00_path = pathlib.Path(DATA_PATH) / "week_00"
    
        if week_00_path.is_dir():
            data_files = self._process_directory(week_00_path)
            self._file_list.append(["week_00", data_files])
    
        # Print out the file names
        print(self._file_list)
""" 

                
    def _load_data_observation(self, filename):
        #reads and parses observation data from a file
        observation = open(filename).read()
        data = np.fromstring(observation[1:-1], dtype=float, sep=',')
        return data
    
    def _load_data_query(self, filename):
        #reads and parses query data from a file
        data = open(filename).read()[1:-1]
        list = data.replace('\n', '').split("),")
        data = []
        for i in range(len(list)):
            open_bracket_pos = list[i].find('[')
            end_bracket_pos = list[i].find(']')
            query = list[i][open_bracket_pos+1:end_bracket_pos]
            data.append(np.fromstring(query, dtype=float, sep=','))
        return data

    def _load_data_result(self):
        #orchestrates the loading of queries and observations from the data files
        query_len = 0
        for i in range(len(self._file_list)):
            for j in range(len(self._file_list[i][INDEX_STUDENT_FILE])):
                student_file = self._file_list[i][INDEX_STUDENT_FILE][j]
                if student_file[INDEX_DATA_TYPE] == DATA_TYPE_OBSERVATIONS:
                    self._data_observations.append(self._load_data_observation(student_file[INDEX_DATA_FILENAME]))
                else:
                    queries = self._load_data_query(student_file[INDEX_DATA_FILENAME])
                    if len(queries) > query_len:
                        query_len = len(queries)
                    self._data_queries.append([self._file_list[i][INDEX_WEEK_NUM], 
                                               student_file[INDEX_STUDENT_ID], 
                                               queries])

    def _process_directory(self, dir):
        list = []
        for file in pathlib.Path(dir).iterdir():
            if file.is_file():
                list.append(self._format_data_file(file))
        return list

    def _print_table_results(self, list):
        table = []
        headers=["Week", "Student", "Fn", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "Y"]
        for i in range(len(list)):
            result = list[i]
            values = []
            for j in range(len(result)):
                if result[j] == BLANK_VALUE:
                    values.append(BLANK_SPACE)
                else:
                    if j > INDEX_FUNCTION_NUMBER:
                        values.append(f'{result[j]:9.4f}')
                    else:
                        values.append(result[j])
            table.append(values)
        print(tabulate(table, headers))
        
    def _format_data_result(self):
        #This method formats and processes the loaded data, combining queries and observations into a structured format.
        #It adjusts to handle varying numbers of queries for each student
        
        self._data_results = np.full((len(self._data_queries) * TOTAL_FUNCTIONS, TOTAL_COLUMNS),
                                     fill_value = BLANK_VALUE)
        
        # Print some debug information
        print(f"Total Queries: {len(self._data_queries)}")
        print(f"Total Observations: {len(self._data_observations)}")
        
        for i in range(len(self._data_queries)):
            queries = self._data_queries[i][INDEX_INPUT_VALUE]
            
            # Print some debug information
            print(f"Student {i + 1} - Total Queries: {len(queries)}")
            
            for function in range(len(queries)):
                values = queries[function]
                
                # Print some debug information
                print(f"Query {function + 1} - Values: {values}")
                
                self._data_results[(i * TOTAL_FUNCTIONS) + function, INDEX_FUNCTION_NUMBER] = function + 1
                self._data_results[(i * TOTAL_FUNCTIONS) + function, INDEX_OUTPUT_VALUE] = self._data_observations[i][function]
                 # Adjust the code to handle varying numbers of queries
                for k in range(len(values)):
                    self._data_results[(i * TOTAL_FUNCTIONS) + function, INDEX_WEEK_NUM] = self._data_queries[i][INDEX_WEEK_NUM]
                    self._data_results[(i * TOTAL_FUNCTIONS) + function, INDEX_WEEK_NUM+1] = self._data_queries[i][INDEX_WEEK_NUM+1]
                    self._data_results[(i * TOTAL_FUNCTIONS) + function, k + INDEX_FUNCTION_VALUE] = float(values[k])

    def _format_data_file(self, filename):
        #This method extracts student ID and data type information from the filename of a data file
        student_id = ""
        data_type = ""
        pos = filename.name.find(DATA_STUDENT_ID_SEPARATOR1)
        if pos == -1:
            pos = filename.name.find(DATA_STUDENT_ID_SEPARATOR2)
            if pos != -1:
                student_id = int(filename.name[0:pos])
        else:
            student_id = int(filename.name[0:pos])
        pos = filename.name.find(DATA_TYPE_OBSERVATIONS)
        if pos != -1:
            data_type = DATA_TYPE_OBSERVATIONS
        else:
            data_type = DATA_TYPE_QUERIES
        return [student_id, data_type, filename]


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 118)

In [25]:
loader = CapstoneResultLoader()

AttributeError: 'CapstoneResultLoader' object has no attribute '_format_data_file'

In [5]:
loader.print_results()

NameError: name 'loader' is not defined

In [33]:
loader.print_student_results(484)

  Week    Student    Fn      X1      X2  X3      X4      X5      X6      X7      X8              Y
------  ---------  ----  ------  ------  ------  ------  ------  ------  ------  ------  ---------
    01        484     1  0.045   0.69                                                       0
    01        484     2  0.7841  0.9619                                                     0.1746
    01        484     3  0.4667  0.1111  0.5333                                            -0.0648
    01        484     4  0.4375  0.4375  0.3750  0.4375                                     0.1608
    01        484     5  0.25    0.875   0.8750  0.8750                                  1182.07
    01        484     6  0.1111  0.2222  0.2222  0.0000  0.0000                            -1.8112
    01        484     7  0       0.4     0.4000  0.2000  0.4000  0.8000                     1.6104
    01        484     8  0       0.3333  0.0000  0.3333  0.6667  0.3333  0.3333  0.3333     9.7827


In [34]:
loader.print_function_results(1)

  Week    Student    Fn      X1      X2  X3    X4    X5    X6    X7    X8          Y
------  ---------  ----  ------  ------  ----  ----  ----  ----  ----  ----  -------
    01        428     1  0.6285  0.6285                                       2
    01        434     1  0.65    0.65                                         0.1957
    01        453     1  0.6     0.69                                        -0.0004
    01        464     1  0.65    0.65                                         0.1957
    01        484     1  0.045   0.69                                         0


In [35]:
loader.convert_to_pandas()

Unnamed: 0,Week,Student,Fn,X1,X2,X3,X4,X5,X6,X7,X8,Y
0,1,428,1,0.62854,0.62854,,,,,,,2.0
1,1,428,2,0.983984,0.999998,,,,,,,-0.143941
2,1,428,3,0.432432,0.256757,0.500001,,,,,,-0.053196
3,1,428,4,0.421053,0.368421,0.368421,0.421053,,,,,0.692067
4,1,428,5,0.310345,0.827586,0.965517,0.931034,,,,,1863.356016
5,1,428,6,0.000123,0.142857,0.142857,0.999998,0.000123,,,,-1.522572
6,1,428,7,0.000123,0.444444,0.222222,0.222222,0.444444,0.777778,,,1.170907
7,1,428,8,0.000123,0.222222,0.222222,0.222222,0.666667,0.666667,0.222222,0.666667,9.906004
8,1,434,1,0.65,0.65,,,,,,,0.195741
9,1,434,2,0.717172,0.0,,,,,,,0.568847


In [36]:
loader.get_data_results()

array([['01', 428, 1, 0.62854, 0.62854, None, None, None, None, None,
        None, 1.9999999996761817],
       ['01', 428, 2, 0.983984, 0.999998, None, None, None, None, None,
        None, -0.14394130977725078],
       ['01', 428, 3, 0.432432, 0.256757, 0.500001, None, None, None,
        None, None, -0.05319638367835561],
       ['01', 428, 4, 0.421053, 0.368421, 0.368421, 0.421053, None, None,
        None, None, 0.692066924633068],
       ['01', 428, 5, 0.310345, 0.827586, 0.965517, 0.931034, None, None,
        None, None, 1863.3560161111557],
       ['01', 428, 6, 0.000123, 0.142857, 0.142857, 0.999998, 0.000123,
        None, None, None, -1.5225719401914466],
       ['01', 428, 7, 0.000123, 0.444444, 0.222222, 0.222222, 0.444444,
        0.777778, None, None, 1.170907408098889],
       ['01', 428, 8, 0.000123, 0.222222, 0.222222, 0.222222, 0.666667,
        0.666667, 0.222222, 0.666667, 9.9060036265316],
       ['01', 434, 1, 0.65, 0.65, None, None, None, None, None, None,
    

In [37]:
loader.get_data_size()

Rows = 40 Cols = 12


In [38]:
loader.get_data_header()

['Week', 'Student', 'Fn', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'Y']

In [39]:
loader.save_results()

The capstone results capstone_results.csv has successfully saved.
