In [197]:
#* project_deliverable_2.ipynb
#*
#* ANLY 555 2023
#* Project <>
#*
#* Due on: 10/04/2023
#* Author(s): Landon Carpenter
#*
#*
#* In accordance with the class policies and Georgetown's
#* Honor Code, I certify that, with the exception of the
#* class resources and those items noted below, I have neither
#* given nor received any assistance on this project other than
#* the TAs, professor, textbook and teammates.
#*

import csv
import nltk
import numpy as np
import matplotlib.pyplot as plt
import os
import wordcloud

#create dataset class
class DataSet:
    """
    Class for managing the dataset
    
    Attribute:
        filename (str): the name of the file to be read in
    """

    #constructor
    def __init__(self, filename, ):
        """
        Initializes the DataSet class
        """
        self.filename = filename
        self.data = None

    #create the framework and stubs for __readFromCSV, __load, clean, and explore
    def __readFromCSV(self, filename, header = True):
        """
        Reads in the data from a CSV file. The data is stored on a column basis similar to a parquet file to more easily account for data types.
        """
        try:
            with open(filename, 'r') as f:
                reader = csv.reader(f)
                data = list(reader)

                if header:
                    header = data[0]
                    data = data[1:]
                else:
                    header = [f"col_{i}" for i in range(len(data[0]))]

                #init the dict to store the data
                columns = {col_name: [] for col_name in header}

                for row in data:
                    for col_name, value in zip(header, row):
                        try:
                            columns[col_name].append(float(value))
                        except:
                            columns[col_name].append(value)
                            #if the value is '' then replace it with np.nan
                            if value == '':
                                columns[col_name][-1] = np.nan
                            else:
                                pass


                #ok now convert to numpy array
                d_type = [(col_name, object if any(isinstance(val, str) for val in columns[col_name]) else float) for col_name in header]
                self.data = np.array(list(zip(*[columns[col_name] for col_name in header])), dtype = d_type)

        except Exception as e:
            print(f"Error reading {filename}: {str(e)}")

                    
                        



    #abstract base class (ABC)
    def __load(self, filename):
        """
        Loads the data from a CSV file
        """
        print(f"Loading {filename}...")

        self.__readFromCSV(filename)

    def getType(self):
        """
        This function will be called later in each of the child classes to determine the type of data
        """

        #using while True to avoid infinite loop that I had earlier
        while True:
            data_type = input("Is this data Time Series, Text, Quantitative, or Qualitative? \nPlease type 'Time', 'Text', 'Quantitative', or 'Qualitative'.")
            #trying to make the prompt a little more forgiving by making the input lowercase and removing whitespace before checking for validity
            norm = data_type.lower().strip()

            #make sure the type is valid
            if norm in ['time', 'text', 'quantitative', 'qualitative']:
                return norm

            #if the type is not valid they will see this message and be prompted to try again
            else:
                print("Please enter a valid data type.")


    def clean(self):
        """
        Cleans the data
        """
        print("Cleaning...")

    def explore(self):
        """
        Explores the data
        """
        print("Exploring...")
        

#use inheritance to create TimeSeriesDataSet class
class TimeSeriesDataSet(DataSet):
    """
    Class for managing the time series dataset
    """

    #constructor
    def __init__(self, filename):
        """
        Initializes the TimeSeriesDataSet class
        """
        super().__init__(filename)
    #override the clean and explore methods from the DataSet class to be specific to the TimeSeriesDataSet class
    def clean(self):
        """
        Cleans the time series data set
        """
        print("Cleaning Time Series Data Set...")
    def explore(self):
        """
        Explores the time series data set
        """
        print("Exploring Time Series Data Set...")


class TextDataSet(DataSet):
    """
    Class for managing the text dataset. 
    """

    def __init__(self, filename):
        """
        Initializes the TextDataSet class
        """
        super().__init__(filename)

    def clean(self):
        """
        Cleans the text data set
        """
        print("Cleaning Text Data Set...")

    def explore(self):
        """
        Explores the text data set
        """
        print("Exploring Text Data Set...")


#use inheritance to create QuantDataSet class
class QuantDataSet(DataSet):
    """
    Class for managing the quantitative dataset
    """

    #constructor
    def __init__(self, filename):
        """
        Initializes the QuantDataSet class
        """
        super().__init__(filename)
        self.filename = filename
        self.data = None

    #override the clean and explore methods from the DataSet class to be specific to the QuantDataSet class
    def clean(self, header = True):
        """
        Cleans the quantitative data set
        """
        try: 
            if self.data is None:
                self._DataSet__load(self.filename)

            print("Cleaning Quant Data Set...")

            #iterate by column replacing missing values with the mean
            for col_name in self.data.dtype.names:
                col_data = self.data[col_name]

                #if the data is numeric
                if np.issubdtype(col_data.dtype, np.number):
                    #replace missing values with the mean limited to 2 decimal places
                    col_data[np.isnan(col_data)] = np.round(np.nanmean(col_data), 2)
                    
        except Exception as e:
            print(f"Error cleaning {self.filename}: {str(e)}")

    def explore(self):
        """
        Explores the quantitative data set
        """
        print("Exploring Quant Data Set...")

#use inheritance to create QualDataSet class
class QualDataSet(DataSet):
    """
    Class for managing the qualitative dataset
    """

    #constructor
    def __init__(self, filename):
        """
        Initializes the QualDataSet class
        """
        super().__init__(filename)
    #override the clean and explore methods from the DataSet class to be specific to the QualDataSet class
    def clean(self):
        """ 
        Cleans the qualitative data set
        """
        print("Cleaning Qual Data Set...")
    def explore(self):
        """
        Explores the qualitative data set
        """
        print("Exploring Qual Data Set...")


#create class for the classifier 
class ClassifierAlgotithm:
    """
    Class for managing the classifier algorithm
    """
    def __init__(self):
        """
        Initializes the ClassifierAlgotithm class
        """
        pass

    def train(self):
        """
        Trains the classifier algorithm
        """
        print("Training...")

    def test(self):
        """
        Tests the classifier algorithm
        """
        print("Testing...")

#create class for simpe KNN that inherets from ClassifierAlgotithm
class simpleKNNClassifier(ClassifierAlgotithm):
    """
    Class for managing the simple KNN classifier
    """
    def __init__(self):
        """
        Initializes the simpleKNNClassifier class
        """
        super().__init__()

#create class for kdTree KNN that inherets from ClassifierAlgotithm
class kdTreeKNNClassifier(ClassifierAlgotithm):
    """
    Class for managing the kdTree KNN classifier
    """
    def __init__(self):
        super().__init__()

#create the Experiment class that will run cross validation, get a score given k and, and create a confusion matrix
class Experiment:
    """
    Class for managing the experiment
    """

    def __init__(self):
        """
        Initializes the Experiment class
        """
        pass

    def runCrossVal(self, k):
        """
        Runs k-fold cross validation

        Args:
            k (int): the number of folds to use
        """
        print(f"Running {k}-fold cross validation...")

    def score(self):
        """
        Scores the experiment
        """
        print("Scoring...")

    def __confusionMatrix(self):
        """
        Creates a confusion matrix
        """
        print("Creating confusion matrix...")

#main function: if the py file is ran directly then run the main function and perform the following (rather than importing the file and using a function, for example)
if __name__ == "__main__":
    print("Running main to show framework and stubs...")
    print("\n")
    print("---Creating dataset---")
    dataset = DataSet("my_data.csv")
    dataset._DataSet__readFromCSV("my_data.csv")
    dataset._DataSet__load("my_data.csv")
    dataset.clean()
    dataset.explore()
    print("\n")
    
    print("---Creating TimeSeriesDataSet--- ")
    ts_dataset = TimeSeriesDataSet("my_data.csv")
    ts_dataset._DataSet__readFromCSV("my_data.csv")
    ts_dataset._DataSet__load("my_data.csv")
    ts_dataset.clean()
    ts_dataset.explore()
    print("\n")

    print("---Creating QuantDataSet---")
    quant_dataset = QuantDataSet("my_data.csv")
    quant_dataset._DataSet__readFromCSV("my_data.csv")
    quant_dataset._DataSet__load("my_data.csv")
    quant_dataset.clean()
    quant_dataset.explore()
    print("\n")

    print("---Creating QualDataSet---")
    qual_dataset = QualDataSet("my_data.csv")
    qual_dataset._DataSet__readFromCSV("my_data.csv")
    qual_dataset._DataSet__load("my_data.csv")
    qual_dataset.clean()
    qual_dataset.explore()
    print("\n")

    print("---Creating simple KNN classifiers---")
    simple_knn = simpleKNNClassifier()
    simple_knn.train()
    simple_knn.test()
    print("\n")

    print("---Creating kdTree KNN classifiers--- ")
    kd_tree_knn = kdTreeKNNClassifier()
    kd_tree_knn.train()
    kd_tree_knn.test()
    print("\n")

    print("---Creating experiment--- ")
    experiment = Experiment()
    experiment.runCrossVal(5)
    experiment.score()
    experiment._Experiment__confusionMatrix()

Running main to show framework and stubs...


---Creating dataset---
Error reading my_data.csv: [Errno 2] No such file or directory: 'my_data.csv'
Loading my_data.csv...
Error reading my_data.csv: [Errno 2] No such file or directory: 'my_data.csv'
Cleaning...
Exploring...


---Creating TimeSeriesDataSet--- 
Error reading my_data.csv: [Errno 2] No such file or directory: 'my_data.csv'
Loading my_data.csv...
Error reading my_data.csv: [Errno 2] No such file or directory: 'my_data.csv'
Cleaning Time Series Data Set...
Exploring Time Series Data Set...


---Creating QuantDataSet---
Error reading my_data.csv: [Errno 2] No such file or directory: 'my_data.csv'
Loading my_data.csv...
Error reading my_data.csv: [Errno 2] No such file or directory: 'my_data.csv'
Loading my_data.csv...
Error reading my_data.csv: [Errno 2] No such file or directory: 'my_data.csv'
Cleaning Quant Data Set...
Error cleaning my_data.csv: 'NoneType' object has no attribute 'dtype'
Exploring Quant Data Set...


---Crea

In [198]:
#read in "test.csv" as my_test using DataSet and __readFromCSV
test_object = DataSet("Quant_Test.csv")
vars(test_object)

{'filename': 'Quant_Test.csv', 'data': None}

In [199]:
quant_test = QuantDataSet("Quant_Test.csv")
print(f"Quant Test: {quant_test.filename}")
print(f"Quant Test: {quant_test.data}")
quant_test.clean()
print(f"Quant Test: {quant_test.filename}")
print(f"Quant Test: {quant_test.data}")
print("---Vars---")
print(vars(quant_test))
print(type(quant_test.data[0][0]))
print(type(quant_test.data[1][0]))
print(type(quant_test.data[1][1]))


Quant Test: Quant_Test.csv
Quant Test: None
Loading Quant_Test.csv...
Cleaning Quant Data Set...
Quant Test: Quant_Test.csv
Quant Test: [('P1', 11.  , 12., 10.  , 8., 13., 12., 14., 21.,  9.33, 14., 11., 14., 16., 9.,  9., 9., 3., 21., 0.44, 0.5 , 0.39, 0.28, 0.56, 0.5 , 0.61, 1.  , 0.56, 0.61)
 ('P2',  7.  ,  6., 10.33, 2.,  7.,  1.,  6.,  3.,  3.  ,  3., 11.,  2.,  6., 2.,  0., 6., 0., 10., 0.7 , 0.6 , 0.3 , 0.34, 0.7 , 0.1 , 0.6 , 0.3 , 0.3 , 0.3 )
 ('P3',  7.  , 11.,  8.  , 9., 10.,  8.,  7., 13., 12.  ,  6., 14.,  9.,  9., 7., 12., 8., 3., 14., 0.36, 0.73, 0.45, 0.55, 0.64, 0.45, 0.36, 0.91, 0.82, 0.48)
 ('P4',  8.33,  8., 13.  , 5.,  9.,  6.,  9., 13., 13.  , 11.,  8.,  4.,  5., 4., 15., 7., 2., 19., 0.59, 0.35, 0.65, 0.18, 0.41, 0.24, 0.52, 0.65, 0.56, 0.53)]
---Vars---
{'filename': 'Quant_Test.csv', 'data': array([('P1', 11.  , 12., 10.  , 8., 13., 12., 14., 21.,  9.33, 14., 11., 14., 16., 9.,  9., 9., 3., 21., 0.44, 0.5 , 0.39, 0.28, 0.56, 0.5 , 0.61, 1.  , 0.56, 0.61),
      

In [196]:
#save the cleaned data to a new csv file
np.savetxt("Quant_Test_Cleaned.csv", quant_test.data, delimiter = ",", fmt = "%s")

In [149]:
test_object._DataSet__readFromCSV(test_object.filename)

Reading from Quant_Test.csv...


In [112]:
print(test_object)

<__main__.DataSet object at 0x0000029087207A50>


In [113]:
test_object.data

array([['Product_Code', 'W0', 'W1', 'W2', 'W3', 'W4', 'W5', 'W6', 'W7',
        'W8', 'W9', 'W10', 'W11', 'W12', 'W13', 'W14', 'W15', 'MIN',
        'MAX', 'Normalized 0', 'Normalized 1', 'Normalized 2',
        'Normalized 3', 'Normalized 4', 'Normalized 5', 'Normalized 6',
        'Normalized 7', 'Normalized 8', 'Normalized 9'],
       ['P1', '11', '12', '10', '8', '13', '12', '14', '21', '', '14',
        '11', '14', '16', '9', '9', '9', '3', '21', '0.44', '0.5',
        '0.39', '0.28', '0.56', '0.5', '0.61', '1', '', '0.61'],
       ['P2', '7', '6', '', '2', '7', '1', '6', '3', '3', '3', '', '2',
        '6', '2', '0', '6', '0', '10', '0.7', '0.6', '0.3', '', '0.7',
        '0.1', '0.6', '0.3', '0.3', '0.3'],
       ['P3', '7', '11', '8', '9', '10', '8', '7', '13', '12', '6', '14',
        '9', '', '7', '12', '8', '3', '14', '0.36', '0.73', '0.45',
        '0.55', '0.64', '0.45', '0.36', '0.91', '0.82', ''],
       ['P4', '', '8', '13', '5', '9', '6', '', '13', '13', '11', '8',
   

In [116]:
print(test_object.data[0])

['Product_Code' 'W0' 'W1' 'W2' 'W3' 'W4' 'W5' 'W6' 'W7' 'W8' 'W9' 'W10'
 'W11' 'W12' 'W13' 'W14' 'W15' 'MIN' 'MAX' 'Normalized 0' 'Normalized 1'
 'Normalized 2' 'Normalized 3' 'Normalized 4' 'Normalized 5'
 'Normalized 6' 'Normalized 7' 'Normalized 8' 'Normalized 9']


In [115]:
my_test = test_object.data
print(my_test)

[['Product_Code' 'W0' 'W1' 'W2' 'W3' 'W4' 'W5' 'W6' 'W7' 'W8' 'W9' 'W10'
  'W11' 'W12' 'W13' 'W14' 'W15' 'MIN' 'MAX' 'Normalized 0' 'Normalized 1'
  'Normalized 2' 'Normalized 3' 'Normalized 4' 'Normalized 5'
  'Normalized 6' 'Normalized 7' 'Normalized 8' 'Normalized 9']
 ['P1' '11' '12' '10' '8' '13' '12' '14' '21' '' '14' '11' '14' '16' '9'
  '9' '9' '3' '21' '0.44' '0.5' '0.39' '0.28' '0.56' '0.5' '0.61' '1' ''
  '0.61']
 ['P2' '7' '6' '' '2' '7' '1' '6' '3' '3' '3' '' '2' '6' '2' '0' '6' '0'
  '10' '0.7' '0.6' '0.3' '' '0.7' '0.1' '0.6' '0.3' '0.3' '0.3']
 ['P3' '7' '11' '8' '9' '10' '8' '7' '13' '12' '6' '14' '9' '' '7' '12'
  '8' '3' '14' '0.36' '0.73' '0.45' '0.55' '0.64' '0.45' '0.36' '0.91'
  '0.82' '']
 ['P4' '' '8' '13' '5' '9' '6' '' '13' '13' '11' '8' '4' '5' '4' '15' '7'
  '2' '19' '0.59' '0.35' '0.65' '0.18' '0.41' '0.24' '' '0.65' '' '0.53']]


In [95]:
test_object.getType()

quant
Please enter a valid data type.
q
Please enter a valid data type.
qualitative


'qualitative'