In [86]:
#* project_deliverable_2.ipynb
#*
#* ANLY 555 2023
#* Project <>
#*
#* Due on: 10/04/2023
#* Author(s): Landon Carpenter
#*
#*
#* In accordance with the class policies and Georgetown's
#* Honor Code, I certify that, with the exception of the
#* class resources and those items noted below, I have neither
#* given nor received any assistance on this project other than
#* the TAs, professor, textbook and teammates.
#*

import csv
import nltk
import numpy as np
import matplotlib.pyplot as plt
import os
import wordcloud

#create dataset class
class DataSet:
    """
    Base class for managing a dataset that is read from a CSV file

    Attributes:
        filename (str): the name of the file to be read in
        data (numpy.ndarray or None): the parsed rows (each row is a list of
            string fields); None until a read has succeeded
    """

    #constructor
    def __init__(self, filename):
        """
        Initializes the DataSet class

        Nothing is read here: data stays None until __readFromCSV / __load
        is called.

        Args:
            filename (str): the name of the file to be read in
        """
        self.filename = filename
        self.data = None

    def __readFromCSV(self, filename):
        """
        Reads in the data from a CSV file and stores it in self.data

        Every non-blank line is stripped of surrounding whitespace and split
        on commas. Blank lines (for example a trailing empty line at the end
        of the file) are skipped so they cannot add a one-field row that
        makes the rows ragged. Quoted fields containing commas are not
        supported, and whitespace inside a line is kept as part of the field.

        Errors are printed rather than raised; in that case self.data keeps
        whatever value it had before the call.

        Args:
            filename (str): the path of the CSV file to read
        """
        print(f"Reading from {filename}...")

        #parsed by hand on purpose (no csv module or other libraries)
        #if anything goes wrong while reading, print the error instead of raising
        try:
            with open(filename, 'r') as f:
                rows = []

                for line in f:
                    stripped = line.strip()

                    #skip blank lines so they do not become a [''] row
                    if not stripped:
                        continue

                    rows.append(stripped.split(','))

            self.data = np.array(rows)

        except Exception as e:
            print(f"Error reading from {filename}: {str(e)}")

    def __load(self, filename):
        """
        Loads the data from a CSV file

        Prints a status line and then delegates to __readFromCSV.

        Args:
            filename (str): the path of the CSV file to load
        """
        print(f"Loading {filename}...")

        self.__readFromCSV(filename)

    def getType(self):
        """
        Asks the user (through input) which type of data this is

        The answer is lowercased and stripped of surrounding whitespace
        before it is checked, and the prompt repeats until the answer is valid.

        Returns:
            str: one of 'time', 'text', 'quantitative', or 'qualitative'
        """

        #keep prompting; the loop is only left through the return below
        while True:
            data_type = input("Is this data Time Series, Text, Quantitative, or Qualitative? \nPlease type 'Time', 'Text', 'Quantitative', or 'Qualitative'.")
            #make the prompt a little more forgiving: lowercase the input and remove surrounding whitespace before checking for validity
            norm = data_type.lower().strip()

            #make sure the type is valid
            if norm in ['time', 'text', 'quantitative', 'qualitative']:
                return norm

            #if the type is not valid they will see this message and be prompted to try again
            else:
                print("Please enter a valid data type.")


    def clean(self):
        """
        Cleans the data (stub: only prints a status line)
        """
        print("Cleaning...")

    def explore(self):
        """
        Explores the data (stub: only prints a status line)
        """
        print("Exploring...")

#use inheritance to create TimeSeriesDataSet class
class TimeSeriesDataSet(DataSet):
    """
    DataSet specialization for time series data
    """

    def __init__(self, filename):
        """
        Builds a time series dataset for the given file

        All setup is delegated to the DataSet constructor.

        Args:
            filename (str): the name of the file to be read in
        """
        super().__init__(filename)

    #clean and explore below replace the generic DataSet versions

    def clean(self):
        """
        Stub for time series cleaning: only prints a status line
        """
        print("Cleaning Time Series Data Set...")

    def explore(self):
        """
        Stub for time series exploration: only prints a status line
        """
        print("Exploring Time Series Data Set...")

#use inheritance to create QuantDataSet class
class QuantDataSet(DataSet):
    """
    DataSet specialization for quantitative data
    """

    def __init__(self, filename):
        """
        Builds a quantitative dataset for the given file

        All setup is delegated to the DataSet constructor.

        Args:
            filename (str): the name of the file to be read in
        """
        super().__init__(filename)

    #clean and explore below replace the generic DataSet versions

    def clean(self):
        """
        Stub for quantitative cleaning: only prints a status line
        """
        print("Cleaning Quant Data Set...")

    def explore(self):
        """
        Stub for quantitative exploration: only prints a status line
        """
        print("Exploring Quant Data Set...")

#use inheritance to create QualDataSet class
class QualDataSet(DataSet):
    """
    DataSet specialization for qualitative data
    """

    def __init__(self, filename):
        """
        Builds a qualitative dataset for the given file

        All setup is delegated to the DataSet constructor.

        Args:
            filename (str): the name of the file to be read in
        """
        super().__init__(filename)

    #clean and explore below replace the generic DataSet versions

    def clean(self):
        """
        Stub for qualitative cleaning: only prints a status line
        """
        print("Cleaning Qual Data Set...")

    def explore(self):
        """
        Stub for qualitative exploration: only prints a status line
        """
        print("Exploring Qual Data Set...")


#create class for the classifier 
class ClassifierAlgotithm:
    """
    Base class for the classifier algorithms

    Only the framework is in place: train and test are stubs that print a
    status line.
    """

    def __init__(self):
        """
        Initializes the ClassifierAlgotithm class (no state is set up yet)
        """

    def train(self):
        """
        Stub for training: only prints a status line
        """
        print("Training...")

    def test(self):
        """
        Stub for testing: only prints a status line
        """
        print("Testing...")

#create class for simple KNN that inherits from ClassifierAlgotithm
class simpleKNNClassifier(ClassifierAlgotithm):
    """
    Simple KNN classifier

    Adds nothing of its own yet; train and test come from ClassifierAlgotithm.
    """

    def __init__(self):
        """
        Builds the simple KNN classifier by delegating to the
        ClassifierAlgotithm constructor
        """
        super().__init__()

#create class for kdTree KNN that inherits from ClassifierAlgotithm
class kdTreeKNNClassifier(ClassifierAlgotithm):
    """
    kdTree KNN classifier

    Adds nothing of its own yet; train and test come from ClassifierAlgotithm.
    """

    def __init__(self):
        """
        Builds the kdTree KNN classifier by delegating to the
        ClassifierAlgotithm constructor
        """
        super().__init__()

#create the Experiment class that will run cross validation, get a score given k, and create a confusion matrix
class Experiment:
    """
    Framework for running an experiment

    Every method is currently a stub that prints a status line.
    """

    def __init__(self):
        """
        Initializes the Experiment class (no state is set up yet)
        """

    def runCrossVal(self, k):
        """
        Stub for k-fold cross validation: only prints a status line

        Args:
            k (int): the number of folds to use
        """
        print(f"Running {k}-fold cross validation...")

    def score(self):
        """
        Stub for scoring the experiment: only prints a status line
        """
        print("Scoring...")

    def __confusionMatrix(self):
        """
        Stub for building a confusion matrix: only prints a status line

        The name is mangled, so outside the class it is reached as
        _Experiment__confusionMatrix.
        """
        print("Creating confusion matrix...")

#main function: if the py file is run directly, then perform the following demo (it is skipped when the file is imported and only its classes or functions are used, for example)
if __name__ == "__main__":

    def _demo_dataset(dataset_cls, banner, path="my_data.csv"):
        """
        Prints the banner, builds dataset_cls(path), calls every DataSet stub
        on it in order, prints a blank separator, and returns the instance
        """
        print(banner)
        instance = dataset_cls(path)
        #the private DataSet methods are name-mangled, so they are reached through their _DataSet__ names
        instance._DataSet__readFromCSV(path)
        instance._DataSet__load(path)
        instance.clean()
        instance.explore()
        print("\n")
        return instance

    def _demo_classifier(classifier_cls, banner):
        """
        Prints the banner, builds classifier_cls(), calls train and test,
        prints a blank separator, and returns the instance
        """
        print(banner)
        classifier = classifier_cls()
        classifier.train()
        classifier.test()
        print("\n")
        return classifier

    print("Running main to show framework and stubs...")
    print("\n")

    #one demo per dataset class; the banners are kept exactly as they were
    dataset = _demo_dataset(DataSet, "---Creating dataset---")
    ts_dataset = _demo_dataset(TimeSeriesDataSet, "---Creating TimeSeriesDataSet--- ")
    quant_dataset = _demo_dataset(QuantDataSet, "---Creating QuantDataSet---")
    qual_dataset = _demo_dataset(QualDataSet, "---Creating QualDataSet---")

    #one demo per classifier class
    simple_knn = _demo_classifier(simpleKNNClassifier, "---Creating simple KNN classifiers---")
    kd_tree_knn = _demo_classifier(kdTreeKNNClassifier, "---Creating kdTree KNN classifiers--- ")

    #the experiment has its own set of stubs, so it is driven inline
    print("---Creating experiment--- ")
    experiment = Experiment()
    experiment.runCrossVal(5)
    experiment.score()
    experiment._Experiment__confusionMatrix()

Running main to show framework and stubs...


---Creating dataset---
Reading from my_data.csv...
Error reading from my_data.csv: [Errno 2] No such file or directory: 'my_data.csv'
Cleaning...
Exploring...


---Creating TimeSeriesDataSet--- 
Reading from my_data.csv...
Error reading from my_data.csv: [Errno 2] No such file or directory: 'my_data.csv'
Cleaning Time Series Data Set...
Exploring Time Series Data Set...


---Creating QuantDataSet---
Reading from my_data.csv...
Error reading from my_data.csv: [Errno 2] No such file or directory: 'my_data.csv'
Cleaning Quant Data Set...
Exploring Quant Data Set...


---Creating QualDataSet---
Reading from my_data.csv...
Error reading from my_data.csv: [Errno 2] No such file or directory: 'my_data.csv'
Cleaning Qual Data Set...
Exploring Qual Data Set...


---Creating simple KNN classifiers---
Training...
Testing...


---Creating kdTree KNN classifiers--- 
Training...
Testing...


---Creating experiment--- 
Running 5-fold cross validation...
S

In [87]:
#create a DataSet for "test.csv"; the constructor only stores the filename,
#so test_object.data is still None until __readFromCSV is called
test_object = DataSet("test.csv")

In [88]:
#call the private reader through its name-mangled alias; it prints a status line and stores the parsed rows in test_object.data
test_object._DataSet__readFromCSV("test.csv")

Reading from test.csv...


In [89]:
#DataSet defines no __str__ or __repr__, so this prints the default object representation
print(test_object)

<__main__.DataSet object at 0x0000029087404ED0>


In [90]:
#display the array that __readFromCSV stored on the object
test_object.data

array([['this', ' is', ' a test', ' for', ' project',
        ' deliverable number two.']], dtype='<U24')

In [91]:
#take the first parsed row of test.csv
my_test = test_object.data[0]
#last field of that row
print(my_test[-1])
#the whole row
print(my_test)

 deliverable number two.
['this' ' is' ' a test' ' for' ' project' ' deliverable number two.']


In [95]:
#prompts until 'time', 'text', 'quantitative', or 'qualitative' is entered (case and surrounding whitespace are ignored) and returns the normalized answer
test_object.getType()

quant
Please enter a valid data type.
q
Please enter a valid data type.
qualitative


'qualitative'