In [32]:
from typing import Iterator, Iterable, List, Tuple, Text, Union
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import numpy as np
import csv

In [33]:
# An NDArray can either be a numpy array (np.ndarray) or a sparse matrix (spmatrix)
NDArray = Union[np.ndarray, spmatrix]

In [34]:
def read_data(data_path: Text) -> Iterator[Tuple[Text, Text, Text]]:
    with open(data_path, 'r') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the header row if present
        for row in reader:
            yield tuple(row)

In [35]:
def prepare_data(data_path: Text) -> Tuple[List[Text], List[Text]]:
    texts = []
    labels = []
    for _, text, label in read_data(data_path):
        texts.append(text)
        labels.append(label)
    return texts, labels

In [36]:
all_data = read_data("data/train.csv")

In [37]:
#format is ID TEXT LABEL

In [38]:
class TextToFeatures:
    def __init__(self):
        """
        Initializes an object for converting texts to features.    
        """
        
        # HINT: you may want to use a sklearn vectorizer. Be sure you've
        #  worked through the sklearn tutorial in the course website and read
        #  the documentation on sklearn's vectorizers at
        # https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text
        #
        # Learning to use a library based only on reading its documentation
        #  is one of the skills that we hope you'll learn in this course!
        #
        # If you take this approach, your code here will largely just be
        #  calls to the corresponding methods of the sklearn vectorizer
        #  that you choose. Don't forget to add the import statement
        #  for your chosen vectorizer in the imports cell in the Assignment
        #  Overview.
        
        self.vectorizer = CountVectorizer(
              # case fold all text 
              # before generating n-grams
              lowercase=True,
              # optionally apply the specified function
              # before counting n-grams
              preprocessor=None,
              # optionally provide a list of tokens to remove/ignore before generating n-grams
              stop_words=None,
              # specify a range of n-grams as (min_n, max_n). 
              # (1, 1) means unigrams.
              # (1, 2) means unigrams and bigrams
              # (4, 5) means 4-grams and 5-grams
              ngram_range=(1, 2),
              # "word", "char" (character), or "char_wb" n-grams
              analyzer="word",
              # whether or not to use binary counts
              binary=False
            )

    def fit(self, training_texts: Iterable[Text]) -> None:
        """
        Fits ("trains") a TextToFeature instance on a collection of documents.
        
        The provided training texts are analyzed to determine the vocabulary, 
        i.e., all feature values that the converter will support. 
        Each such feature value will be associated with a unique integer index 
        that may later be accessed via the .index() method.

        It is up to the implementer exactly what features to produce from a
        text, but the features will always include some single words and some
        multi-word expressions (e.g., "need" and "to you").
        
        
        docs = [
            "LOL. is this u? http://supersketchyurl.com/dangerous",
            "The IRS has been trying to reach you.",
            "Enclosed is your Coyote Joe's Marketplace Rewards Card."
            "Logan I'd like to add you to my professional network on LinkedIn",
        ]
        
        t2f = TextToFeatures()
        t2f.fit(docs)

        :param training_texts: The training texts.
        """
        self.vectorizer.fit(training_texts)

        
        
    def index(self, feature: Text) -> Union[None, int]:
        """
        Returns the index in the vocabulary of the given feature value.  
        If the features isn't present, return None.

        :param feature: A feature
        :return: The unique integer index associated with the feature or None if not present.
        """
        if feature not in self.vectorizer.vocabulary_:
            return None
        else:
            return self.vectorizer.vocabulary_[feature]

    def transform(self, texts: Iterable[Text]) -> NDArray:
        """
        Creates a feature matrix from a sequence of texts.
        
        docs = [
            "LOL. is this u? http://supersketchyurl.com/dangerous",
            "The IRS has been trying to reach you.",
            "Enclosed is your Coyote Joe's Marketplace Rewards Card."
            "I'd like to add you to my professional network on LinkedIn",
        ]
        
        t2f = TextToFeatures()
        t2f.fit(docs)

        # this produces a NDArray representing our features for the provided doc
        t2f.transform(["Let's meet at Coyote Joe's at 6."])


        Each row of the matrix corresponds to one of the input texts. The value
        at index j of row i is the value in the ith text of the feature
        associated with the unique integer j.

        It is up to the implementer what the value of a feature that is present
        in a text should be, though a common choice is 1. Features that are
        absent from a text will have the value 0.

        :param texts: A sequence of texts.
        :return: A matrix, with one row of feature values for each text.
        """
        return self.vectorizer.transform(texts)


In [39]:
class TextToLabels:
    def __init__(self):
        """
        Initializes an object for converting texts to labels.
        """
        
        # HINT: As with the previous class, you may choose to use an sklearn
        #  class here. See the documentation at
        # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
        #
        # As before, be sure to add an import statement for the sklearn class
        #  that you choose in the (editable) import cell towards the top of
        #  this notebook.
        
        self.encoder = LabelEncoder()

    def fit(self, training_labels: Iterable[Text]) -> None:
        """
        Assigns each distinct label a unique integer.
        
        
        Training labels are analyzed to determine the vocabulary, 
        i.e., all labels that the converter will support. 
        Each such label will be associated with a unique integer index 
        that may later be accessed via the .index() method.

        :param training_labels: The training labels.
        """
        self.encoder.fit(training_labels)
        
    def index(self, label: Text) -> Union[None, int]:
        """Returns the index in the vocabulary of the given label.

        :param label: A label
        :return: The unique integer index associated with the label.
        """
        if label not in self.encoder.classes_:
            return None
        else:
            return np.where(self.encoder.classes_ == label)[0][0]

    def transform(self, labels: Iterable[Text]) -> NDArray:
        """
        Creates a label vector from a sequence of labels.

        Each entry in the vector corresponds to one of the input labels. The
        value at index j is the unique integer associated with the jth label.

        :param labels: A sequence of labels.
        :return: A vector, with one entry for each label.
        """
        return self.encoder.transform(labels)
        
        
    def __contains__(self, label: Text) -> bool:
        """
        Special "dunder" method to check if a label is known to the TextToLabels instance.
        
        labeler = TextToLabels()
        labeler.fit(["POSITIVE", "NEGATIVE"])

        # should be True:
        "POSITIVE" in labeler 
        
        # should be False:
        "MBOP" in labeler
        
        :return: True if the label was seen in the training data; False otherwise
        """
        # NOTE: you do not need to change this if you've implemented .index() correctly!
        return False if self.index(label) is None else True

    def inverse_transform(self, labels: NDArray) -> Iterable[Text]:
        return self.encoder.inverse_transform(labels)

In [40]:
class Classifier:
    def __init__(self):
        """
        Initalizes a logistic regression classifier.
        """

        # HINT: See the documentation at
        # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
        #  Think about the input parameter values you'll need to use.
        #
        # As before, be sure to add an import statement for the sklearn class
        #  that you choose in the (editable) import cell towards the top of
        #  this notebook.
        
        self.classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)


    def train(self, features: NDArray, labels: NDArray) -> None:
        """
        Trains the classifier using the given training examples.

        :param features: A feature matrix, where each row represents a text.
        Such matrices will typically be generated via TextToFeatures.
        :param labels: A label vector, where each entry represents a label.
        Such vectors will typically be generated via TextToLabels.
        """
        self.classifier.fit(features, labels)
    
    # just an alias for "train"
    #fit = train
    
    def predict(self, features: NDArray) -> NDArray:
        """Makes predictions for each of the given examples.

        :param features: A feature matrix, where each row represents a text.
        Such matrices will typically be generated via TextToFeatures.
        :return: A prediction vector, where each entry represents a label.
        """
        return self.classifier.predict(features)

In [43]:
print(len(training_texts))
print(len(training_labels))

70317
0


In [44]:
def process_and_predict(train_path: Text, test_path: Text) -> List[Tuple[Text, Text]]:
    training_texts, training_labels = prepare_data(train_path)
    feature_converter = TextToFeatures()
    label_converter = TextToLabels()
    feature_converter.fit(training_texts)
    label_converter.fit(training_labels)

    training_features = feature_converter.transform(training_texts)
    training_numeric_labels = label_converter.transform(training_labels)

    clf = Classifier()
    clf.train(training_features, training_numeric_labels)

    test_ids = []
    test_texts = []
    for row in read_data(test_path):
        test_id, text = row[0], row[1]
        test_ids.append(test_id)
        test_texts.append(text)

    test_features = feature_converter.transform(test_texts)
    predicted_labels = clf.predict(test_features)
    predicted_labels_text = label_converter.inverse_transform(predicted_labels)

    return list(zip(test_ids, predicted_labels_text))



In [46]:
predictions = process_and_predict("data/train.csv", "data/test.csv")

In [49]:
with open("data/attempt2.csv", 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['ID', 'LABEL'])  # Writing the header
        writer.writerows(predictions)