In [1]:
NAME = "Katelyn Rohrer"
EMAIL = "katelynrohrer@arizona.edu"

Including all necessary imports

In [2]:
import os
import re
import numpy as np
import pandas as pd
from typing import Iterator, Iterable, List, Tuple, Text, Union

from scipy.sparse import spmatrix
from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# from sklearn.feature_extraction import text

NDArray = Union[np.ndarray, spmatrix]

#### I started by using the classes defined in our previous assignments: TextToFeatures, TextToLabels, and Classifier.
These models were helpful in defining the baseline classification system. I did, however, make some modifications to the exact models used.

For my TextToFeatures class, I found that TF-IDF worked the best.

In [3]:
class TextToFeatures:
    """
    Converts raw text into tf-idf vectors.
    """
    def __init__(self):
        """
        Creates the vectorizer.
        Word-level preprocessing where any URLs are stripped from tokenization.
        Bigram and unigram processing is used, and stop words are not dropped.
        """
        self.vect = TfidfVectorizer(lowercase=True,
                                    preprocessor=self.strip_urls,
                                    stop_words=None,
                                    ngram_range=(1, 2),
                                    analyzer="word",
                                    binary=False)

    def strip_urls(self, training_texts: str):
        """
        Helper method to remove URLs from text.
        :return: Str text without URLs
        """
        return re.sub(r"http\S+|www\S+|https\S+", "", training_texts, flags=re.IGNORECASE)

    def fit(self, training_texts: Iterable[Text]) -> None:
        """
        Fits the training data using tf-idf
        """
        self.vect.fit(training_texts)

    def transform(self, texts: Iterable[Text]) -> NDArray:
        """
        Transforms the text into vectors
        :return: NDArray of vectors from the text
        """
        return self.vect.transform(texts)

Then, we need to define IDs for our labels so that our model
can assign our vectors to them.
I made no significant changes to this class relative to my previous assignment.

In [4]:
class TextToLabels:
    """
    Converts the sentiment labels into integer values for labelling. 
    """
    def __init__(self):
        """
        Defines the LabelEncoder object.
        """
        self.encoder = LabelEncoder()

    def fit(self, training_labels: Iterable[Text]) -> None:
        """
        Learns the unique labels and assigns them IDs
        """
        self.encoder.fit(training_labels)

    def transform(self, labels: Iterable[Text]) -> NDArray:
        """
        Converts each label into their corresponding ID.
        :return: The ID labels as integers
        """
        return self.encoder.transform(labels)

Lastly, we need to define a classifier that can classify new
text as one of the labels.
Here, we use logistic regression for our analysis, as that tends to work best with tf-idf.

In [5]:
class Classifier:
    """
    Defines a logistic regression classifier that can read in
    new text and classify the sentiment as positive, neutral,
    or negative
    """
    def __init__(self):
        """
        Creates the classifier. Sets a threshold for the max
        number of iterations (for efficiency) and smooths some
        of the data.
        """
        self.classifier = LogisticRegression(max_iter=1000, class_weight="balanced", C=0.5)

    def train(self, features: NDArray, labels: NDArray) -> None:
        """
        Trains the classifier on our features and labels to be able to classify text
        """
        self.classifier.fit(features, labels)

    def predict(self, features: NDArray) -> NDArray:
        """
        Applies the model to predict the label of new text
        :return: Predicted labels for each of the features given
        """
        return self.classifier.predict(features)

    def translate(self, prediction: NDArray, labels: List[str]) -> List[str]:
        """
        Translates the numerical labels into their corresponding str values
        :return: The ID labels as strings
        """
        return [labels[idx] for idx in prediction]

Lastly, we should also define how we are going to score our model.
For simplicity, I created a wrapper function, but the baseline analysis comes from an import
from sklearn which provides precision, recall, and f1 score.

In [6]:
def score(df, predictions):
    print("Train" if df.columns.all() == train_df.columns.all() else "Test")  # this isn't the most graceful, but it works to auto-label the output
    print(classification_report(df["sentiment"], predictions, digits=4))

Now, we can start using the classes defined to train our classifier and label our data.
We start by reading in our training data, traing the classifier, then use the classifier on that data.

Note that we used a pandas dataframe here. Personally, I have a lot of experience with pandas,
and I found this to be a perfect use-case for it.

In [7]:
train_path = os.path.join("data", "train.csv")
test_path = os.path.join("data", "test.csv")
sentiments = ["negative", "neutral", "positive"]

In [8]:
train_df = pd.read_csv(train_path)
train_df["text"] = train_df["text"].fillna("")
train_df["selected_text"] = train_df["selected_text"].fillna("")

# training on the entire text, prioritizing the selected text
# (selected text exists both in text and selected_text, so selected text apprears twice)
train_df["weighted_text"] = train_df["selected_text"] + " " + train_df["text"]

model = TextToFeatures()
model.fit(train_df["weighted_text"])
train_features = model.transform(train_df["weighted_text"])

labeler = TextToLabels()
labeler.fit(train_df["sentiment"])
labels = labeler.transform(train_df["sentiment"])

clf = Classifier()
clf.train(train_features, labels)

train_labels_predicted = clf.translate(clf.predict(train_features), sentiments)

score(train_df, train_labels_predicted)

Train
              precision    recall  f1-score   support

    negative     0.8860    0.8987    0.8923      7781
     neutral     0.8570    0.8811    0.8689     11118
    positive     0.9073    0.8624    0.8843      8582

    accuracy                         0.8802     27481
   macro avg     0.8834    0.8807    0.8818     27481
weighted avg     0.8809    0.8802    0.8803     27481



We can see here that our accuracy is 88%, as well as high 80s for precision and recall, which is a fairly good metric.
Since it's not overly inflated, this implies that our model will be robust enough for the testing data.

Moving on, now using the same model, we can classify our testing data. 

In [9]:
test_df = pd.read_csv(test_path)
test_df["text"] = test_df["text"].fillna("")

test_features = model.transform(test_df["text"])

test_labels_predicted = clf.translate(clf.predict(test_features), sentiments)

score(test_df, test_labels_predicted)

Test
              precision    recall  f1-score   support

    negative     0.6810    0.6653    0.6731      1001
     neutral     0.6425    0.7140    0.6764      1430
    positive     0.7963    0.6981    0.7440      1103

    accuracy                         0.6952      3534
   macro avg     0.7066    0.6925    0.6978      3534
weighted avg     0.7014    0.6952    0.6965      3534



We can see that our numbers definitely dropped, but they are still relatively high.
Accuracy remains just under 70% and precision and recall each have similar metrics.

These were the highest stats that I was able to procure. I also attempted changing stop-words,
n-gram counts, and changing my TextToFeature and Classifier models.

Now that our testing data is classified, we move on to step 2: determining which phrase contributes most to the output.
I started by simply attempting to extract the tokens that contribute most to the output label, without consideration
for keeping a singular phrase together. I used this function primarily to get a sense of the data, not for actual output-sake.
Since I was just looking at the output manually, I stopped by output after 10 iterations.

In [10]:
def explain_prediction(features, clf: Classifier, model: TextToFeatures):
    limit = 10
    for i, df_row in test_df.iterrows():
        feature_names = model.vect.get_feature_names()
        coefs = clf.classifier.coef_

        row = features[i]
        row_indices = row.nonzero()[1]

        pred = clf.predict(row)[0]

        impacts = [(feature_names[j], coefs[pred][j]) for j in row_indices]
        top_impacts = sorted(impacts, key=lambda x: -abs(x[1]))[:5]

        print(f"{sentiments[pred].capitalize()} (actual: {df_row['sentiment']}):\n\"{df_row['text']}\"")
        for word, score in top_impacts:
            print(f"  {word}: weight={score:.4f}")
        print()

        i += 1
        if i == limit:
            break
            
explain_prediction(train_features, clf, model)

Neutral (actual: neutral):
"Last session of the day  http://twitpic.com/67ezh"
  if: weight=0.7401
  going: weight=0.5875
  have: weight=-0.1857
  if were: weight=0.1739
  were: weight=0.0829

Negative (actual: positive):
" Shanghai is also really exciting (precisely -- skyscrapers galore). Good tweeps in China:  (SH)  (BJ)."
  miss: weight=5.4277
  you: weight=-2.0338
  miss you: weight=1.0837
  will: weight=-0.4848
  SAD: weight=0.4845

Negative (actual: negative):
"Recession hit Veronique Branquinho, she has to quit her company, such a shame!"
  my: weight=1.8043
  me: weight=1.5423
  is: weight=0.7767
  my boss: weight=0.1849
  bullying me: weight=0.1481

Negative (actual: positive):
" happy bday!"
  me: weight=1.5423
  alone: weight=0.6269
  leave: weight=0.5019
  what: weight=-0.2786
  leave me: weight=0.2318

Negative (actual: positive):
" http://twitpic.com/4w75p - I like it!!"
  couldn: weight=0.8179
  already: weight=0.7928
  we: weight=-0.4450
  bought: weight=-0.3572
  them

Once I got a sense of the data, my next attempt was to keep my phrases together.
I started by extracting subsets of each tweet of all lengths (1 to length of the tweet) and scoring them on their
similarity to the label (previously defined label). Then, it determines which subset of text has the highest contribution
towards the label and defines that as the "selected text". I aggregate this data into a dataframe and return that. 

Note that longer phrases are preferred by this function (line 48), as I was initially only getting the highest contributing single token. 

In [11]:
def extract_top_phrases(df, features, clf: Classifier, model: TextToFeatures):
    """
    Reads in the data, model, and classifier, and uses them to classify each subset of the text.
    It then finds the highest scoring subset of text towards the given label and sets that as the
    selected text for that given text. This happens for all text entries within the df.
    :return: Dataframe containing only the textID and the selected text for that message. 
    """
    vocab = model.vect.vocabulary_
    coefs = clf.classifier.coef_

    results = []
    for i, row in df.iterrows():
        print("\r", i, end="")
        raw_text = row["text"]
        pred = clf.predict(features[i])[0]

        # neutral tweets don't have a key sentiment selected text
        if sentiments[pred] == "neutral":
            results.append({
                "textID": row.get("textID", i),
                "selected_text": raw_text
            })
            continue

        token_spans = list(re.finditer(r"\b\w+\b", raw_text))
        tokens = [m.group().lower() for m in token_spans]

        best_phrases = []
        for n in range(len(tokens)):
            for j in range(len(tokens) - n + 1):
                span_tokens = tokens[j:j + n]
                span = " ".join(span_tokens)
                total_score = 0.0
                count = 0

                for word in span_tokens:
                    idx = vocab.get(word)
                    if idx is not None:
                        total_score += coefs[pred][idx]
                        count += 1

                idx = vocab.get(span)
                if idx is not None:
                    total_score += coefs[pred][idx]
                    count += 1

                if count > 0:
                    avg_score = total_score / count
                    weighted_score = avg_score * n  # favor longer spans
                    best_phrases.append((j, j + n - 1, weighted_score))

        # sort the best phrases and select the top scoring oe as the selected text
        if best_phrases:
            best_phrases.sort(key=lambda x: -abs(x[2]))
            start_i, end_i, _ = best_phrases[0]
            start_char = token_spans[start_i].start()
            end_char = token_spans[end_i].end()
            selected = raw_text[start_char:end_char]
        else:
            selected = ""

        results.append({
            "textID": row.get("textID", i),
            "selected_text": selected
        })

    return pd.DataFrame(results)

In order to test this function, I used it on the training data and compared my output to the
text selected by the training data. It should iterate about 27000 times.

In [12]:
train_output = extract_top_phrases(train_df, train_features, clf, model)
train_output.to_csv("train_submission.csv", index=False)

 27480

Since the function is so slow, optionally, you can also just load it from my saved file.

In [13]:
train_output = pd.read_csv("train_submission.csv")

Lastly, I needed some way to compare the selected text that I extracted vs the selected text that
was given in the training data. On my first attempt, I tried just checking for how many words in the
actual testing data was in my phrase as well, but I quickly realized that metric inherently favored longer phrases.
In my second attempt, I wrote the following function, which uses Jaccard similarity to score the overlap between my two phrases.
I did edit my extract_top_phrases several times in attempt to get this score higher.

In [14]:
def compare_to_train():
    scores = []
    for (idx, trow), (_, orow) in zip(train_df.iterrows(), train_output.iterrows()):
        trow_words = set(trow["selected_text"].split())
        orow_words = set(str(orow["selected_text"]).split())
        overlap_score = len(trow_words & orow_words) / len(trow_words | orow_words) if trow_words | orow_words else 0
        scores.append(overlap_score)
    print(f"Average overlap of selected text: {sum(scores)/len(scores)}")
    
compare_to_train()

Average overlap of selected text: 0.5612093413182613


The score I got in the end was 0.56, which indicates that just over half of the text I selected is the same as the testing data.
I decided that this was a fairly good score, since it also relies on me previously having labelled the data correctly.

Lastly, I ran the same phrase extraction on my testing data and outputted that to my final submission file.
This file is what I would submit to Kaggle as my results. This run of `extract_top_phrases` is much faster, only about 3500 iterations.

In [15]:
test_output = extract_top_phrases(test_df, test_features, clf, model)
test_output.to_csv("test_submission.csv", index=False)

 3533