### Basic Workflow

1. Load and explore the data
2. Preprocess the data
3. Extract features
4. Train the model
5. Evaluate the model
6. Analyze model behavior

In [None]:
import os
import spacy

import numpy as np
import pandas as pd
import pickle
from typing import TypeAlias

In [None]:
# Directory holding the dataset, and the full path of the CSV read by this notebook.
FOLDER = "/Users/johnhuang/Desktop/coding/Bag_of_Words/Data"
FILEPATH = "/".join((FOLDER, "Tweets_5K.csv"))

In [None]:
def load_data(filepath: str) -> tuple[list[str], list[int]]:
    """
    Loads Twitter data into two lists.

    Parameters
    ----------
    filepath : str
        Path to a CSV file with a "text" column and a "sentiment" column whose
        values are "negative", "neutral", or "positive"

    Returns
    -------
    raw_tweets : list[str]
        A list of all Tweets in the dataset
    labels : list[int]
        A list of the sentiments corresponding to each raw tweet encoded as integers,
        -1 meaning negative, 0 meaning neutral, and 1 meaning positive

    Raises
    ------
    ValueError
        If the "sentiment" column contains any value other than "negative",
        "neutral", or "positive"
    """
    label_encoding = {"negative": -1, "neutral": 0, "positive": 1}

    dataset = pd.read_csv(filepath)
    raw_tweets = dataset["text"].astype(str).tolist()
    sentiments = dataset["sentiment"].astype(str).tolist()

    # Fail loudly on anything unexpected (missing cells, typos, different
    # casing) instead of silently counting it as a positive tweet.
    unexpected = sorted({str(s) for s in sentiments if s not in label_encoding})
    if unexpected:
        raise ValueError(
            f"Unrecognized sentiment label(s) in {filepath!r}: {unexpected}; "
            f"expected one of {sorted(label_encoding)}"
        )

    labels = [label_encoding[s] for s in sentiments]
    return (raw_tweets, labels)

In [None]:
# Read the whole dataset into two parallel lists.
raw_tweets, labels = load_data(FILEPATH)

# Eyeball the first ten examples: encoded label, a tab, then the tweet text.
for label, p in zip(labels[:10], raw_tweets[:10]):
    print(f"{label}:\t{p}\n")

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many tweets carry each encoded sentiment label.
plt.figure()
sentiment_counts = pd.Series(labels).value_counts()
ax = sentiment_counts.plot.bar(title="Sentiment Distribution in Tweets")
# Keep the tick labels horizontal so the label values are easy to read.
plt.xticks(rotation=0)
ax.set_xlabel("Sentiment")
ax.set_ylabel("Number of Tweets")
plt.show()