### Imports

In [1]:
import nltk
import numpy as np

### Helper functions

In [2]:
def load_text_file(file_path: str) -> np.ndarray[str]:
    """
    Load a text file and return an array of lines from the file.

    Args:
        file_path: str: The path to the file to load.

    Returns:
        np.ndarray: An array of lines from the file.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()
    return np.array([line.strip() for line in lines])

### Load the text files

In [3]:
train_text_path: str = "../dataset/train_text.txt"
train_label_path: str = "../dataset/train_labels.txt"

test_text_path: str = "../dataset/test_text.txt"
test_label_path: str = "../dataset/test_labels.txt"

validationt_text_path: str = "../dataset/val_text.txt"
validationt_label_path: str = "../dataset/val_labels.txt"

train_text: np.ndarray[str] = load_text_file(train_text_path)
train_label: np.ndarray[str] = load_text_file(train_label_path)

test_text: np.ndarray[str] = load_text_file(test_text_path)
test_label: np.ndarray[str] = load_text_file(test_label_path)

validationt_text: np.ndarray[str] = load_text_file(validationt_text_path)
validationt_label: np.ndarray[str] = load_text_file(validationt_label_path)

## Kelvin's Part

Imports will be moved to the top of this notebook only after the model is accepted

### Preprocessing

In [4]:
import operator
from nltk.sentiment import SentimentIntensityAnalyzer
import sklearn
# Feature selection
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
# For result
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score

#### Functions

In [5]:
def get_list_tokens(sentence: str) -> list[str]:
    """
    Tokenize a sentence into words.

    Args:
        sentence: a string.

    Returns:
        list: A list of tokenized strings.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    sentence_split: list[str] = nltk.tokenize.sent_tokenize(sentence)
    list_tokens: list[str] = []
    for substring in sentence_split:
        list_tokens_sentence: list[str] = nltk.tokenize.word_tokenize(substring)
        for token in list_tokens_sentence:
            list_tokens.append(lemmatizer.lemmatize(token).lower())
    return list_tokens


def get_vector_text(list_vocab: list[str], input_string: str) -> np.ndarray[float]:
    """
    Generate a vector representation of the input string based on word frequency.

    Args:
        list_vocab: A list of vocabulary words.
        input_string: The input string.

    Returns:
        np.ndarray: A NumPy array of float representing the vectorized text.
    """
    vector_text: np.ndarray = np.zeros(len(list_vocab))
    list_tokens_string: np.ndarray[str] = get_list_tokens(input_string)
    for i, word in enumerate(list_vocab):
        if word in list_tokens_string:
            vector_text[i] = list_tokens_string.count(word)
    return np.array(vector_text)


def get_sentiment(line: str) -> list[float]:
    """
    Get the score of a word based on sentiment analysis from nltk.

    I (Kelvin) am not using the analyzer directly on the sentence because it
    cannot demonstrate our expertise in applied machine learning.
    I have asked the professor and he allowed us to use SentimentIntensityAnalyzer.

    Args:
        line: A sentence.

    Returns:
        list[float]: The compound score of 10 words in the sentence which has
        the most significant score (far from 0). If the sentence has less than
        10 words, the value of the remaining elements will be 0.

        The size of the list is always 10.
    """
    scores: list[float] = []
    analyzer = SentimentIntensityAnalyzer()
    for word in get_list_tokens(line):
        score: dict[str, float] = analyzer.polarity_scores(word)
        scores.append(score["compound"])

    retult: list[float] = [0.0] * 10

    # Get the 10 most significant scores
    top_10: list[float] = sorted(scores, key=lambda x: abs(x), reverse=True)[:10]

    # Add 0.0 if the length is less than 10
    top_10 += [0.0] * (10 - len(top_10))

    return top_10

In [6]:
# Download NLTK resources
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("vader_lexicon", quiet=True)

# Create a set of stopwords
stopwords = set(nltk.corpus.stopwords.words("english"))
stopwords.update({".", ",", "--", "``", "''", "@", "#", ":", ";"})
stopwords.update({"&", "(", ")", "-"})

In [7]:
# Create dict_word_frequency
# word is the key, count of the word is the value
dict_word_frequency: dict[str, int] = {}
for line in train_text:
    sentence_tokens: list[str] = get_list_tokens(line)
    for word in sentence_tokens:
        if word in stopwords:
            continue
        if word not in dict_word_frequency:
            dict_word_frequency[word] = 1
        else:
            dict_word_frequency[word] += 1

# Get the top 1000 most frequent words
sorted_list: list[tuple[str, int]] = sorted(
    dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True
)[:1000]

# Show the top 25 most frequent words and their counts
i = 0  # Reset i
for word, frequency in sorted_list[:25]:
    i += 1
    print(f"{str(i)}. {word} - {str(frequency)}")

vocabulary: list[str] = [word for word, _ in sorted_list]

1. ! - 16866
2. user - 16855
3. 's - 8444
4. ... - 7467
5. ? - 7240
6. tomorrow - 7201
7. may - 6636
8. day - 4080
9. n't - 3604
10. wa - 3507
11. night - 3044
12. going - 3028
13. see - 2842
14. friday - 2697
15. sunday - 2611
16. 1st - 2609
17. time - 2564
18. like - 2507
19. get - 2477
20. 'm - 2221
21. go - 2138
22. saturday - 2101
23. amp - 2013
24. game - 1960
25. one - 1960


In [8]:
x_train = []
y_train = []

for i, line in enumerate(train_text):
    # Feature 1: Word frequency
    vector_pos: np.ndarray[float] = get_vector_text(vocabulary, line)
    # Feature 2: Score from SentimentIntensityAnalyzer by nltk
    # This takes around 4 min on my PC (i5-12400)
    score: np.ndarray[float] = np.array(get_sentiment(line))
    x_train.append(np.concatenate((score, vector_pos)))
    y_train.append(float(train_label[i]))

x_train: np.ndarray[np.ndarray[np.float64]] = np.asarray(x_train)
y_train: np.ndarray[np.float64] = np.asarray(y_train)

# Feature selection
feature_selection = SelectKBest(k=500).fit(x_train, y_train)

# Keep for later use, to select the same index on the test set
selected_indices = feature_selection.get_support(indices=True)
x_train = x_train[:, selected_indices]

In [9]:
# Do the same processing on the test set
x_test = []
y_test = []

for i, line in enumerate(test_text):
    # Feature 1: Word frequency
    vector_pos: np.ndarray[float] = get_vector_text(vocabulary, line)
    # Feature 2: Score from SentimentIntensityAnalyzer by nltk
    # This takes around 4 min on my PC (i5-12400)
    score: np.ndarray[float] = np.array(get_sentiment(line))
    x_test.append(np.concatenate((score, vector_pos)))
    y_test.append(float(test_label[i]))

x_test: np.ndarray[np.ndarray[np.float64]] = np.asarray(x_test)
y_test: np.ndarray[np.float64] = np.asarray(y_test)

# Use the same index from train set for x_test
x_test = x_test[:, selected_indices]

### Logistic Regression

### Result - Logistic Regression

### K-NN

### Result - K-NN

### SVM

In [10]:
model = sklearn.svm.SVC(kernel="linear",gamma='auto')
model.fit(x_train, y_train)

In [11]:
# Convert the test set to vector

y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5797785737544774


### Result - SVM

## [Name]'s Part

your markdown here