In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

import pandas as pd
import os

In [9]:
# file names
fn = "lab-manual-mm-split"
seeds = [5768, 78516, 944601]

# file directories
train_dir   = "../data/train"
test_dir    = "../data/test"
output_dir  = "../results"

In [69]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import string

# Preprocessing
def preprocess(sentence):
    # lowercase
    sentence = sentence.lower()
    
    words = sentence.split(" ")
    
    # remove punctuations
    words = [word for word in words if word not in string.punctuation]
    
    # remove stopwords
    stop_words = stopwords.words("english")
    words = [word for word in words if word not in stop_words]
    
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    
    return " ".join(words)
    

In [77]:
# load data
train   = pd.read_excel(f"{train_dir}/{fn}-train-{seeds[0]}.xlsx")
test    = pd.read_excel(f"{test_dir}/{fn}-test-{seeds[0]}.xlsx")

# preprocess the textual data
train["sentence"] = train["sentence"].apply(preprocess)

# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(train["sentence"], train["label"], test_size=0.2)

# vectorization
tfidfvectorizer = TfidfVectorizer(analyzer='word')
X_train = tfidfvectorizer.fit_transform(X_train)
X_test  = tfidfvectorizer.transform(X_test)

# classification
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [74]:
X_train[1]

'thought future development equally likely warrant action either direction, think committee take step probably would cause expectation easing become embedded market interest rates.'

In [78]:
# evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.585635359116022
