In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import pickle

In [2]:
# Load the pre-processed tweet splits; both files are semicolon-delimited.
# NOTE(review): these are absolute paths into one user's home directory, so
# the cell only runs on that machine as written.
train_df = pd.read_csv(
    filepath_or_buffer="/Users/karlpotenciano/Exams/sail-sentiment-classifier/artifacts/data/processed_data/tweets/train.csv",
    sep=';',
)
val_df = pd.read_csv(
    filepath_or_buffer="/Users/karlpotenciano/Exams/sail-sentiment-classifier/artifacts/data/processed_data/tweets/val.csv",
    sep=';',
)

In [3]:
# Build the TF-IDF feature extractor and fit it on the training text only.
vectorizer = TfidfVectorizer(
    min_df=5,           # integer => minimum number of documents a term must appear in
    max_df=0.8,         # float => drop terms present in more than 80% of documents
    sublinear_tf=True,  # use 1 + log(tf) instead of the raw term frequency
    use_idf=True,       # apply inverse-document-frequency re-weighting
)
# Learns the vocabulary and IDF weights, then returns the training matrix.
train_vectors = vectorizer.fit_transform(raw_documents=train_df['text'])

In [4]:
# Project the validation text with the already-fitted vectorizer
# (transform only, so nothing is re-learned from the validation split).
val_vectors = vectorizer.transform(raw_documents=val_df['text'])

In [5]:
# Linear-kernel support vector classifier trained on the TF-IDF features,
# with the 'expected_sentiment' column as the target labels.
model = SVC(kernel='linear')
model.fit(X=train_vectors, y=train_df['expected_sentiment'])

In [6]:
# Predicted sentiment label for every validation row.
val_predictions = model.predict(X=val_vectors)

In [7]:
# Persist the fitted classifier to disk with the highest pickle protocol
# available. The vectorizer is stored in a separate file, so both artifacts
# are needed to score raw text later.
with open(
    "/Users/karlpotenciano/Exams/sail-sentiment-classifier/artifacts/models/svc_tweets.pickle",
    mode="wb",
) as model_file:
    pickle.dump(model, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# Persist the fitted TF-IDF vectorizer next to the other tokenizers/encoders,
# using the highest pickle protocol available.
with open(
    "/Users/karlpotenciano/Exams/sail-sentiment-classifier/artifacts/tokenizers_and_encoders/svc_vectorizer_tweets.pickle",
    mode="wb",
) as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# Per-class precision / recall / F1 on the validation split.
print(
    classification_report(
        y_true=val_df['expected_sentiment'],
        y_pred=val_predictions,
    )
)

              precision    recall  f1-score   support

    negative       0.92      0.87      0.90      8878
     neutral       0.96      0.98      0.97     13803
    positive       0.94      0.96      0.95     18063

    accuracy                           0.95     40744
   macro avg       0.94      0.94      0.94     40744
weighted avg       0.95      0.95      0.95     40744



In [3]:
# Load the held-out sentiment test cases.
# NOTE(review): unlike the train/val reads, no `sep` is passed here, so pandas
# uses its default comma delimiter. Confirm this file really is comma-separated.
test_dataset = pd.read_csv(
    filepath_or_buffer="/Users/karlpotenciano/Exams/sail-sentiment-classifier/artifacts/data/test_dataset/sentiment_test_cases.csv",
)

In [4]:
# Vectorize the test text with the vectorizer fitted on the training split.
# NOTE(review): the execution counter restarts at In [3] for these test cells,
# so this relies on `vectorizer` already being present in the kernel. Verify
# the notebook still works under Restart Kernel -> Run All.
test_vectorized = vectorizer.transform(raw_documents=test_dataset['text'])

In [5]:
# Predicted sentiment label for every test case.
test_prediction = model.predict(X=test_vectorized)

In [7]:
# Per-class precision / recall / F1 on the held-out test cases.
# NOTE(review): in the recorded outputs, accuracy here is 0.61 versus 0.95 on
# the validation split. The gap may indicate a distribution shift between the
# tweet data and these test cases, and is worth investigating.
print(
    classification_report(
        y_true=test_dataset['expected_sentiment'],
        y_pred=test_prediction,
    )
)

              precision    recall  f1-score   support

    negative       0.81      0.42      0.55       177
     neutral       0.55      0.68      0.61       139
    positive       0.58      0.76      0.66       182

    accuracy                           0.61       498
   macro avg       0.65      0.62      0.61       498
weighted avg       0.66      0.61      0.61       498

