In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

In [2]:
df = pd.read_csv('data/original/total.csv', sep=",", error_bad_lines=False, encoding= "unicode_escape")
test_data = pd.read_csv('data/test/sentiment-topic-final-test.tsv', sep='\t', encoding='utf-8')



  df = pd.read_csv('data/original/total.csv', sep=",", error_bad_lines=False, encoding= "unicode_escape")


## Conventional Machine Learning Approach: SVM

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
# vectorizer taken from previous labs setup
count_vectorizer = CountVectorizer(min_df=2, # If a token appears fewer times than this, across all documents, it will be ignored
                             tokenizer=nltk.word_tokenize, # we use the nltk tokenizer
                             stop_words=stopwords.words('english')) # stopwords are removed



counts = count_vectorizer.fit_transform(df['text'])

# TF-IDF
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(counts)

# split the data into train and dev
docs_train, docs_dev, y_train, y_dev = train_test_split(
    tfidf, # the tf-idf model
    df['topic'], # the labels
    test_size=0.1, # the size of the dev set
    random_state=0, # random seed
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/irem.demir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/irem.demir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# SVM classifier from sklearn

# train the model
svm = LinearSVC()
svm.fit(docs_train, y_train)

# evaluate the model
y_pred = svm.predict(docs_dev)

print("Dev Accuracy:", accuracy_score(y_dev, y_pred))
print('Dev report:\n')

print(classification_report(y_dev, y_pred))

Dev Accuracy: 0.9677777777777777
Dev report:

              precision    recall  f1-score   support

        book       0.97      0.97      0.97       286
       movie       0.95      0.97      0.96       307
  restaurant       0.99      0.97      0.98       307

    accuracy                           0.97       900
   macro avg       0.97      0.97      0.97       900
weighted avg       0.97      0.97      0.97       900



In [5]:
# evaluate the model on the test set
docs_test = count_vectorizer.transform(test_data['text'])
docs_test = tfidf_transformer.transform(docs_test)

y_pred = svm.predict(docs_test)

print("Test Accuracy:", accuracy_score(test_data['topic'], y_pred))

print('Test report:\n')
print(classification_report(test_data['topic'], y_pred))

Test Accuracy: 0.9
Test report:

              precision    recall  f1-score   support

        book       1.00      1.00      1.00         2
       movie       0.83      1.00      0.91         5
  restaurant       1.00      0.67      0.80         3

    accuracy                           0.90        10
   macro avg       0.94      0.89      0.90        10
weighted avg       0.92      0.90      0.89        10



# TRANSFORMER: ROBERTA

In [5]:

train, dev = train_test_split(df, test_size=0.1, random_state=1, 
                               stratify=df[['topic']])


In [11]:
# define transformer based topic classifier for 3 topics
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs()

model_args.overwrite_output_dir=True # overwrite existing saved models in the same directory
model_args.evaluate_during_training=True # to perform evaluation while training the model
# (eval data should be passed to the training method)

model_args.num_train_epochs=10 # number of epochs
model_args.train_batch_size=32 # batch size
model_args.learning_rate=4e-6 # learning rate
model_args.max_seq_length=256 # maximum sequence length
# Note! Increasing max_seq_len may provide better performance, but training time will increase. 
# For educational purposes, we set max_seq_len to 256.

# Early stopping to combat overfitting: https://simpletransformers.ai/docs/tips-and-tricks/#using-early-stopping
model_args.use_early_stopping=True
model_args.early_stopping_delta=0.01 # "The improvement over best_eval_loss necessary to count as a better checkpoint"
model_args.early_stopping_metric='eval_loss'
model_args.early_stopping_metric_minimize=True
model_args.early_stopping_patience=2
model_args.evaluate_during_training_steps=32 # how often you want to run validation in terms of training steps (or batches)


In [13]:
# Create a ClassificationModel
model = ClassificationModel('bert', 'bert-base-uncased', num_labels=3, args=model_args, use_cuda=False)

# Fine-tune the model
model.train_model(train)

# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(dev)

# print the results
print(result)

# classification report
print(classification_report(dev['topic'], model_outputs.argmax(axis=1)))







Downloading pytorch_model.bin: 100%|██████████| 440M/440M [01:19<00:00, 5.57MB/s]


: 

: 