## Final Project

In [None]:
!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.9-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.5/250.5 KB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt 
import seaborn as sn 
from sklearn.metrics import classification_report
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import gensim
import nltk
import numpy as np
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim import corpora, models


Dataset Loading

In [None]:
# Load the three review corpora. Rows containing any missing value are dropped
# and the index is renumbered; each frame then gets a constant "topic" column
# and a "text" column copied from its source-specific review column.
movie_dataset = (
    pd.read_csv("Movies.csv")
    .dropna()
    .reset_index(drop=True)
    .assign(topic="movie", text=lambda frame: frame["review_content"])
)
book_dataset = (
    pd.read_csv("Books.csv")
    .dropna()
    .reset_index(drop=True)
    .assign(topic="book", text=lambda frame: frame["review/text"])
)
restaurant_dataset = (
    pd.read_table("Restaurants.tsv")
    .dropna()
    .reset_index(drop=True)
    .assign(topic="restaurant", text=lambda frame: frame["Review"])
)

# Training pool: the first 1000 rows of every corpus, reduced to text + topic.
combined_datasets = pd.concat(
    [
        movie_dataset[["text", "topic"]].iloc[:1000],
        book_dataset[["text", "topic"]].iloc[:1000],
        restaurant_dataset[["text", "topic"]].iloc[:1000],
    ]
).reset_index(drop=True)

# Held-out test files for the NER task and the sentiment/topic tasks.
test_NER_file = pd.read_table("NER-final-test.tsv")
test_sentiment_file = pd.read_table("sentiment-topic-final-test.tsv")

NER Tagging

In [None]:
from nltk.corpus.reader import ConllCorpusReader

### Adapt the path to point to the CONLL2003 folder on your local machine
train = ConllCorpusReader('CONLL2003', 'train.txt', ['words', 'pos', 'ignore', 'chunk'])

# Flatten the training corpus into two parallel per-token lists: a small
# feature dict (surface form + POS tag) and the matching named-entity label.
training_features, training_gold_labels = [], []
for word, pos_tag, ne_tag in train.iob_words():
    training_features.append({"words": word, "pos": pos_tag})
    training_gold_labels.append(ne_tag)

In [None]:
# Download the NLTK 'averaged_perceptron_tagger' resource.
# NOTE(review): presumably required by the nltk.pos_tag call in the next cell — confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# POS-tag all test tokens in one call. The result is wrapped in a one-element
# list, so despite its name `pos_tags_per_sentence` holds a single sequence
# covering the whole file.
tokens = test_NER_file['token'].tolist()
tagged = nltk.pos_tag(tokens)
pos_tags_per_sentence = [tagged]

In [None]:
# Keep only the tag of every (token, tag) pair and store it as a new column.
tags = [tag for tagged_tokens in pos_tags_per_sentence for _token, tag in tagged_tokens]
test_NER_file['pos'] = tags

In [None]:
# Gold labels and (token, pos, label) triples for the NER test set, plus one
# small feature dict (surface form + POS tag) per token.
test_gold_labels = test_NER_file['BIO NER tag']
test_set_final = list(zip(test_NER_file['token'], test_NER_file['pos'], test_NER_file['BIO NER tag']))
test_features = [{"words": triple[0], "pos": triple[1]} for triple in test_set_final]

In [None]:
# Helper class that groups the token-level table into sentences
class SentenceGetter(object):
    """Group a token-per-row DataFrame into sentences.

    The frame must provide the columns "sentence id", "token", "pos" and
    "BIO NER tag". Rows sharing a "sentence id" form one sentence, represented
    as a list of ``(token, pos, tag)`` triples.

    Attributes:
        data: the DataFrame passed to the constructor.
        grouped: Series indexed by sentence id, holding one list of triples each.
        sentences: plain list of all sentences, in the order of ``grouped``.
        n_sent: sentence id that the next ``get_next`` call will look up (starts at 1).
        empty: flag initialised to False; not modified by this class.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Zip the three columns of one sentence group into (token, pos, tag) triples.
            return [(w, p, t) for w, p, t in zip(s["token"].values.tolist(),
                                                 s["pos"].values.tolist(),
                                                 s["BIO NER tag"].values.tolist())]

        self.grouped = self.data.groupby("sentence id").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence whose id equals ``n_sent`` and advance the counter.

        :return: list of ``(token, pos, tag)`` triples, or ``None`` when no
            sentence is stored under the current id.
        """
        try:
            s = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            # Only a failed lookup means "no more sentences"; anything else
            # (including KeyboardInterrupt) is allowed to propagate instead of
            # being silently swallowed.
            return None
        self.n_sent += 1
        return s

In [None]:
# Group the NER test rows into sentences of (token, pos, tag) triples.
getter = SentenceGetter(test_NER_file)
# NOTE(review): `sent` is not referenced again in the visible cells.
sent = getter.get_next()
test_sentences = getter.sentences

In [None]:
# Preview the training sentences: each one is a list of (word, pos, iob-label) triples.
train.iob_sents()

[[], [('EU', 'NNP', 'B-ORG'), ('rejects', 'VBZ', 'O'), ('German', 'JJ', 'B-MISC'), ('call', 'NN', 'O'), ('to', 'TO', 'O'), ('boycott', 'VB', 'O'), ('British', 'JJ', 'B-MISC'), ('lamb', 'NN', 'O'), ('.', '.', 'O')], ...]

Feature engineering

In [None]:
# Input: a sentence in the structure shown above (a list of (word, pos, label)
# triples) and the index i of the word whose features should be returned.

def word2features(sent, i):
    """Build the CRF feature dict for the i-th token of a sentence.

    :param sent: sequence of token tuples whose first two items are the word
        and its POS tag
    :param int i: index of the token to describe

    :return: dict mapping feature names to values; an empty list when the
        sentence itself is empty
    """
    if len(sent) == 0:
        return []

    def neighbour_features(prefix, neighbour):
        # Features describing an adjacent token; prefix is '-1' or '+1'.
        token, tag = neighbour[0], neighbour[1]
        return {
            prefix + ':word.lower()': token.lower(),
            prefix + ':word.istitle()': token.istitle(),
            prefix + ':word.isupper()': token.isupper(),
            prefix + ':postag': tag,
            prefix + ':postag[:2]': tag[:2],
        }

    word, postag = sent[i][0], sent[i][1]

    # Features of the token itself.
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),      # lower-cased token
        'word[-3:]': word[-3:],            # last three characters
        'word[-2:]': word[-2:],            # last two characters
        'word.isupper()': word.isupper(),  # token is entirely upper case
        'word.istitle()': word.istitle(),  # token is title-cased (initial capital)
        'word.isdigit()': word.isdigit(),  # token consists of digits only
        'postag': postag,
        'postag[:2]': postag[:2],          # first two characters of the POS tag
    }

    # Left context, or a beginning-of-sentence marker.
    if i > 0:
        features.update(neighbour_features('-1', sent[i - 1]))
    else:
        features['BOS'] = True

    # Right context, or an end-of-sentence marker.
    if i < len(sent) - 1:
        features.update(neighbour_features('+1', sent[i + 1]))
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return one feature dict per token of the sentence, in token order."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the label (third item) of every (token, postag, label) triple."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first item) of every (token, postag, label) triple."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [None]:
!pip install git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git
  Cloning https://github.com/MeMartijn/updated-sklearn-crfsuite.git to /tmp/pip-req-build-nmhawrt4
  Running command git clone --filter=blob:none --quiet https://github.com/MeMartijn/updated-sklearn-crfsuite.git /tmp/pip-req-build-nmhawrt4
  Resolved https://github.com/MeMartijn/updated-sklearn-crfsuite.git to commit 675038761b4405f04691a83339d04903790e2b95
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sklearn-crfsuite
  Building wheel for sklearn-crfsuite (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn

In [None]:
import sklearn_crfsuite
from sklearn_crfsuite import CRF
from sklearn.metrics import classification_report

# CRF sequence tagger configured with the 'l2sgd' training algorithm, at most
# 100 iterations, and all_possible_transitions switched off.
crf = CRF(algorithm='l2sgd',
          max_iterations=100,
          all_possible_transitions=False)


In [None]:
# Per-sentence feature sequences (inputs) and label sequences (outputs) for
# the CoNLL training corpus and for the grouped test sentences.
train_inputs = list(map(sent2features, train.iob_sents()))
train_outputs = list(map(sent2labels, train.iob_sents()))
test_inputs = list(map(sent2features, test_sentences))
test_outputs = list(map(sent2labels, test_sentences))

In [None]:
# Train the CRF on the per-sentence feature and label sequences.
crf.fit(X=train_inputs, y=train_outputs)

In [None]:
from sklearn_crfsuite.metrics import flat_classification_report
# Predict a label sequence for every test sentence and print the flattened
# per-label classification report against the gold labels.
pred = crf.predict(test_inputs)
report = flat_classification_report(y_pred=pred, y_true=test_outputs)
print(report)

              precision    recall  f1-score   support

       B-LOC       1.00      1.00      1.00         4
      B-MISC       1.00      1.00      1.00         3
       B-ORG       0.75      0.75      0.75         4
       B-PER       0.67      0.67      0.67         6
       I-LOC       1.00      1.00      1.00         2
      I-MISC       1.00      1.00      1.00         1
       I-ORG       0.60      1.00      0.75         3
       I-PER       1.00      0.62      0.77         8
           O       0.99      1.00      1.00       183

    accuracy                           0.97       214
   macro avg       0.89      0.89      0.88       214
weighted avg       0.98      0.97      0.97       214



Sentiment Analysis

In [None]:
import sklearn
from sklearn import metrics
from sklearn.metrics import classification_report
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the NLTK 'vader_lexicon' resource and create the VADER analyzer.
nltk.download('vader_lexicon')
vader_model = SentimentIntensityAnalyzer()
# Load the small English spaCy pipeline.
# NOTE(review): `nlp` is not referenced in the visible cells.
nlp = spacy.load('en_core_web_sm') # 'en_core_web_sm'

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
def vader_output_to_label(vader_output):
    """
    Translate a VADER score dict, e.g.
    {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4215},
    into a coarse sentiment label based on its 'compound' value:
    a) above zero  -> 'positive'
    b) exactly 0.0 -> 'neutral'
    c) below zero  -> 'negative'

    :param dict vader_output: output dict from vader

    :rtype: str
    :return: 'negative' | 'neutral' | 'positive'
    """
    score = vader_output['compound']

    if score > 0.0:
        return 'positive'
    if score < 0:
        return 'negative'
    if score == 0.0:
        return 'neutral'
    # No comparison held (e.g. NaN): fall through and return None implicitly.

assert vader_output_to_label({'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.0}) == 'neutral'
assert vader_output_to_label({'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.01}) == 'positive'
assert vader_output_to_label({'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': -0.01}) == 'negative'

In [None]:
# settings (to change for different experiments)
to_lemmatize = True
pos = set()

# Review texts and gold sentiment labels, in file order.
reviews = test_sentiment_file['text'].tolist()
gold = test_sentiment_file['sentiment'].tolist()

# Run VADER on every review and convert its score dict to a category label.
all_vader_output = [
    vader_output_to_label(vader_model.polarity_scores(review_text))
    for review_text in reviews
]

report = classification_report(gold, all_vader_output)
print(report)

              precision    recall  f1-score   support

    negative       1.00      0.33      0.50         3
     neutral       1.00      0.33      0.50         3
    positive       0.50      1.00      0.67         4

    accuracy                           0.60        10
   macro avg       0.83      0.56      0.56        10
weighted avg       0.80      0.60      0.57        10



In [None]:
# Error analysis: print every review next to its VADER prediction and its gold
# label. No filtering is applied, so correctly classified reviews are listed too.
for (review, vader_output, manual_label) in zip(reviews, all_vader_output, gold):
    print("Tweet: " + review)
    print("VADER: " + vader_output)
    print("My label: " + manual_label)
    print()

Tweet: It took eight years for Warner Brothers to recover from the disaster that was this movie.
VADER: negative
My label: negative

Tweet: All the New York University students love this diner in Soho so it makes for a fun young atmosphere.
VADER: positive
My label: positive

Tweet: This Italian place is really trendy but they have forgotten about the most important part of a restaurant, the food.
VADER: positive
My label: negative

Tweet: In conclusion, my review of this book would be: I like Jane Austen and understand why she is famous.
VADER: positive
My label: positive

Tweet: The story of this movie is focused on Carl Brashear played by Cuba Gooding Jr. who wants to be the first African American deep sea diver in the navy.
VADER: positive
My label: neutral

Tweet: Chris O'Donnell stated that while filming for this movie, he felt like he was in a toy commercial.
VADER: positive
My label: neutral

Tweet: My husband and I moved to Amsterdam 6 years ago and for as long as we have live

Topic Classification

RoBERTa

In [None]:
# Fine-tuning configuration: 5 training epochs at a learning rate of 1e-4.
model_args = ClassificationArgs()
model_args.num_train_epochs = 5
model_args.learning_rate = 1e-4

# 'roberta-base' classifier with three output labels.
# NOTE(review): use_cuda=True presumably requires a CUDA-capable runtime — confirm before running on a CPU-only machine.
model = ClassificationModel('roberta', 'roberta-base', num_labels=3, args=model_args, use_cuda=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Encode each topic name as an integer class id. An explicit mapping through
# `map` produces the integer column directly, rather than depending on
# `replace` to downcast an object column after substituting the values.
combined_datasets['labels'] = combined_datasets['topic'].map({'book': 0, 'movie': 1, 'restaurant': 2})

In [None]:
# Hold out 10% of the combined reviews as a development set; the split is
# stratified on the label column and seeded (random_state=0).
train_1, dev_1 = train_test_split(combined_datasets, test_size=0.1, random_state=0, 
                               stratify=combined_datasets['labels'])

In [None]:
# Fine-tune the classifier on the training split, passing the dev split as
# eval_df; only the second return value is kept, as `history`.
_, history = model.train_model(train_1, eval_df=dev_1) 

  0%|          | 0/2700 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/338 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/338 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/338 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/338 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/338 [00:00<?, ?it/s]

In [None]:
# Encode the gold topics of the test file with the same ids used for training
# (book=0, movie=1, restaurant=2). `map` yields the integer column directly
# instead of depending on `replace` to downcast an object column; a topic
# outside the mapping becomes NaN rather than staying a raw string.
test_sentiment_file['labels'] = test_sentiment_file['topic'].map({'book': 0, 'movie': 1, 'restaurant': 2})

In [None]:
# Predict a topic id for every test sentence, store it next to the gold label,
# and print the per-class report.
predicted, probabilities = model.predict(test_sentiment_file['text'].to_list())
test_sentiment_file['predicted'] = predicted
print(classification_report(test_sentiment_file['labels'], test_sentiment_file['predicted']))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         3

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



SVM model

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn import svm
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random generator.
np.random.seed(2018)
import nltk
# Download the 'wordnet' and 'omw-1.4' NLTK resources.
# NOTE(review): presumably needed by WordNetLemmatizer — confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')
# Shared lemmatizer instance used by the preprocessing helpers.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
def lemmatize_stemming(text):
    """Return the lemma of a single token.

    Despite the name, no stemming is applied: the token is only passed through
    the module-level ``lemmatizer``.
    """
    lemma = lemmatizer.lemmatize(text)
    return lemma
def preprocess(text):
    """Tokenise a text and return its lemmatised content tokens.

    Tokens come from gensim's ``simple_preprocess``; gensim stopwords and
    tokens of three characters or fewer are discarded, and every remaining
    token is passed through ``lemmatize_stemming``.

    :param str text: raw text
    :return: list of lemmatised tokens, in order of appearance
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
    ]

In [None]:
# Store the preprocessed token list of every text in a 'processed-text' column,
# for both the test file and the combined training data.
test_sentiment_file['processed-text'] = test_sentiment_file['text'].map(preprocess)
combined_datasets['processed-text'] = combined_datasets['text'].map(preprocess)

In [None]:
# Training targets are the topic names; every example becomes a one-key
# feature dict holding its preprocessed token list.
train_labels = combined_datasets['topic']
train_features = [{'text': tokens} for tokens in combined_datasets['processed-text']]

In [None]:
# Test targets and feature dicts, built the same way as for training.
# This rebinds `test_features`, which the NER section used for per-token dicts.
test_labels = test_sentiment_file['topic']
test_features = [{'text': tokens} for tokens in test_sentiment_file['processed-text']]

In [None]:
# Learn the feature vocabulary from the training data only, then project the
# test data onto it, so nothing from the test set influences the fitted
# vectorizer.
vec = DictVectorizer()
train_array = vec.fit_transform(train_features)
test_array = vec.transform(test_features)

In [None]:
# Linear support-vector classifier with its default settings.
lin_clf = svm.LinearSVC()

In [None]:
# Train the SVM on the vectorised training texts and their topic names.
lin_clf.fit(train_array, train_labels)



In [None]:
# Predict a topic for every test sentence; this rebinds `pred`, which the NER section also used.
pred = lin_clf.predict(test_array)

In [None]:
# Per-class precision/recall/F1 of the SVM predictions, shown with three decimals.
report = classification_report(test_labels,pred,digits = 3)
print(report)

              precision    recall  f1-score   support

        book      1.000     0.500     0.667         2
       movie      1.000     1.000     1.000         5
  restaurant      0.750     1.000     0.857         3

    accuracy                          0.900        10
   macro avg      0.917     0.833     0.841        10
weighted avg      0.925     0.900     0.890        10

