week10 objectives:

1. extend emotion set to include arousal, valence and dominance
2. extract bag of words vectors for comment and target in the dataset for stance detection
3. extract bert embeddings and concatenate comment and target embeddings to form a feature set

In [1]:
import csv


f = open("data/train.csv", newline="")
reader = csv.reader(f, delimiter=',')
    
header = next(reader)

print(header[1 : ])

keys = {"post": 1, "ori_topic": 2, "new_topic": 4, "label": 5, "topic": 13, "contains_topic": 15}


['post', 'ori_topic', 'ori_id', 'new_topic', 'label', 'type_idx', 'new_id', 'arc_id', 'text', 'pos_text', 'text_s', 'topic', 'topic_str', 'seen?', 'contains_topic?']


In [2]:
rows = []

for row in reader:
        i = {}  # instance
        for k in keys.keys():
            i[k] = row[keys[k]]  # reverse mapping
        rows.append(i)

print(f"number of samples: {len(rows)}")

number of samples: 13477


In [3]:
rows[6]

{'post': "Absolutely it's needs to be defined and regulated in its use, as currently the word 'natural' when used on food products is totally confusing and meaningless. Clearly they are trying to imply the item is 'healthy' or possibly 'organic', but when you see food 'manufacturers' like Frito-Lay or Campbell's with products labelled 'natural', that alone should set off alarms that all is not what it seems. ;-)",
 'ori_topic': 'food labels',
 'new_topic': 'healthy',
 'label': '1',
 'topic': 'healthy',
 'contains_topic': '1'}

feature extraction

In [4]:
import re

# bag of words
sentences = ["Joe waited for the train", 
             "The train was late", 
             "Mary and Samantha took the bus", 
             "I looked for Mary and Samantha at the bus station",
             "Mary and Samantha arrived at the bus station early but waited until noon for the bus"]


In [5]:
def word_extraction(sentence):    
    ignore = ['a', "the", "is"]    
    words = re.sub("[^\w]", " ",  sentence).split()    
    
    cleaned_text = [w.lower() for w in words if w not in ignore]    
    
    return cleaned_text

text = word_extraction(sentences[-1])

print(text)

['mary', 'and', 'samantha', 'arrived', 'at', 'bus', 'station', 'early', 'but', 'waited', 'until', 'noon', 'for', 'bus']


In [6]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords 

set(stopwords.words('english'))


def tokenize(sentences):    
    words = []    
    
    for sentence in sentences:        
        w = word_extraction(sentence)        
        words.extend(w)            
        words = sorted(list(set(words)))    
        
    return words

words = tokenize(sentences)

print(words)

['and', 'arrived', 'at', 'bus', 'but', 'early', 'for', 'i', 'joe', 'late', 'looked', 'mary', 'noon', 'samantha', 'station', 'the', 'took', 'train', 'until', 'waited', 'was']


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kxs1207/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# %pip install -q scikit-learn

In [9]:
from sklearn.feature_extraction.text import CountVectorizer


vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences)

print(X.toarray().shape)


(5, 20)


In [7]:
import numpy as np

# feature extraction
data = np.array([[i["post"], i["topic"], i["contains_topic"], i["label"]] for i in rows])

print(data.shape)

comments = list(data[ : , 0])
topics = list(data[ : , 1])

(13477, 4)


In [10]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
X1 = vectorizer.fit_transform(comments)
X2 = vectorizer.fit_transform(topics)

print(X1.shape, X2.shape)

X = np.concatenate((X1.toarray(), X2.toarray()), axis=1)

(13477, 10000) (13477, 2786)


In [19]:
# %pip install -q tensorflow-text

[31mERROR: Could not install packages due to an OSError: [Errno 16] Device or resource busy: '.panfs.d4dfa8c0.1693230459922738047'
[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [24]:
# %pip install -q bertopic

Note: you may need to restart the kernel to use updated packages.


In [11]:
from transformers.utils import logging
# disable tqdm
logging.disable_progress_bar()

In [17]:
from sentence_transformers import SentenceTransformer


# paraphrase-multilingual-mpnet-base-v2

sentenc_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

main: extract features with bag of words and bert separately, concatenate comment and target features to form a feature set

week11 objectives:

1. train a baseline on bag of words
2. compare performance when trained on bert embeddings

In [12]:
from sklearn.model_selection import train_test_split


x_train, x_test, y_train, y_test = train_test_split(X, list(data[ : , -1]), test_size=0.33, random_state=42)

In [13]:
# traditional
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, verbose=0).fit(x_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
from sklearn.metrics import accuracy_score

clf_predictions = clf.predict(x_test)

print(accuracy_score(y_test, clf_predictions) * 100)

56.969424460431654


In [22]:
from sklearn.decomposition import PCA
# pipeline
from sklearn.pipeline import make_pipeline


pipe = make_pipeline(PCA(n_components=2000), LogisticRegression(random_state=0, verbose=0))

pipe.fit(x_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
pipe.score(x_test, y_test)

0.5714928057553957

In [25]:
import pickle

with open('lr.p', 'wb') as f:
    pickle.dump((vectorizer, clf), f)