In [38]:
# This code is supposed to represent how I chose between
# Linear Regression and Ridge regression
# And between different preprocessings ngram and embeddings

In [39]:
import pandas as pd
eb = pd.read_csv("emobank.csv", index_col=0 ,engine='python')

In [40]:
eb.reset_index(drop=True, inplace=True)
eb = eb.drop(labels='split', axis=1)

In [41]:
eb_list = eb['text'].values.tolist()

In [42]:
# Extract VAD values
y_V = eb['V']
y_A = eb['A']
y_D = eb['D']

In [43]:
# Based on https://towardsdatascience.com/another-twitter-sentiment-analysis-bb5b01ebad90
# Data cleaning

from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
import re

tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))
def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'html.parser')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()
testing = eb_list
test_result = []
for t in testing:
    test_result.append(tweet_cleaner(t))




In [44]:
eb_list_V = test_result
eb_list_A = test_result
eb_list_D = test_result

In [45]:
# Feature 
# Preprocessing function with Tf-idf

# Based on code from:
# https://developers.google.com/machine-learning/guides/text-classification/step-3

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as n-gram vectors.

    1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val: vectorized training and validation texts
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts. This way no leaking of information is done.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val

In [46]:
# Feature extraction

# This function is taken from 
# https://developers.google.com/machine-learning/guides/text-classification/step-3

from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

# Vectorization parameters
# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Limit on the length of text sequences. Sequences longer than this
# will be truncated.
MAX_SEQUENCE_LENGTH = 500

def sequence_vectorize(train_texts, val_texts):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary.
    """
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index

In [47]:
# Split data

from sklearn.model_selection import train_test_split

X_Vtrain, X_Vtest, y_Vtrain, y_Vtest = train_test_split(eb_list_V, y_V, test_size=0.2)
X_Atrain, X_Atest, y_Atrain, y_Atest = train_test_split(eb_list_A, y_A, test_size=0.2)
X_Dtrain, X_Dtest, y_Dtrain, y_Dtest = train_test_split(eb_list_D, y_D, test_size=0.2)

In [48]:
# Preprocess the data with ngram
X_Vtrain, X_Vtest = ngram_vectorize(X_Vtrain, y_Vtrain, X_Vtest)
X_Atrain, X_Atest = ngram_vectorize(X_Atrain, y_Atrain, X_Atest)
X_Dtrain, X_Dtest = ngram_vectorize(X_Dtrain, y_Dtrain, X_Dtest)

In [49]:
# Split into dev set from training set so we get the same distribution
X_Vtrain, X_Vdev, y_Vtrain, y_Vdev = train_test_split(X_Vtrain, y_Vtrain, test_size=0.18)
X_Atrain, X_Adev, y_Atrain, y_Adev = train_test_split(X_Atrain, y_Atrain, test_size=0.18)
X_Dtrain, X_Ddev, y_Dtrain, y_Ddev = train_test_split(X_Dtrain, y_Dtrain, test_size=0.18)

In [50]:
# Preprocess data with embeddings
#X_Vtrain, X_Vtest, word_index_V = sequence_vectorize(X_Vtrain, X_Vtest)
#X_Atrain, X_Atest, word_index_A = sequence_vectorize(X_Atrain, X_Atest)
#X_Dtrain, X_Dtest, word_index_D = sequence_vectorize(X_Dtrain, X_Dtest)

In [51]:
# Split into dev set from training set so we get the same distribution
#X_Vtrain, X_Vdev, y_Vtrain, y_Vdev = train_test_split(X_Vtrain, y_Vtrain, test_size=0.18)
#X_Atrain, X_Adev, y_Atrain, y_Adev = train_test_split(X_Atrain, y_Atrain, test_size=0.18)
#X_Dtrain, X_Ddev, y_Dtrain, y_Ddev = train_test_split(X_Dtrain, y_Dtrain, test_size=0.18)

In [52]:
# Model training
from sklearn.linear_model import Ridge# Classifier: 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

clf1 = LinearRegression()
clf2 = LinearRegression()
clf3 = LinearRegression()
clf4 = Ridge()
clf5 = Ridge()
clf6 = Ridge()

# train model on data
clf1 = clf1.fit(X_Vtrain, y_Vtrain) 
clf2 = clf2.fit(X_Atrain, y_Atrain)
clf3 = clf3.fit(X_Dtrain, y_Dtrain) 
clf4 = clf4.fit(X_Vtrain, y_Vtrain) 
clf5 = clf5.fit(X_Atrain, y_Atrain)
clf6 = clf6.fit(X_Dtrain, y_Dtrain) 

print("TEST DATASET")
rslt_V_L = clf1.predict(X_Vtest)
print('V - LR')
print(rslt_V_L)
rslt_A_L = clf2.predict(X_Atest)
print('A - LR')
print(rslt_A_L)
rslt_D_L = clf3.predict(X_Dtest)
print('D - LR')
print(rslt_D_L)
rslt_V_R = clf4.predict(X_Vtest)
print('V - Ridge')
print(rslt_V_R)
rslt_A_R = clf5.predict(X_Atest)
print('A - Ridge')
print(rslt_A_R)
rslt_D_R = clf6.predict(X_Dtest)
print('D - Ridge')
print(rslt_D_R)

print("DEV DATASET")
rslt_dev_V_L = clf1.predict(X_Vdev)
print('V - LR')
print(rslt_dev_V_L)
rslt_dev_A_L = clf2.predict(X_Adev)
print('A - LR')
print(rslt_dev_A_L)
rslt_dev_D_L = clf3.predict(X_Ddev)
print('D - LR')
print(rslt_dev_D_L)
rslt_dev_V_R = clf4.predict(X_Vdev)
print('V - Ridge')
print(rslt_dev_V_R)
rslt_dev_A_R = clf5.predict(X_Adev)
print('A - Ridge')
print(rslt_dev_A_R)
rslt_dev_D_R = clf6.predict(X_Ddev)
print('D - Ridge')
print(rslt_dev_D_R)

print("TRAIN DATASET")
rslt_train_V_L = clf1.predict(X_Vtrain)
print('V - LR')
print(rslt_train_V_L)
rslt_train_A_L = clf2.predict(X_Atrain)
print('A - LR')
print(rslt_train_A_L)
rslt_train_D_L = clf3.predict(X_Dtrain)
print('D - LR')
print(rslt_train_D_L)
rslt_train_V_R = clf4.predict(X_Vtrain)
print('V - Ridge')
print(rslt_train_V_R)
rslt_train_A_R = clf5.predict(X_Atrain)
print('A - Ridge')
print(rslt_train_A_R)
rslt_train_D_R = clf6.predict(X_Dtrain)
print('D - Ridge')
print(rslt_train_D_R)

TEST DATASET
V - LR
[ 7.54925014e+03 -2.57138595e+02  3.10092267e+00 ... -1.15761911e+03
  4.32538116e+02  5.64990252e+02]
A - LR
[2.84115232e+00 8.53248467e+02 6.23719497e+03 ... 3.60925459e+02
 5.17700904e+02 3.62462686e+03]
D - LR
[3.09459608 3.70540558 3.05120309 ... 2.97981816 2.71343974 3.69326539]
V - Ridge
[2.8708389 2.8326027 3.0746799 ... 2.9857569 3.0555685 2.7819338]
A - Ridge
[2.9972575 3.075153  3.3334167 ... 2.9729784 3.0296562 3.0360131]
D - Ridge
[3.0842304 3.1119394 3.0998728 ... 2.987238  3.0038261 3.0756059]
DEV DATASET
V - LR
[-3303.26815626  1550.70937979 -2815.38536618 ...  3372.90514243
 -5900.76834704  -149.7770566 ]
A - LR
[ 4350.88992841  -777.62882734  -176.94656057 ...  6884.23581085
 11593.66775549 -4495.20836848]
D - LR
[3.0249512  3.18987934 3.03679531 ... 3.20040145 2.49181865 1.93669366]
V - Ridge
[3.0451865 2.907773  2.669166  ... 3.1469455 2.9960103 3.193222 ]
A - Ridge
[2.9835598 3.1777723 2.9957523 ... 3.0401835 3.0368035 3.1670716]
D - Ridge
[3.01

In [53]:
print("Ridge regression - test dataset")
print("Mean squared error for Valance: %.2f" % mean_squared_error(y_Vtest, rslt_V_R))
#print("Coefficient of determination Valence: %.2f" % r2_score(y_Vtest, rslt_V_R))
print("Mean squared error for Arousal: %.2f" % mean_squared_error(y_Atest, rslt_A_R))
#print("Coefficient of determination Arousal: %.2f" % r2_score(y_Atest, rslt_A_R))
print("Mean squared error for Dominance: %.2f" % mean_squared_error(y_Dtest, rslt_D_R))
#print("Coefficient of determination Dominance: %.2f" % r2_score(y_Dtest, rslt_D_R))

Ridge regression - test dataset
Mean squared error for Valance: 0.09
Mean squared error for Arousal: 0.06
Mean squared error for Dominance: 0.04


In [54]:
print("Linear regression - test dataset")
print("Mean squared error for Valance: %.2f" % mean_squared_error(y_Vtest, rslt_V_L))
#print("Coefficient of determination Valence: %.2f" % r2_score(y_Vtest, rslt_V))
print("Mean squared error for Arousal: %.2f" % mean_squared_error(y_Atest, rslt_A_L))
#print("Coefficient of determination Arousal: %.2f" % r2_score(y_Atest, rslt_A))
print("Mean squared error for Dominance: %.2f" % mean_squared_error(y_Dtest, rslt_D_L))
#print("Coefficient of determination Dominance: %.2f" % r2_score(y_Dtest, rslt_D))

Linear regression - test dataset
Mean squared error for Valance: 4345915.01
Mean squared error for Arousal: 40398304.30
Mean squared error for Dominance: 0.10


In [55]:
print("Ridge regression - train dataset")
print("Mean squared error for Valance: %.2f" % mean_squared_error(y_Vtrain, rslt_train_V_R))
#print("Coefficient of determination Valence: %.2f" % r2_score(y_Vtest, rslt_V))
print("Mean squared error for Arousal: %.2f" % mean_squared_error(y_Atrain, rslt_train_A_R))
#print("Coefficient of determination Arousal: %.2f" % r2_score(y_Atest, rslt_A))
print("Mean squared error for Dominance: %.2f" % mean_squared_error(y_Dtrain, rslt_train_D_R))
#print("Coefficient of determination Dominance: %.2f" % r2_score(y_Dtest, rslt_D))

Ridge regression - train dataset
Mean squared error for Valance: 0.03
Mean squared error for Arousal: 0.02
Mean squared error for Dominance: 0.02


In [56]:
print("Linear regression - train dataset")
print("Mean squared error for Valance: %.2f" % mean_squared_error(y_Vtrain, rslt_train_V_L))
#print("Coefficient of determination Valence: %.2f" % r2_score(y_Vtest, rslt_V))
print("Mean squared error for Arousal: %.2f" % mean_squared_error(y_Atrain, rslt_train_A_L))
#print("Coefficient of determination Arousal: %.2f" % r2_score(y_Atest, rslt_A))
print("Mean squared error for Dominance: %.2f" % mean_squared_error(y_Dtrain, rslt_train_D_L))
#print("Coefficient of determination Dominance: %.2f" % r2_score(y_Dtest, rslt_D))

Linear regression - train dataset
Mean squared error for Valance: 0.00
Mean squared error for Arousal: 0.00
Mean squared error for Dominance: 0.00


In [57]:
print("Ridge regression - dev dataset")
print("Mean squared error for Valance: %.2f" % mean_squared_error(y_Vdev, rslt_dev_V_R))
#print("Coefficient of determination Valence: %.2f" % r2_score(y_Vtest, rslt_V))
print("Mean squared error for Arousal: %.2f" % mean_squared_error(y_Adev, rslt_dev_A_R))
#print("Coefficient of determination Arousal: %.2f" % r2_score(y_Atest, rslt_A))
print("Mean squared error for Dominance: %.2f" % mean_squared_error(y_Ddev, rslt_dev_D_R))
#print("Coefficient of determination Dominance: %.2f" % r2_score(y_Dtest, rslt_D))

Ridge regression - dev dataset
Mean squared error for Valance: 0.09
Mean squared error for Arousal: 0.06
Mean squared error for Dominance: 0.04


In [58]:
print("Linear regression - train dataset")
print("Mean squared error for Valance: %.2f" % mean_squared_error(y_Vdev, rslt_dev_V_L))
#print("Coefficient of determination Valence: %.2f" % r2_score(y_Vtest, rslt_V))
print("Mean squared error for Arousal: %.2f" % mean_squared_error(y_Adev, rslt_dev_A_L))
#print("Coefficient of determination Arousal: %.2f" % r2_score(y_Atest, rslt_A))
print("Mean squared error for Dominance: %.2f" % mean_squared_error(y_Ddev, rslt_dev_D_L))
#print("Coefficient of determination Dominance: %.2f" % r2_score(y_Dtest, rslt_D))

Linear regression - train dataset
Mean squared error for Valance: 3557564.68
Mean squared error for Arousal: 29836631.64
Mean squared error for Dominance: 0.10
