<a href="https://colab.research.google.com/github/hyeamykim/nlp-problems/blob/master/sentiment_analysis_debiased_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment analysis with biased and debiased word embeddings 



> Prepared by Hye Yeon Kim

> Based on Reponsibly tutorial on exploring gender bias in word embeddings

> https://learn.responsibly.ai/word-embedding



> Last modified: 16/8/2021



## Pre-processing

### Install Responsibly package



In [None]:
%pip install --user responsibly



In [None]:
import responsibly

# You should get '0.1.3'
responsibly.__version__

'0.1.3'

### Import sentiment analysis data from SemEval2018 task 1

In [None]:
%%bash

wget https://learn.responsibly.ai/word-embedding/data/SemEval2018-Task1-all-data.zip \
     -O SemEval2018-Task1-all-data.zip -q

unzip -qq -o SemEval2018-Task1-all-data.zip -d ./data

In [None]:
import pandas as pd


train_df = pd.read_csv('./data/SemEval2018-Task1-all-data/English/V-reg/2018-Valence-reg-En-train.txt',
                       sep='\t', index_col=0)
dev_df = pd.read_csv('./data/SemEval2018-Task1-all-data/English/V-reg/2018-Valence-reg-En-dev.txt',
                       sep='\t', index_col=0)
test_df = pd.read_csv('./data/SemEval2018-Task1-all-data/English/V-reg/2018-Valence-reg-En-test-gold.txt',
                       sep='\t', index_col=0)

### Data Inspection and Pre-processing on labels

In [None]:
# A few examples

train_df.head()

Unnamed: 0_level_0,Tweet,Affect Dimension,Intensity Score
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-En-30153,@liamch88 yeah! :) playing well,valence,0.6
2017-En-40929,At least I don't have a guy trying to discoura...,valence,0.484
2017-En-22012,UPLIFT: If you're still discouraged it means y...,valence,0.563
2017-En-30837,"...at your age, the heyday in the blood is tam...",valence,0.45
2017-En-30838,i was so embarrassed when she saw us i was lik...,valence,0.233


In [None]:
train_df.isnull().sum()

Tweet               0
Affect Dimension    0
Intensity Score     0
dtype: int64

In [None]:
# Convert all the labels from real numbers into boolean values,
# setting the threshold at 0.5, and creating a new column named
# `label`

train_df['label'] = train_df['Intensity Score'] > 0.5 
dev_df['label'] = dev_df['Intensity Score'] > 0.5
test_df['label'] = test_df['Intensity Score'] > 0.5

In [None]:
# helper function to convert categorical label to binary label
def int_cast(s):
  return int(s == True)

In [None]:
train_df['binary label'] = train_df['label'].apply(lambda x: int_cast(x))

In [None]:
dev_df['binary label'] = dev_df['label'].apply(lambda x: int_cast(x))
test_df['binary label'] = test_df['label'].apply(lambda x: int_cast(x))

In [None]:
train_df.head()

Unnamed: 0_level_0,Tweet,Affect Dimension,Intensity Score,label,binary label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-En-30153,@liamch88 yeah! :) playing well,valence,0.6,True,1
2017-En-40929,At least I don't have a guy trying to discoura...,valence,0.484,False,0
2017-En-22012,UPLIFT: If you're still discouraged it means y...,valence,0.563,True,1
2017-En-30837,"...at your age, the heyday in the blood is tam...",valence,0.45,False,0
2017-En-30838,i was so embarrassed when she saw us i was lik...,valence,0.233,False,0


### Prepare word embeddings: Word2Vec trained with Google News

In [None]:
%%bash

wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz \
     -O GoogleNews-vectors-negative300.bin.gz -q

In [None]:
from gensim.models import KeyedVectors

# Limit vocabulary to top-500K most frequent words
VOCAB_SIZE = 500000

# Load the word2vec
w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz',
                                              binary=True,
                                              limit=VOCAB_SIZE)

In [None]:
# Get the vector embedding for a word
#w2v_model['home']

In [None]:
len(w2v_model['home'])

300

In [None]:
# Check whether there is an embedding for a word
'bazinga' in w2v_model

False

### Pre-processing on word features

In [None]:
from gensim.parsing.preprocessing import (preprocess_string,
                                          strip_tags,
                                          strip_punctuation,
                                          strip_multiple_whitespaces,
                                          strip_numeric,
                                          remove_stopwords)


# We pick a subset of the default filters,
# in particular, we do not take
# strip_short() and stem_text().
FILTERS = [strip_punctuation,
           strip_tags,
           strip_multiple_whitespaces,
           strip_numeric,
           remove_stopwords]

# See how the sentece is transformed into tokes (words)
preprocess_string('This is a "short" text!', FILTERS)

['This', 'short', 'text']

In [None]:
import numpy as np

In [None]:
def generate_text_features(text, w2v):

    processed_text = preprocess_string(text, FILTERS)
    
    words = [word for word in processed_text if word in w2v_model]

    if len(words) >= 1:
        return np.mean(w2v_model[words], axis=0)
    else:
        return []
    

In [None]:
train_df['Embeddings'] = train_df['Tweet'].apply(lambda x: generate_text_features(x, w2v_model))

In [None]:
dev_df['Embeddings'] = dev_df['Tweet'].apply(lambda x: generate_text_features(x, w2v_model))
test_df['Embeddings'] = test_df['Tweet'].apply(lambda x: generate_text_features(x, w2v_model))

In [None]:
num_emp = 0
num_row = 0
for emb in train_df['Embeddings']:
  num_row+=1
  if emb ==[]:
    num_emp +=1
    print(num_row)


947




In [None]:
num_emp

1

In [None]:
train_df.describe

<bound method NDFrame.describe of                                                            Tweet  ...                                         Embeddings
ID                                                                ...                                                   
2017-En-30153                   @liamch88 yeah! :) playing well   ...  [0.1899414, 0.12072754, 0.008056641, 0.2116699...
2017-En-40929  At least I don't have a guy trying to discoura...  ...  [0.037312824, 0.05788676, -0.009847005, 0.1191...
2017-En-22012  UPLIFT: If you're still discouraged it means y...  ...  [0.058245342, 0.033650715, 0.059013367, -0.016...
2017-En-30837  ...at your age, the heyday in the blood is tam...  ...  [0.025488282, 0.1763672, -0.06201172, -0.03115...
2017-En-30838  i was so embarrassed when she saw us i was lik...  ...  [0.0058166506, 0.11234741, 0.020368958, 0.0539...
...                                                          ...  ...                                                ..

In [None]:
train_df.iloc[946]

Tweet               @simmy_hanley @Schrise also a 
Affect Dimension                           valence
Intensity Score                              0.452
label                                        False
binary label                                     0
Embeddings                                      []
Name: 2017-En-31529, dtype: object

In [None]:
train_df.index[946]

'2017-En-31529'

In [None]:
train_df.drop(['2017-En-31529'], inplace=True)
train_df.iloc[946]

Tweet                      gifs on iOS10 messaging app are hilarious.
Affect Dimension                                              valence
Intensity Score                                                 0.654
label                                                            True
binary label                                                        1
Embeddings          [0.15008545, -0.22180176, -0.19128418, 0.05357...
Name: 2017-En-31268, dtype: object

In [None]:
dev_df.describe

<bound method NDFrame.describe of                                                            Tweet  ...                                         Embeddings
ID                                                                ...                                                   
2018-En-02354  So @Ryanair site crashes everytime I try to bo...  ...  [0.07304382, 0.051412582, 0.03186035, 0.063056...
2018-En-00124  Theme of week: Ask the Lord for strength &amp;...  ...  [0.042944226, 0.1165717, -0.000113351, -0.0062...
2018-En-02603  @F1 Why announcing so late, it will be hard to...  ...  [0.052856445, 0.08341217, 0.075790405, 0.11111...
2018-En-00450  The greatest happiness is seeing someone you l...  ...  [0.03924942, 0.103393555, 0.053723335, 0.12658...
2018-En-00502  omg so grateful to have an education but ive b...  ...  [-0.078271486, 0.09213257, -0.052264404, 0.120...
...                                                          ...  ...                                                ..

In [None]:
num_emp = 0
num_row = 0
for emb in dev_df['Embeddings']:
  num_row+=1
  if emb ==[]:
    num_emp +=1
    print(num_row)



In [None]:
dev_df.iloc[375]

Tweet               @iamjpk @NameisNani what the phd!  eesav gatti...
Affect Dimension                                              valence
Intensity Score                                                 0.714
label                                                            True
binary label                                                        1
Embeddings                                                         []
Name: 2018-En-02255, dtype: object

In [None]:
dev_df.index[375]

'2018-En-02255'

In [None]:
dev_df.drop(['2018-En-02255'], inplace=True)
dev_df.iloc[375]

Tweet               @candycmarketing What happens if you don't wan...
Affect Dimension                                              valence
Intensity Score                                                 0.783
label                                                            True
binary label                                                        1
Embeddings          [-0.0006781684, 0.07823096, 0.031304255, 0.129...
Name: 2018-En-00321, dtype: object

In [None]:
test_df.describe

<bound method NDFrame.describe of                                                            Tweet  ...                                         Embeddings
ID                                                                ...                                                   
2018-En-01964                           Gm and have a  #Tuesday!  ...  [0.072021484, 0.13427734, 0.17773438, -0.07031...
2018-En-01539  @realDonaldTrump But you have a lot of time fo...  ...  [0.059179686, 0.1439209, -0.0057006837, 0.0945...
2018-En-04235  I graduated yesterday and already had 8 family...  ...  [0.028259277, 0.042824484, -0.035000887, 0.048...
2018-En-03711  @jaimitoelcrack7 Seriously...I've been sitting...  ...  [0.037365723, 0.11878662, 0.040841676, 0.10423...
2018-En-01177  Whether my glass is half empty or its half ful...  ...  [-0.04997762, 0.070887245, 0.012512207, 0.1172...
...                                                          ...  ...                                                ..

In [None]:
num_emp = 0
num_row = 0
for emb in test_df['Embeddings']:
  num_row+=1
  if emb ==[]:
    num_emp +=1
    print(num_row)

626
678
898




In [None]:
test_df.index[[626,678,898]]

Index(['2018-En-03503', '2018-En-03216', '2018-En-01281'], dtype='object', name='ID')

In [None]:
test_df.drop(['2018-En-03503', '2018-En-03216', '2018-En-01281'], inplace=True)
test_df.iloc[[626,678,898]]

Unnamed: 0_level_0,Tweet,Affect Dimension,Intensity Score,label,binary label,Embeddings
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-En-01351,@alb353 @thehugheslady @TelegraphNews @paulwes...,valence,0.391,False,0,"[0.15799968, -0.0013020834, 0.12556966, 0.0416..."
2018-En-02337,"Mothafuckas wanna adopt the dark, but I was bo...",valence,0.3,False,0,"[0.04477539, 0.0892273, 0.048217773, 0.1174804..."
2018-En-04060,@dp_srk_rk It's going 3 only. He has berdych t...,valence,0.468,False,0,"[-0.00423584, 0.072229005, 0.084509276, 0.0425..."


In [None]:
train_df['Embeddings'].tail()

ID
2017-En-30999    [0.026423644, 0.013881429, 0.020096842, 0.0671...
2017-En-40498    [0.036458332, 0.041422527, -0.0538737, 0.07812...
2017-En-30436    [0.12270508, 0.048535157, 0.13098145, 0.085293...
2017-En-30584    [0.16064453, 0.015258789, 0.08691406, 0.175048...
2017-En-21917    [0.06644694, 0.1110433, 0.059783936, 0.0276082...
Name: Embeddings, dtype: object

In [None]:
train_df['Embeddings']

### Converting word embeddings from pandas series to numpy arrays



In [None]:
# helper function that takes word embeddings in dataframe and returns numpy ndarrays

def convert_to_array(df):
  embeddings = []
  for index, val in df['Embeddings'].items():
    embeddings.append(val)
  arr = np.array(embeddings[0])
  for elem in embeddings[1:]:
    arr2 = np.array(elem)
    if len(arr2)==300:
      arr = np.vstack((arr,arr2))
  
  return arr

In [None]:
def convert_to_array(df, model_name):
  embeddings = []
  for index, val in df['Embeddings_'+model_name].items():
    embeddings.append(val)
  arr = np.array(embeddings[0])
  for elem in embeddings[1:]:
    arr2 = np.array(elem)
    if len(arr2)==300:
      arr = np.vstack((arr,arr2))
  
  return arr

## Modeling

### Model #1: Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
train_embeddings = convert_to_array(train_df)

In [None]:
clf_lr = LogisticRegression(random_state=0, penalty="l2", C=0.1).fit(train_embeddings, train_df['binary label'])

In [None]:
dev_embeddings = convert_to_array(dev_df)

In [None]:
# clf_lr.predict(dev_embeddings)

In [None]:
# clf_lr.predict_proba(dev_embeddings)

In [None]:
# validation score
clf_lr.score(dev_embeddings, dev_df['binary label'])

0.765625

#### Scores

In [None]:
test_embeddings = convert_to_array(test_df)

In [None]:
test_embeddings.shape

(931, 300)

In [None]:
len(test_embeddings)

931

In [None]:
len(test_df['binary label'])

934

In [None]:
# test score
clf_lr.score(test_embeddings, test_df['binary label'][1:len(test_embeddings)+1])

0.5091299677765844

### Model #2: Support Vector Machine

In [None]:
from sklearn import svm

clf_svm = svm.SVC()
clf_svm.fit(train_embeddings, train_df['binary label'])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# validation score
clf_svm.score(dev_embeddings, dev_df['binary label'])

0.7834821428571429

In [None]:
clf_svm.score(test_embeddings, test_df['binary label'][1:len(test_embeddings)+1])

0.5005370569280344

### Model #3: Using TextBlob

In [None]:
from textblob import TextBlob

In [None]:
# compute sentiment scores (polarity) and labels
sentiment_scores_tb = [round(TextBlob(tweet).sentiment.polarity, 3) for tweet in train_df['Tweet']]
sentiment_category_tb = ['positive' if score > 0 
                             else 'negative' if score < 0 
                                 else 'neutral' 
                                     for score in sentiment_scores_tb]

In [None]:
# sentiment statistics per news category
df = pd.DataFrame([list(train_df['Tweet']), sentiment_scores_tb, sentiment_category_tb]).T
df.columns = ['Tweet', 'sentiment_score', 'sentiment_category']
df['sentiment_score'] = df.sentiment_score.astype('float')
#df.groupby(by=['news_category']).describe()

In [None]:
df.head()

Unnamed: 0,Tweet,sentiment_score,sentiment_category
0,@liamch88 yeah! :) playing well,0.5,positive
1,At least I don't have a guy trying to discoura...,0.0,neutral
2,UPLIFT: If you're still discouraged it means y...,-0.562,negative
3,"...at your age, the heyday in the blood is tam...",-0.217,negative
4,i was so embarrassed when she saw us i was lik...,0.4,positive


### Model #4: RNN, LSTM

In [None]:
%tensorflow_version 2.x  # this line is not required unless you are in a notebook
# from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
#import numpy as np

#VOCAB_SIZE = 88584

MAXLEN = 250
BATCH_SIZE = 64

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.x  # this line is not required unless you are in a notebook`. This will be interpreted as: `2.x`.


TensorFlow 2.x selected.


In [None]:
# usually a sequence is padded to be kept at same length, but in this case, not necessary

#train_data = sequence.pad_sequences(train_data, MAXLEN)
#test_data = sequence.pad_sequences(test_data, MAXLEN)

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(300, 64),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

In [None]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          19200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 52,289
Trainable params: 52,289
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=['acc'])

history = model.fit(train_embeddings, train_df['binary label'], epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
results = model.evaluate(test_embeddings, test_df['binary label'].values[1:len(test_embeddings)+1])
print(results)

[0.6942142844200134, 0.48549947142601013]


## With different word embeddings: FastText


In [None]:
from gensim import downloader

In [None]:
fasttext_path = downloader.load('fasttext-wiki-news-subwords-300', return_path=True)
print(fasttext_path)

fasttext_model = KeyedVectors.load_word2vec_format(fasttext_path)

/root/gensim-data/fasttext-wiki-news-subwords-300/fasttext-wiki-news-subwords-300.gz


In [None]:
train_df['Embeddings_fasttext'] = train_df['Tweet'].apply(lambda x: generate_text_features(x, fasttext_model))
dev_df['Embeddings_fasttext'] = dev_df['Tweet'].apply(lambda x: generate_text_features(x, fasttext_model))
test_df['Embeddings_fasttext'] = test_df['Tweet'].apply(lambda x: generate_text_features(x, fasttext_model))

In [None]:
train_df['Embeddings_fasttext'].head()

ID
2017-En-30153    [0.1899414, 0.12072754, 0.008056641, 0.2116699...
2017-En-40929    [0.037312824, 0.05788676, -0.009847005, 0.1191...
2017-En-22012    [0.058245342, 0.033650715, 0.059013367, -0.016...
2017-En-30837    [0.025488282, 0.1763672, -0.06201172, -0.03115...
2017-En-30838    [0.0058166506, 0.11234741, 0.020368958, 0.0539...
Name: Embeddings_fasttext, dtype: object

In [None]:
def convert_to_array(df, model_name):
  embeddings = []
  for index, val in df['Embeddings_'+model_name].items():
    embeddings.append(val)
  arr = np.array(embeddings[0])
  for elem in embeddings[1:]:
    arr2 = np.array(elem)
    if len(arr2)==300:
      arr = np.vstack((arr,arr2))
  
  return arr

In [None]:
model_name = 'fasttext'

In [None]:
train_embeddings = convert_to_array(train_df, model_name)

In [None]:
len(train_embeddings)

1180

### Results with FastText

In [None]:
clf = LogisticRegression(random_state=0).fit(train_embeddings, train_df['binary label'].values[1:])

In [None]:
dev_embeddings = convert_to_array(dev_df, model_name)
test_embeddings = convert_to_array(test_df, model_name)

In [None]:
clf.score(train_embeddings,train_df['binary label'].values[1:])

0.6974576271186441

In [None]:
clf.score(dev_embeddings,dev_df['binary label'].values[1:])

0.5580357142857143

In [None]:
clf.score(test_embeddings,test_df['binary label'].values[1:len(test_embeddings)+1])

0.5192719486081371

## With different word embedding: debiased Word2Vec



In [None]:
from responsibly.we import GenderBiasWE, most_similar

In [None]:
w2v_gender_bias_we = GenderBiasWE(w2v_model)

In [None]:
train_df['Embeddings_w2v_gender_bias'] = train_df['Tweet'].apply(lambda x: generate_text_features(x, w2v_gender_bias_we))
dev_df['Embeddings_w2v_gender_bias'] = dev_df['Tweet'].apply(lambda x: generate_text_features(x, w2v_gender_bias_we))
test_df['Embeddings_w2v_gender_bias'] = test_df['Tweet'].apply(lambda x: generate_text_features(x, w2v_gender_bias_we))

In [None]:
model_name = 'w2v_gender_bias'
train_embeddings = convert_to_array(train_df, model_name)

### Results with Debiased Word2Vec

In [None]:
clf = LogisticRegression(random_state=0).fit(train_embeddings, train_df['binary label'].values[1:])

In [None]:
dev_embeddings = convert_to_array(dev_df, model_name)
test_embeddings = convert_to_array(test_df, model_name)

In [None]:
clf.score(dev_embeddings,dev_df['binary label'].values[1:])

0.5625

In [None]:
clf.score(test_embeddings,test_df['binary label'].values[1:len(test_embeddings)+1])

0.5246252676659529

In [None]:
w2v_gender_debias = w2v_gender_bias_we.debias(method='neutralize', inplace=False)

In [None]:
train_df['Embeddings_w2v_gender_debias'] = train_df['Tweet'].apply(lambda x: generate_text_features(x, w2v_gender_debias))
dev_df['Embeddings_w2v_gender_debias'] = dev_df['Tweet'].apply(lambda x: generate_text_features(x, w2v_gender_debias))
test_df['Embeddings_w2v_gender_debias'] = test_df['Tweet'].apply(lambda x: generate_text_features(x, w2v_gender_debias))

In [None]:
model_name = 'w2v_gender_debias'
train_embeddings = convert_to_array(train_df, model_name)

In [None]:
clf = LogisticRegression(random_state=0).fit(train_embeddings, train_df['binary label'].values[1:])

In [None]:
dev_embeddings = convert_to_array(dev_df, model_name)
test_embeddings = convert_to_array(test_df, model_name)

In [None]:
clf.score(dev_embeddings,dev_df['binary label'].values[1:])

0.5625

In [None]:
clf.score(test_embeddings,test_df['binary label'].values[1:len(test_embeddings)+1])

0.5246252676659529