In [1]:
import numpy as np
import pandas as pd
# import nltk
import re
import matplotlib.pyplot as plt
from json import JSONDecoder
from functools import partial
import json
from pprint import pprint
# from bs4 import BeautifulSoup
# from nltk.tokenize import WordPunctTokenizer
# from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer, LancasterStemmer
# import mxnet as mx
# from bert_embedding import BertEmbedding

## CROWD SOURCING DATA

In [7]:
datapath = '../datasets/wordsim/'
name = datapath.split('/')[-2]
df = pd.read_csv(datapath+'original.tsv', sep='\t')
instances = np.unique(df['orig_id'])
crowd_labels = []
true_labels = []

In [8]:
for instid in instances:
    inst_df = df.loc[df['orig_id'] == instid]
    worker_labels = inst_df['response'].values
    worker_labels = worker_labels/10 if name=='wordsim' else worker_labels
    crowd_labels.append(worker_labels)
    true_labels.append(inst_df['gold'].values[0])

crowd_labels = np.round(crowd_labels)
true_labels = np.asarray(true_labels)
if name=='wordsim':
    true_labels[true_labels < 2.5] = 0
    true_labels[true_labels > 2.5] = 1
assert true_labels.size == crowd_labels.shape[0]

# save the crowd & true labels
np.save(datapath+'crowd_labels.npy', crowd_labels)
np.save(datapath+'true_labels.npy', true_labels)

df = pd.read_csv(datapath+'features.txt', sep="\t", quoting=3, header=None)
data_features = []
for instid in instances:
    inst_df = df.loc[df[0] == instid]
    data_features.append(inst_df[1].values[0].split(' '))
data_features = np.asarray(data_features).astype(float)
np.save(datapath+'data_features.npy', data_features)

In [386]:
df.groupby(['worker'])['instance'].count()[120:]

worker
120    187
121    880
122     38
123      5
124    371
      ... 
463      9
464     84
465      4
466     32
467      7
Name: instance, Length: 348, dtype: int64

## CLL DATA & PRE-PROCESSING

In [2]:
# Preprocessing steps
stemmer = LancasterStemmer()

def decodeHTMLencoding(tweets):
    decoded_tweets = tweets.applymap(lambda tweet: BeautifulSoup(tweet, 'lxml').get_text())
    return decoded_tweets

def removeStopWords(text):
    stopw = stopwords.words('english')
    words = [word for word in text.split() if len(word) > 3 and not word in stopw]
    # get stems from words
    for i in range(len(words)):
        words[i] = stemmer.stem(words[i])
    return (" ".join(words)).strip()

def cleanTweets(tweets):
    # decode tweets from html tags
    cleaned_tweets = decodeHTMLencoding(tweets)
    # remove URLs that starts with http
    cleaned_tweets = cleaned_tweets.applymap(lambda tweet: re.sub(
    r'https?:\/\/(www\.)?[-a-zA-Z0–9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0–9@:%_\+.~#?&//=]*)', '', tweet, flags=re.MULTILINE) )
    # remove URLs that does not start with http
    cleaned_tweets = cleaned_tweets.applymap(lambda tweet: re.sub(
    r'[-a-zA-Z0–9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0–9@:%_\+.~#?&//=]*)', '', tweet, flags=re.MULTILINE))
    # remove @
    cleaned_tweets = cleaned_tweets.applymap( lambda tweet: re.sub(r'@[A-Za-z0-9_]+', '', tweet, flags=re.MULTILINE) )
    # remove #
    cleaned_tweets = cleaned_tweets.applymap( lambda tweet: re.sub(r'#[A-Za-z0-9_]+', '', tweet, flags=re.MULTILINE) )
    # remove RT
    cleaned_tweets = cleaned_tweets.applymap( lambda tweet: re.sub('RT ', '', tweet, flags=re.MULTILINE) )
    # remove symbols and numbers (i.e keep letters only)
    cleaned_tweets = cleaned_tweets.applymap( lambda tweet: re.sub("[^a-zA-Z]"," ",tweet, flags=re.MULTILINE) )
    #replace consecutive non-ASCII characters with a space
    cleaned_tweets = cleaned_tweets.applymap( lambda tweet: re.sub(r'[^\x00-\x7F]+'," ",tweet.lower(), flags=re.MULTILINE) )
    
    cleaned_tweets.drop_duplicates(inplace=True)
    cleaned_tweets.replace('', np.nan, inplace=True)
    cleaned_tweets.dropna(inplace=True)
    
    return cleaned_tweets

In [3]:
def get_text_vectors(tweets, model):
    # dataset should be a pandas dataframe
    dimension = 300
    data_array = np.empty(shape=[0, dimension])
    indexes = []
    
    for i, tweet in enumerate(tweets):
        words = tweet.split()
        if len(words) !=0:
            feature = 0
            for word in words:
                try:
                    feature += model[word]
                except:
                    pass
            feature /= len(words)
            try:
                if feature.size == dimension:  
                    data_array = np.append(data_array, [feature], axis=0)
                    indexes.append(i)
            except:
                continue
    indexes = np.asarray(indexes)
    assert indexes.size == data_array.shape[0]
    return data_array, indexes

In [3]:
def remove_indices(weak_signals):
    # remove indexes of tweets that do not have coverage
    indices = np.where(np.sum(weak_signals, axis=1) == -1*weak_signals.shape[1])[0]
    weak_signals = np.delete(weak_signals, indices, axis=0)
    
    return weak_signals, indices

In [5]:
df = pd.read_csv('../datasets/glove.42B.300d.txt', sep=" ", quoting=3, header=None, index_col=0)
glove_model = {key: val.values for key, val in df.T.items()}

In [6]:
# test word vectors
from scipy import spatial
result = 1 - spatial.distance.cosine(glove_model['horrible'], glove_model['terrible'])
result

0.9358371614102348

In [2]:
def keyword_labeling(data, keywords, sentiment='pos'):
    mask = 1 if sentiment == 'pos' else 0
    weak_signals = []
    for terms in keywords:
        weak_signal = []
        for text in data:
            label=-1
            for word in terms:
                if word in text.lower():
                    label = mask
            weak_signal.append(label)
        weak_signals.append(weak_signal)
    return np.asarray(weak_signals).T

POSITIVE_LABELS =  [['good','great','nice','delight','wonderful'], 
                    ['love', 'best', 'genuine','well', 'thriller'], 
                    ['clever','enjoy','fine','deliver','fascinating'], 
                    ['super','excellent','charming','pleasure','strong'], 
                    ['fresh','comedy', 'interesting','fun','entertain', 'charm', 'clever'], 
                    ['amazing','romantic','intelligent','classic','stunning'],
                    ['rich','compelling','delicious', 'intriguing','smart']]

NEGATIVE_LABELS = [['bad','better','leave','never','disaster'], 
                   ['nothing','action','fail','suck','difficult'], 
                   ['mess','dull','dumb', 'bland','outrageous'], 
                   ['slow', 'terrible', 'boring', 'insult','weird','damn'],
#                    ['drag','awful','waste', 'flat','worse'],
                   ['drag','no','not','awful','waste', 'flat'], 
                   ['horrible','ridiculous','stupid', 'annoying','painful'], 
                   ['poor','pathetic','pointless','offensive','silly']]

In [37]:
# # Use deep models (1D cov-net or bi-lstm to run experiments)
# from keras.preprocessing import sequence
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, Bidirectional
# from keras.layers import Conv1D, GlobalMaxPooling1D
# from numpy import array
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.models import Model, Input

# # fit a tokenizer
# def create_tokenizer(lines):
#     tokenizer = Tokenizer()
#     tokenizer.fit_on_texts(lines)
#     return tokenizer
 
# # calculate the maximum document length
# def max_length(lines):
#     return max([len(s.split()) for s in lines])
 
# # encode a list of lines
# def encode_text(tokenizer, lines, length):
#     # integer encode
#     encoded = tokenizer.texts_to_sequences(lines)
#     # pad encoded sequences
#     padded = pad_sequences(encoded, maxlen=length, padding='post')
#     return padded

# def text_Conv1D(max_features, maxlen):
#     filters = 32
#     kernel_size = 8
#     hidden_dims = 10
    
#     model = Sequential()

#     # we start off with an efficient embedding layer which maps
#     # our vocab indices into embedding_dims (100) dimensions
#     model.add(Embedding(max_features,
#                         100,
#                         input_length=maxlen))
#     model.add(Dropout(0.2))

#     # we add a Convolution1D, which will learn filters
#     # word group filters of size filter_length:
#     model.add(Conv1D(filters=filters,
#                      kernel_size=kernel_size,
#                      padding='valid',
#                      activation='relu',
#                      strides=1))
#     # we use max pooling:
#     model.add(GlobalMaxPooling1D())

#     # We add a vanilla hidden layer:
#     model.add(Dense(hidden_dims))
#     model.add(Dropout(0.2))
#     model.add(Activation('relu'))
    
# #     # use bi-LSTM instead
# #     model.add(Bidirectional(LSTM(64)))
# #     model.add(Dropout(0.5))
    
#     model.add(Dense(1, activation='sigmoid'))

#     model.compile(loss='binary_crossentropy',
#                   optimizer='adam',
#                   metrics=['accuracy'])
    
#     return model

# YELP

In [3]:
datapath = '../datasets/yelp/'
size = 10000
review = pd.read_json(datapath+'yelp_review.json', lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=size)

In [4]:
# There are multiple chunks to be read
count=0
chunk_list = []
for chunk_review in review:
    # Drop columns that aren't needed
    chunk_review = chunk_review.drop(['review_id','user_id','useful','funny','cool','business_id','date'], axis=1)
    # Renaming column name to avoid conflict with business overall star rating
    # chunk_review = chunk_review.rename(columns={'stars': 'review_stars'})
    # Inner merge with edited business file so only reviews related to the business remain
    # chunk_merged = pd.merge(business_RV, chunk_review, on='business_id', how='inner')
    # Show feedback on progress
    # print(f"{chunk_merged.shape[0]} out of {size:,} related reviews")
    # chunk_list.append(chunk_merged)
    chunk_list.append(chunk_review)
    count +=1
    if count==6:
        break
# After trimming down the review file, concatenate all relevant data back to one dataframe
df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

In [5]:
csv_name = datapath+"yelp_reviews.csv"
df.to_csv(csv_name, index=False)

df.head()

Unnamed: 0,stars,text
0,2,"As someone who has worked with many museums, I..."
1,1,I am actually horrified this place is still in...
2,5,I love Deagan's. I do. I really do. The atmosp...
3,1,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g..."
4,4,"Oh happy day, finally have a Canes near my cas..."


In [4]:
positive_labels = keyword_labeling(df.text.values, POSITIVE_LABELS, sentiment='pos')
negative_labels = keyword_labeling(df.text.values, NEGATIVE_LABELS, sentiment='neg')
weak_signals = np.hstack([positive_labels, negative_labels])
weak_signals, indices = remove_indices(weak_signals)
weak_signals.shape

NameError: name 'df' is not defined

In [76]:
df = df.drop(df.index[indices])
df.reset_index(drop=True, inplace=True)
train_data = df.text.values
train_labels = np.zeros(df.shape[0])
train_labels[df.stars.values >3]=1

In [77]:
train_data = cleanTweets(df.drop(columns=['stars']))
train_labels = train_labels[train_data.index]
weak_signals = weak_signals[train_data.index]
train_data.shape, train_labels.shape

((55370, 1), (55370,))

In [None]:
train_features, train_index = get_text_vectors(train_data.values.ravel(), glove_model)
train_features.shape, train_index.shape

In [None]:
# get test data
np.random.seed(5000)
test_indexes = np.random.choice(train_index.size, 10000, replace=False)
test_labels = train_labels[test_indexes]
test_data = train_features[test_indexes]

train_data = np.delete(train_features, test_indexes, axis=0)
weak_signals = np.delete(weak_signals, test_indexes, axis=0)
train_labels = np.delete(train_labels, test_indexes)

train_data.shape,train_labels.shape,weak_signals.shape,test_labels.shape

In [None]:
# save the weak_signals signals
np.save(datapath+'weak_signals.npy', weak_signals)

# save yelp data
np.save(datapath+'data_features.npy', train_data)
np.save(datapath+'test_features.npy', test_data)

# save yelp labels
np.save(datapath+'data_labels.npy', train_labels)
np.save(datapath+'test_labels.npy', test_labels)

In [81]:
train_data.shape,train_labels.shape,weak_signals.shape,test_labels.shape

((45370, 300), (45370,), (45370, 14), (10000,))

# SST-2

In [7]:
datapath = '../datasets/sst-2/'
train_data = pd.read_csv(datapath+'sst2-train.csv')
test_data = pd.read_csv(datapath+'sst2-test.csv')
train_data.head()

Unnamed: 0,label,sentence
0,1,"A stirring, funny and finally transporting re-..."
1,0,Apparently reassembled from the cutting-room f...
2,0,They presume their audience won't sit still fo...
3,1,This is a visually stunning rumination on love...
4,1,Jonathan Parker's Bartleby should have been th...


In [8]:
positive_labels = keyword_labeling(train_data, POSITIVE_LABELS)
negative_labels = keyword_labeling(train_data, NEGATIVE_LABELS, sentiment='neg')
weak_signals = np.hstack([positive_labels, negative_labels])
weak_signals.shape

(2, 14)

In [37]:
weak_signals = remove_indices(train_data, weak_signals)
weak_signals.shape

(3998, 14)

In [38]:
train_labels = train_data.label.values
test_labels = test_data.label.values

n,m = weak_signals.shape
weak_signal_probabilities = weak_signals.T.reshape(m,n,1)

weak_signals_mask = weak_signal_probabilities >=0

from setup_model import get_validation_bounds
true_error_rates, true_precisions = get_validation_bounds(train_labels, weak_signal_probabilities, weak_signals_mask)
print("error: ", np.asarray(true_error_rates))

error:  [[0.30916844]
 [0.29194631]
 [0.26710098]
 [0.29081633]
 [0.36492375]
 [0.31952663]
 [0.19417476]
 [0.34623218]
 [0.32853026]
 [0.2513369 ]
 [0.33333333]
 [0.44829801]
 [0.15116279]
 [0.18348624]]


In [39]:
# Clean data and reset index
train_data.reset_index(drop=True, inplace=True)

# apply on train data
train_data = cleanTweets(train_data.drop(columns=['label']))
train_data = post_process_tweets(train_data)

# apply on test data
test_data = cleanTweets(test_data.drop(columns=['label']))
test_data = post_process_tweets(test_data)

print(train_data[0].shape, train_labels.shape)
print(test_data[0].shape, test_labels.shape)

(3998, 1) (3998,)
(1821, 1) (1821,)


In [18]:
train_features, train_index = get_text_vectors(train_data[0].values.ravel(), glove_model)
test_features, test_index = get_text_vectors(test_data[0].values.ravel(), glove_model)

# save sst-2 data
np.save(datapath+'data_features.npy', train_features)
np.save(datapath+'test_features.npy', test_features)

indexes = train_data[1]
indexes = indexes[train_index]
# save sst-2 labels
np.save(datapath+'data_labels.npy', train_labels[indexes])
np.save(datapath+'test_labels.npy', test_labels[test_data[1]])

# save the one-hot signals
np.save(datapath+'weak_signals.npy', weak_signals[indexes])

0       a stirring  funny and finally transporting re ...
1       they presume their audience won t sit still fo...
2       this is a visually stunning rumination on love...
3       campanella gets the tone just right    funny i...
4       a fan film that for the uninitiated plays bett...
                              ...                        
3600    painful  horrifying and oppressively tragic  t...
3601    take care is nicely performed by a quintet of ...
3602    the script covers huge  heavy topics in a blan...
3603    a seriously bad film with seriously warped log...
3604    a deliciously nonsensical comedy about a city ...
Name: sentence, Length: 3605, dtype: object

In [32]:
print("Running tests on the baselines...")
baseline_weak_labels = weak_signal_probabilities[:m, :, :]
baseline_weak_labels = np.rint(baseline_weak_labels)
mv_weak_labels = np.ones(baseline_weak_labels.shape)
mv_weak_labels[baseline_weak_labels==-1] =0
mv_weak_labels[baseline_weak_labels==0] =-1
mv_weak_labels = np.sign(np.sum(mv_weak_labels, axis=0))
mv_weak_labels[mv_weak_labels==-1] = 0

Running tests on the baselines...


In [41]:
trainLines = train_data[0].sentence
testLines = test_data[0].sentence
# create tokenizer
tokenizer = create_tokenizer(trainLines)
# calculate max document length
length = max_length(trainLines)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)
# encode data
trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)
print(trainX.shape, testX.shape)

model = text_Conv1D(vocab_size, length)

model.fit(trainX, mv_weak_labels, batch_size=32, epochs=4)
 
# evaluate model on training dataset
loss, acc = model.evaluate(trainX, train_labels, verbose=0)
print('Train Accuracy: %f' % (acc*100))
 
# evaluate model on test dataset dataset
loss, acc = model.evaluate(testX,test_labels, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Max document length: 48
Vocabulary size: 9830
(3605, 48) (1821, 48)
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Train Accuracy: 69.403606
Test Accuracy: 62.108731


# TREC-6

In [132]:
def read_trecdata(filename):    
    filepath = '../datasets/trec-6/'
    labels = []
    data = []
    for line in open(filepath+filename, encoding = "ISO-8859-1"):
        temp = line.rstrip().split(' ')
        labels.append(temp[0])
        data.append(' '.join(temp[1:]))

    return data, labels

# read in train data
train_data = read_trecdata('trec_traindata.txt')
train_data = np.asarray([train_data[0], train_data[1]]).T
train_data = pd.DataFrame(train_data, columns=['Text', 'Label'])

# read in test data
test_data = read_trecdata('trec_testdata.txt')
test_data = np.asarray([test_data[0], test_data[1]]).T
test_data = pd.DataFrame(test_data, columns=['Text', 'Label'])
train_data.shape

(5452, 2)

In [133]:
def keyword_labels(dataframe, keywords):
    questions = dataframe.Text.values
    weak_signals = []
    for term in keywords:
        weak_signal = []
        if len(np.asarray(term).shape) ==1:
            word, mask = term
            for question in questions:
                label = mask if word in question.lower() else -1
                weak_signal.append(label)
        else:
            for question in questions:
                label=-1
                for tag in term:
                    word, mask = tag
                    if word in question.lower():
                        label = mask
                weak_signal.append(label)
        weak_signals.append(weak_signal)

    return weak_signals
    
def uppercase_signals(dataframe):
    questions = dataframe.Text.values
    weak_signal = []
    for question in questions:
        words = question.split(' ')
        label= -1
        if words[2].isupper() and len(words[2])>1:
            label = 2 
        weak_signal.append(label)
    return weak_signal
    
def what_signals(dataframe):
    questions = dataframe.Text.values
    weak_signal = []
    for question in questions:
        label= -1
        if 'what' in question.lower():
            label = 1
        weak_signal.append(label)
    return weak_signal          

weak_signal_1 = [[['why',0],['come from',0], ['what is `',0]], 
                 [['was the name',1],['film',1], ['do you say',1]], 
                 ['abbreviat',2], ['who',3], [['city',4],['mountain',4]], [['when was',5],['date',5]]]

weak_signal_2 = [[['how does',0], ['origin of',0], ['do to',0]],
                 [['color of',1], ['sport',1], ['fear of',1],['favorite',1]],
                 ['stand for',2],[['writer',3],['wrote',3]], ['where',4],['year',5]]

trec_terms_0 = [[['the word',0],['explain',0], ['how can',0], ['difference',0]],
                [['name a',1], ['kind of',1], ['term for',1]]]
trec_terms_1 = [[['invent',3],['president',3]], [['state',4],['originate',4], ['location',4]], ['how many',5]]

weak_signal_4 = [[['mean',0],['known for ?',0],['meaning',0]], [['made of',1],['common',1]],
                 ['full form',2],[['name ?',3],['occupation',3]], [['nationality',4],
                 ['planet',4]], [['are there ?',5],['have ?',5],['when did',5]]]

weak_signal_5 = [[['what causes',0], ['what effect',0], ['definition',0], ['how did',0]],
                 [['favorite',1],['test',1], ['animal',1]], ['acronym',2], ['company',3],
                [['capital',4],['country',4]],['how much',5]]
#['what is a']
# get keyword labelings
weak_signal_1 = np.asarray(keyword_labels(train_data, weak_signal_1)).T
weak_signal_2 = np.asarray(keyword_labels(train_data, weak_signal_2)).T
weak_signal_3 = keyword_labels(train_data, trec_terms_0)
weak_signal_3.append(uppercase_signals(train_data))
weak_signal_4 = np.asarray(keyword_labels(train_data, weak_signal_4)).T
weak_signal_5 = np.asarray(keyword_labels(train_data, weak_signal_5)).T
# print(np.asarray(uppercase_signals(train_data)).shape)
# print(np.asarray(trec_terms_2[0]).shape)

weak_signal_3 = np.vstack([weak_signal_3, keyword_labels(train_data, trec_terms_1)]).T
weak_signals = np.hstack([weak_signal_1, weak_signal_2, weak_signal_3, weak_signal_4, weak_signal_5])

In [134]:
datapath = '../datasets/trec-6/'
weak_signals, indices = remove_indices(weak_signals)
weak_signals.shape

(2969, 30)

In [135]:
# split the array into two separate signals
weak_signals = np.asarray(np.split(weak_signals, int(weak_signals.shape[1]/6), axis=1))

# create_one_hot_signals
one_hot_signals = np.zeros(weak_signals.shape)
one_hot_signals[weak_signals==-1] = -1
one_hot_signals[weak_signals!=-1] = 1

In [136]:
train_data = train_data.drop(train_data.index[indices])
print("coverage: ", np.sum(one_hot_signals != -1, axis=1))

train_labels = train_data.Label.values
test_labels = test_data.Label.values

from tensorflow.python.keras.utils import to_categorical
num_classes = weak_signals.shape[2]
train_labels = to_categorical(train_labels, num_classes)
test_labels = to_categorical(test_labels, num_classes)

weak_signals_mask = one_hot_signals >=0

from setup_model import get_validation_bounds
true_error_rates, true_precisions = get_validation_bounds(train_labels, one_hot_signals, weak_signals_mask)
print("error: ", np.asarray(true_error_rates))


coverage:  [[146 108  21 632 152  77]
 [101 116  44  52 286 114]
 [116  93  75 148 135 323]
 [109  47   1  48  16  92]
 [ 60  87   4  54 158  61]]
error:  [[0.10273973 0.37037037 0.04761905 0.03481013 0.18421053 0.07792208]
 [0.02970297 0.14655172 0.06818182 0.07692308 0.0979021  0.24561404]
 [0.04310345 0.22580645 0.45333333 0.24324324 0.33333333 0.        ]
 [0.20183486 0.34042553 0.         0.29166667 0.0625     0.04347826]
 [0.01666667 0.28735632 0.         0.18518519 0.08860759 0.06557377]]


In [137]:
# Clean data and reset index
train_data.reset_index(drop=True, inplace=True)

# apply on train data
train_data = cleanTweets(train_data.drop(columns=['Label']))
train_labels = train_labels[train_data.index]
one_hot_signals = one_hot_signals[:,train_data.index,:]
assert train_data.shape[0] == train_labels.shape[0]

# apply on test data
test_data = cleanTweets(test_data.drop(columns=['Label']))

print(train_data.shape, train_labels.shape)
print(test_data.shape, test_labels.shape)

(2946, 1) (2946, 6)
(500, 1) (500, 6)


In [138]:
train_features, train_index = get_text_vectors(train_data.values.ravel(), glove_model)
test_features, test_index = get_text_vectors(test_data.values.ravel(), glove_model)

print(train_features.shape, train_index.shape)
print(test_features.shape, test_index.shape)

(2946, 300) (2946,)
(500, 300) (500,)


In [139]:
# save the one-hot signals
np.save(datapath+'weak_signals.npy', one_hot_signals)

# save trec-6 data
np.save(datapath+'data_features.npy', train_features)
np.save(datapath+'test_features.npy', test_features)

# save trec-6 labels
np.save(datapath+'data_labels.npy', train_labels)
np.save(datapath+'test_labels.npy', test_labels)

# IMDB Dataset

In [102]:
datapath = '../datasets/imbd/'
df = pd.read_csv(datapath+'IMDB Dataset.csv')

# apply on train data
cleaned_data = cleanTweets(df.drop(columns=['sentiment']))
indexes = cleaned_data.index.values
df.shape, indexes.size

((50000, 2), 49580)

In [103]:
n = indexes.size
# get test data
np.random.seed(50)
test_indexes = np.random.choice(indexes, int(n*0.2), replace=False)
test_labels = np.zeros(test_indexes.size)
test_labels[df.sentiment.values[test_indexes]=='positive'] = 1
test_data = df.review.values[test_indexes]

train_indexes = np.delete(indexes, [np.where(indexes == i)[0][0] for i in test_indexes])
train_labels = np.zeros(train_indexes.size)
train_labels[df.sentiment.values[train_indexes]=='positive'] = 1
train_data = df.review.values[train_indexes]

print(train_data.shape, train_labels.shape)
print(test_data.shape, test_labels.shape)

(39664,) (39664,)
(9916,) (9916,)


In [104]:
positive_labels = keyword_labeling(train_data, [['good'],['wonderful'],['great'],['amazing'],['excellent']], sentiment='pos')
negative_labels = keyword_labeling(train_data, [['bad'],['horrible'],['sucks'],['awful'],['terrible']], sentiment='neg')
# signals = np.add(positive_labels, negative_labels)
# weak_signals = np.ones(signals.shape)
# weak_signals[signals==-1] =0 
# weak_signals[signals==0] =-1
weak_signals = np.hstack([positive_labels, negative_labels])
weak_signals, indices = remove_indices(weak_signals)
weak_signals.shape

(29187, 10)

In [105]:
# experimenting with bert embeddings
bert_abstract = """We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers.
 Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations by jointly conditioning on both left and right context in all layers.
 As a result, the pre-trained BERT representations can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. 
BERT is conceptually simple and empirically powerful. 
It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE benchmark to 80.4% (7.6% absolute improvement), MultiNLI accuracy to 86.7 (5.6% absolute improvement) and the SQuAD v1.1 question answering Test F1 to 93.2 (1.5% absolute improvement), outperforming human performance by 2.0%."""
# sentences = bert_abstract.split('\n')
# bert_embedding = BertEmbedding()
# result = bert_embedding(sentences)

# sentences = df.review.values
# result = bert_embedding(sentences)

# data_features = []
# for embedding in result:
#     data_features.append(np.mean(embedding[1],axis=0))
# # save imbd data
# np.save(datapath+'data_features.npy', np.asarray(data_features))

In [106]:
# add signals not covered to test data
test_data = np.append(test_data, train_data[indices])
test_labels = np.append(test_labels, train_labels[indices])

# delete train data not covered by weak signals
train_data = np.delete(train_data, indices, axis=0)
train_labels = np.delete(train_labels, indices)

# get data features
train_features, train_index = get_text_vectors(train_data, glove_model)
test_features, test_index = get_text_vectors(test_data, glove_model)

print(train_index.size, train_data.shape[0])
test_index.size, test_labels.size

29182 29187


(20392, 20393)

In [108]:
print("Running tests on the baselines...")
baseline_weak_labels = weak_signals.copy()
mv_weak_labels = np.ones(baseline_weak_labels.shape)
mv_weak_labels[baseline_weak_labels==-1] =0
mv_weak_labels[baseline_weak_labels==0] =-1
mv_weak_labels = np.sign(np.sum(mv_weak_labels, axis=1))
mv_weak_labels[mv_weak_labels==-1] = 0

np.mean(mv_weak_labels[train_index]==train_labels[train_index])

Running tests on the baselines...


0.7361387156466315

In [107]:
# save imbd data
np.save(datapath+'data_features.npy', train_features)
np.save(datapath+'test_features.npy', test_features)

# save imbd labels
np.save(datapath+'data_labels.npy', train_labels[train_index])
np.save(datapath+'test_labels.npy', test_labels[test_index])

# save the weak_signals
np.save(datapath+'weak_signals.npy', weak_signals[train_index])