In [1]:
import pandas as pd
import numpy as np
import os

In [5]:
# define the directory paths where we saved the tweets
train = "../data/train.csv"

In [6]:
# for another implementation see https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb
from sklearn.utils import shuffle

def load_data(filename, test = False):
    
    data = pd.read_csv(filename)
    if test:
        data = data[['id','text']]
        
    else:
        data = data[['text','target']]
        
        
        
    return data

In [7]:
train_data = load_data(train)

In [8]:
# split the unsupervised data into training and validation. We will use 80 percent of the dataset 
# for training and 20 percent for validation.
from sklearn.model_selection import train_test_split

supervised_train,  supervised_test = train_test_split(train_data, test_size=0.10, random_state=42)

In [9]:
# we now prepare the training and test data. We need to divide each data set in half. One half will be tokenized
# to get sentences with the corresponding next sentence. The other half will have the same format except that 
# the sentences will come from random documents.

supervised_train = np.array_split(supervised_train, 2)
supervised_train_true = supervised_train[0]
supervised_train_random = supervised_train[1]

In [10]:
# repeat process with test data
supervised_test = np.array_split(supervised_test, 2)
supervised_test_true = supervised_test[0]
supervised_test_random = supervised_test[1]

In [11]:
# define a function that takes a list of texts, tokenizes them, returns a datafrane and has an option to shuffle
# the arrays
# the next step is to define a function that takes a list of texts, in this case a dataframe, 
# pass them to the spacy tokenizer, get the sentences, filter sentences that are too small, and then return

import spacy
import re
spacy.prefer_gpu()
import en_core_web_lg
nlp = en_core_web_lg.load()
def sentence_tokenizer(text_sequences, randomize = False):
    ''' text_sequence: A text document to be splitted into sentences'''
    
    assert isinstance(text_sequences, list)
    
    data_frame_list = []
    
    for index, text_sequence in enumerate(text_sequences):
        text_sequence = re.sub('<br />', ' ', text_sequence)
        
        list_sentences = list(nlp.pipe([text_sequence], disable= ['tagger']))
        
        # create a list of sentences and make sure they are strings, else SpaCy throws and error
        list_sentences = [list(x.sents) for x in list_sentences]
        list_sentences = [[str(y) for y in x] for x in list_sentences]

        # filter if the sentence has less than 4 words
        list_sentences =  [[y for y in x if len(y)>=4] for x in list_sentences]

        # we package all the sentences in lists of two using iter and zip two create a list of lists
        list_sentences = [list(zip(*[iter(x)]*2)) for x in list_sentences]
        list_sentences = [item for sublist in list_sentences for item in sublist]

        # conver the list into a dataframe so we can split it in half.
        data_original = pd.DataFrame(list_sentences, columns=['sentence_1','sentence_2'])
        # add a label to indicate that the second sentence corresponds to the next sentence.
        data_original['label'] = 0

        # create a column of document id
        data_original['document_id'] = index
        data_original['document_id2'] = index
        data_original['sentence_2a'] = data_original['sentence_2']
        data_frame_list.append(data_original)
    
    data = pd.concat(data_frame_list, ignore_index=True)
            
    if randomize:
        docs = data.groupby('document_id').count()
        shuffle = docs.max().max()
        data['sentence_2'] = np.roll(data['sentence_2'], shuffle)
        data['label'] = 1
              
    return data

In [13]:
# create the training data
supervised_train_true_processed = sentence_tokenizer(supervised_train_true['text'].tolist())
supervised_train_random_processed = sentence_tokenizer(supervised_train_random['text'].tolist(),
                                                        randomize = True)

In [14]:
# create the test data
supervised_test_true_processed = sentence_tokenizer(supervised_test_true['text'].tolist())
supervised_test_random_processed = sentence_tokenizer(supervised_test_random['text'].tolist(),
                                                        randomize = True)

In [15]:
# create final train data
supervised_processed_training = pd.concat([supervised_train_true_processed,
                                   supervised_train_random_processed])

In [16]:
# create final test data
supervised_processed_test = pd.concat([supervised_test_true_processed,
                                   supervised_test_random_processed])

In [22]:
supervised_processed_training[supervised_processed_training['label']==1].iloc[0:10]

Unnamed: 0,sentence_1,sentence_2,label,document_id,document_id2,sentence_2a
0,@jasoncundy05 Chelsea need to hijack Man Utd d...,I don't have enough money for all the drugs an...,1,2,2,20 mill bargain Adam driving home in Oregon US...
1,Outdoor Siren Test 2,my fall bills,1,6,6,pm :
2,: The FGCU Siren will be tested at 2pm today.,HEM-712C Automatic Blood Pressure Monitor STAN...,1,6,6,Another message will be sent when the test is ...
3,He is justifying why this quarrel would one da...,http://t.co/rqKK15uhEY,1,7,7,https://t.co/z8Ij8KTkyk
4,#Cowboys:,What a whirlwind of time it has been!,1,11,11,Wednesday's injury report: RB Lance Dunbar inj...
5,Your brain is particularly vulnerable to traum...,20 mill bargain Adam driving home in Oregon US...,1,12,12,http://t.co/KnBv2YtNWc @qz @TaraSwart @vivian_...
6,IF ANYONE WERE TO HARM THE BOYS,pm :,1,14,14,THEY WOULD GET TAKEN DOWN
7,IMMEDIATELY NOT BY SECURITY,Another message will be sent when the test is ...,1,14,14,BUT BY THE FANS REAL QUICK
8,#SanDiego #News Sinkhole Disrupts Downtown Tro...,https://t.co/z8Ij8KTkyk,1,15,15,: The incident happened
9,Wed...,Wednesday's injury report: RB Lance Dunbar inj...,1,15,15,http://t.co/RVMMuT3GvC


In [23]:
# write 
print(len(supervised_processed_training[(supervised_processed_training['sentence_2']==supervised_processed_training['sentence_2a'])&
                               supervised_processed_training['label']==1]))

# write 
print(len(supervised_processed_test[(supervised_processed_test['sentence_2']==supervised_processed_test['sentence_2a'])&
                               supervised_processed_test['label']==1]))

0
0


In [24]:
supervised_processed_training.iloc[:,0:3].to_csv('../data/fine_tune_training_dataset.csv',index=False)
supervised_processed_test.iloc[:,0:3].to_csv('../data/fine_tune_test_dataset.csv',index=False)