In [None]:
#!pip install transformers

# Libraries

In [3]:
import numpy as np
import pandas as pd
import os
import warnings
from tqdm import tqdm
import random
import torch 
from torch import nn
import torch.optim as optim
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
import tokenizers
from transformers import RobertaModel, RobertaForQuestionAnswering, RobertaConfig, RobertaTokenizer

# Silence every Python warning for the rest of the session (this also hides
# deprecation notices raised by the libraries imported above).
warnings.filterwarnings('ignore')
# Master switch: cells guarded by `if TRAIN:` (data preparation and training)
# only execute when this is True.
TRAIN = False

In [None]:
# Scratch cell: try out the basics of RobertaForQuestionAnswering.
# https://huggingface.co/transformers/model_doc/roberta.html#robertaforquestionanswering
# Disabled by default; set Test to True to run it (downloads 'roberta-base').
Test =False
if Test:
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # First example; these three names are overwritten (text, selected_text)
    # or never used (sent_text) further down.
    text = "Jim is happy, but not me" 
    sent_text = 'negative ' + "Jim is happy, but not me"
    selected_text =  "happy, but not me"

    # Second example: this is the one that is actually tokenized.
    text = ["I win"]
    selected_text =  "I win"

    inputs = tokenizer(text, return_tensors='pt', pad_to_max_length=True, truncation=True, max_length=10)
    # Only consumed by the commented-out labelled call below.
    start_positions = torch.tensor([1])
    end_positions = torch.tensor([2])

    model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
    outputs = model(**inputs)
    # outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
    # NOTE(review): no start/end positions are passed above, so `outputs.loss`
    # is presumably None here - confirm against the transformers docs.
    loss = outputs.loss
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    print(inputs, '\n', outputs)
# outputs
# tokenizer.encode('negative'+"Jim is happy, but not me", return_tensors='pt')
# inputs
# outputs.start_logits.squeeze(0)

# Seed

In [None]:
def seed_everything(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, plus all CUDA devices
    when a GPU is available) and switches cuDNN to deterministic mode.

    Args:
        seed_value: integer seed applied to all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Hash randomisation of the running interpreter is fixed at start-up, so
    # this only reaches processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # These are plain settings and are harmless without a GPU. `benchmark`
    # must be off: the auto-tuner may pick different kernels from run to run,
    # which defeats `deterministic = True`.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# One fixed seed for the whole notebook: applied to the RNGs here and reused
# as `random_state` by the data splitters further down.
seed = 42
seed_everything(seed)

# Data Loader

### Add  token_len, start_idx, end_idx to training data


In [None]:
if TRAIN:
    # `cleaned` selects the pre-cleaned CSV; set it to False to read the raw file.
    # NOTE(review): these paths start with 'input/' while the inference cells
    # use '../input/' - confirm which working directory is intended.
    cleaned = True
    train_csv = ('input/tweet-sentiment-extraction/clean_train.csv' if cleaned
                 else 'input/tweet-sentiment-extraction/train.csv')
    train_df = pd.read_csv(train_csv)
    # Force both text columns to str so that string methods work on every row.
    for column in ('text', 'selected_text'):
        train_df[column] = train_df[column].astype(str)

In [None]:
if TRAIN:
    # Computed only once: skipped when the CSV already carries a token_len column.
    if 'token_len' not in train_df:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        def token_length(row):
            """Return how many input ids the tokenizer produces for ``row.text``.

            The text is lower-cased, whitespace-collapsed and prefixed with a
            single space before tokenization.
            """
            normalized = " " + " ".join(row.text.lower().split())
            return len(tokenizer(normalized)['input_ids'])

        train_df['token_len'] = train_df.apply(token_length, axis=1)
        print('max train token length: ', train_df.token_len.max())

In [None]:
if TRAIN:
    if 'start_idx' not in train_df:
        # token level index
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        def find_idx(row, p_token=False):
            """Locate ``row.selected_text`` inside ``row.text`` at token level.

            Both strings are lower-cased, whitespace-collapsed and prefixed
            with a space, then tokenized without padding. The first and last
            ids of the tokenized selection are dropped before matching
            (presumably the wrapping special tokens - confirm against the
            tokenizer).

            Args:
                row: object with ``text`` and ``selected_text`` attributes.
                p_token: when True, print both id lists for debugging.

            Returns:
                ``(start_idx, end_idx)``, inclusive positions in the text's
                id list, or ``(0, 0)`` when the selection cannot be found
                (for example when it cuts a word in half) or is empty.
            """
            texto = " " + " ".join(row.text.lower().split())
            sel_to = " " + " ".join(row.selected_text.lower().split())
            text = tokenizer(texto)['input_ids']
            sel_t = tokenizer(sel_to)['input_ids']
            if p_token:
                print(text, '\n', sel_t)

            span = sel_t[1:-1]
            if not span:
                # Nothing to search for. Without this guard the scan below
                # would match at the closing token and return end < start.
                return 0, 0

            # For very long lists see
            # https://stackoverflow.com/questions/7100242/python-numpy-first-occurrence-of-subarray
            # A rolling window is enough for tweets. Position 0 is skipped,
            # which is what lets 0 double as the "not found" marker.
            width = len(span)
            for i in range(1, len(text) - width):
                if text[i:i + width] == span:
                    return i, i + width - 1

            # Error in selected_text; this should be corrected using a
            # character level index.
            # idea 1: remove incomplete words in selected_text
            # idea 2: complete the words
            # idea 3: remove these rows
            return 0, 0

        # Tokenize each row once and fill both columns from the same result
        # (previously find_idx ran twice per row, once per column).
        spans = [find_idx(row) for row in train_df.itertuples(index=False)]
        train_df['start_idx'] = [start for start, _ in spans]
        train_df['end_idx'] = [end for _, end in spans]

    #=============================================================
    # character level index (unused alternative)
    # def find_start(row):
    #     return row.text.find(row.selected_text)
    # def find_end(row):
    #     return  row.start_idx + len(row.selected_text)
    # if 'start_idx' not in train_df:
    #     train_df['start_idx'] = train_df.apply(lambda row: row.text.find(row.selected_text), axis=1)  # along column
    #     train_df['end_idx'] = train_df.apply(find_end, axis=1)



### **Error in training labels** 
?? **convert_tokens_to_string** might solve the subwords error ??

In [None]:
# Scratch cell: inspect rows for which find_idx returned start_idx == 0,
# i.e. the selected_text could not be located in the text.
# Needs `train_df` and `find_idx` from the TRAIN cells above.
Test =False
if Test:
    error_train_df = train_df[train_df.start_idx ==0]
    # Has no effect here: only the last expression of a cell is displayed.
    error_train_df
# error_train_df.to_csv('input/tweet-sentiment-extraction/error_train.csv')
    # First failing row, with its token ids printed by find_idx.
    print(error_train_df.iloc[0].text, '\n', error_train_df.iloc[0].selected_text)
    find_idx(error_train_df.iloc[0], p_token=True)
    print('**----- selected text wrong -----**')


    # Row at position 2593 of the failing rows.
    print(error_train_df.iloc[2593].text, '\n', error_train_df.iloc[2593].selected_text)
    find_idx(error_train_df.iloc[2593], p_token=True)
    print('**----- selected text missing a parenthesis -----**')

- The rows with label errors are dropped because
* the errors are not structured and there is no easy fix
* dropping 2594/27481 rows (<10%) does not hurt too much

In [None]:
if TRAIN:
    # Keep only the rows whose span was found (find_idx marks failures with
    # start_idx == 0), renumber them from 0, then release the unfiltered frame.
    train_df_clean = train_df.loc[train_df.start_idx != 0].reset_index(drop=True)
    del train_df

### Torch data class

In [None]:
class TweetDataset(torch.utils.data.Dataset):
    """Torch dataset that tokenizes one tweet (plus its sentiment) per item.

    Args:
        df: DataFrame with ``text`` and ``sentiment`` columns. When it also
            has a ``selected_text`` column the rows are treated as labeled
            and must carry ``token_len``, ``start_idx`` and ``end_idx``
            columns as well.
        max_len: every item is padded or truncated to this many tokens.
    """

    def __init__(self, df, max_len=96):
        self.df = df
        self.max_len = max_len
        self.labeled = 'selected_text' in df
        # Built from local files so no internet access is needed; with
        # internet use RobertaTokenizer.from_pretrained("roberta-base").
        self.tokenizer = RobertaTokenizer(vocab_file='../input/roberta-base/vocab.json',
                                          merges_file='../input/roberta-base/merges.txt')

    def __getitem__(self, index):
        """Return the tokenizer output for row ``index``, plus labels if any."""
        row = self.df.iloc[index]
        # Same normalisation as the label-building code: lower-case, collapse
        # whitespace, leading space. The sentiment word is appended at the
        # end, so token positions inside the tweet are not shifted.
        text_o = " " + " ".join((row.text + ' ' + row.sentiment).lower().split())
        # Every item is padded to max_len so equally sized tensors can be
        # batched. `padding='max_length'` replaces the deprecated
        # `pad_to_max_length=True`.
        data = self.tokenizer(text_o,
                              return_tensors='pt',
                              padding='max_length',
                              truncation=True,
                              max_length=self.max_len)

        # return_tensors='pt' yields a leading batch axis of size 1. Drop
        # exactly that axis so the DataLoader can add its own; a bare
        # squeeze() would also remove any other axis of size 1.
        for key in data.keys():
            data[key] = data[key].squeeze(0)

        if self.labeled:
            # Pre-computed on the DataFrame because locating the span for
            # every item on the fly is too slow.
            data['token_len'] = row.token_len
            data['start_idx'] = row.start_idx
            data['end_idx'] = row.end_idx

        return data

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

#==============================================================  
# auto batching is tricky when data are in different format, we could write a
# function to replace default collate_fn
def customer_batch(data):
    """Placeholder for a custom DataLoader ``collate_fn``.

    Not implemented: ignores ``data`` and returns None. The loaders in this
    notebook keep it commented out and use the default collation.
    """
    return None

#==============================================================    
        
def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    """Build the training and validation DataLoaders for one split.

    Args:
        df: labeled DataFrame accepted by ``TweetDataset``.
        train_idx: positional row indices of the training part.
        val_idx: positional row indices of the validation part.
        batch_size: batch size used by both loaders.

    Returns:
        dict with keys ``"train"`` (shuffled, incomplete last batch dropped)
        and ``"val"`` (original order, all rows kept).
    """
    def _loader(indices, training):
        # Only the training loader shuffles and drops the short final batch.
        options = dict(batch_size=batch_size, shuffle=training, num_workers=1)
        if training:
            options['drop_last'] = True
        #options['collate_fn'] = customer_batch
        return torch.utils.data.DataLoader(TweetDataset(df.iloc[indices]), **options)

    return {"train": _loader(train_idx, True), "val": _loader(val_idx, False)}

#==============================================================    

def get_test_loader(df, batch_size=32):
    """Return a DataLoader over ``df`` that keeps the row order.

    Args:
        df: DataFrame accepted by ``TweetDataset``.
        batch_size: number of rows per batch.
    """
    dataset = TweetDataset(df)
    #collate_fn=customer_batch is intentionally not used
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=batch_size,
                                       shuffle=False,
                                       num_workers=1)

In [None]:
"""Test the dataloaders
"""
# Scratch check of the loaders: build one stratified split, take the first
# training batch and decode a few of its token ids back to text.
# Needs `train_df_clean` from the TRAIN cells.
Test = False
i=1
if Test:
    sss = StratifiedShuffleSplit(n_splits=1, train_size=777*32, random_state=seed)
    for train_idx, val_idx in sss.split(train_df_clean, train_df_clean.sentiment):
        # print(train_idx, val_idx)
        
        data_loader = get_train_val_loaders(train_df_clean, train_idx, val_idx, batch_size=2)['train']
        for data in data_loader:
            # Counter kept for the optional print below; with the print
            # commented out it has no visible effect.
            if i < 2:
                #print(data)
                i += 1
                
            # decode converts token ids back to text: positions 1..4 of the
            # first item in the batch.
            tokenizer = RobertaTokenizer.from_pretrained("roberta-base")  
            print( tokenizer.decode(data['input_ids'][0][1:5]) )
            # Only the first batch is inspected.
            break


In [None]:
# Scratch arithmetic: 24887 is presumably the number of rows left after
# dropping 2594 of 27481; dividing by 32 gives the number of size-32 batches.
24887/32

# Model

In [None]:
class TweetModel(nn.Module):
    """Thin wrapper around ``RobertaForQuestionAnswering``.

    The weights and configuration are read from local files under
    ``../input/roberta-base`` so that no internet access is required
    (with internet: ``RobertaForQuestionAnswering.from_pretrained('roberta-base')``).
    """

    def __init__(self):
        super().__init__()
        roberta_config = RobertaConfig.from_pretrained('../input/roberta-base/config.json')
        self.roberta = RobertaForQuestionAnswering.from_pretrained(
            '../input/roberta-base/pytorch_model.bin', config=roberta_config)

    def forward(self, inputs):
        """Run the wrapped model on a mapping of keyword inputs.

        Args:
            inputs: mapping unpacked as keyword arguments of the wrapped model.

        Returns:
            tuple ``(start_logits, end_logits)`` taken from the model output.
        """
        qa_output = self.roberta(**inputs)
        start_logits = qa_output.start_logits
        end_logits = qa_output.end_logits
        return start_logits, end_logits


# Loss Function

In [None]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Sum of the cross-entropy losses for the start and the end position.

    Args:
        start_logits, end_logits: scores passed as the input of
            ``nn.CrossEntropyLoss`` (the original note gives their layout as
            batch * text_length).
        start_positions, end_positions: target indices, one per sample.

    Returns:
        Scalar tensor: start loss plus end loss, each averaged over the batch.
    """
    criterion = nn.CrossEntropyLoss()
    return criterion(start_logits, start_positions) + criterion(end_logits, end_positions)

def loss_fn1(start_logits, end_logits, start_positions, end_positions):
    """Cross-entropy span loss, down-weighted for long target spans.

    The idea: when the target span is long, an error of one position matters
    less, so each sample's loss is divided by its span length.

    Each sample's start and end losses are added, divided by the number of
    tokens in its target span (``|end - start| + 1``) and then averaged over
    the batch. Counting tokens rather than the raw difference keeps the
    divisor at least 1, so a one-token span (start == end) no longer divides
    by zero, and the result is a scalar that can be back-propagated directly.

    Args:
        start_logits, end_logits: scores passed as the input of
            ``nn.CrossEntropyLoss``.
        start_positions, end_positions: integer target indices, one per sample.

    Returns:
        Scalar tensor with the length-normalised loss.
    """
    ce_loss = nn.CrossEntropyLoss(reduction='none')
    per_sample = ce_loss(start_logits, start_positions) + ce_loss(end_logits, end_positions)
    span_len = (end_positions - start_positions).abs().float() + 1.0
    return (per_sample / span_len).mean()

- Jaccard distance and Binary Cross Entropy are similar

In [None]:
import numpy as np
import torch
import matplotlib.pyplot as plt


def jaccard_distance_loss(y_true, y_pred, smooth=1):
    """Smoothed Jaccard distance between two batches of vectors.

    Jaccard = |X & Y| / (|X| + |Y| - |X & Y|)
            = sum(|A*B|) / (sum(|A|) + sum(|B|) - sum(|A*B|))

    ``smooth`` is added to both numerator and denominator, and the resulting
    distance ``1 - jaccard`` is multiplied by ``smooth``.

    Ref: https://en.wikipedia.org/wiki/Jaccard_index

    Args:
        y_true, y_pred: tensors reduced along ``dim=1``.
        smooth: smoothing constant.

    Returns:
        Tensor with one loss value per row.
    """
    overlap = torch.abs(y_true * y_pred).sum(dim=1)
    total = (y_true.abs() + y_pred.abs()).sum(dim=1)
    jaccard = (overlap + smooth) / (total - overlap + smooth)
    return smooth * (1 - jaccard)


# Test and plot: Jaccard distance against an all-zero target.
y_pred = torch.from_numpy(np.array([np.arange(-10, 10+0.1, 0.1)]).T)
y_true = torch.from_numpy(np.zeros(y_pred.shape))
name='jaccard_distance_loss'
loss = jaccard_distance_loss(y_true,y_pred).numpy()
plt.title(name)
plt.plot(y_pred.numpy(),loss)
plt.xlabel('abs prediction error')
plt.ylabel('loss')
plt.show()

# Binary cross entropy against the same all-zero target.
# binary_cross_entropy takes (input, target) and needs input in [0, 1], so
# the curve is drawn over probabilities instead of the -10..10 range above.
name='binary cross entropy'
y_prob = torch.from_numpy(np.linspace(0.0, 0.99, 100).reshape(-1, 1))
y_zero = torch.zeros_like(y_prob)
loss = torch.nn.functional.binary_cross_entropy(
       y_prob, y_zero, reduction='none').mean(-1).numpy()
plt.title(name)
plt.plot(y_prob.numpy(),loss)
plt.xlabel('abs prediction error')
plt.ylabel('loss')
plt.show()

# Test: one row per case named in the header line.
print("TYPE                 |Almost_right |half right |extra selected |all_wrong")
y_true = torch.from_numpy(np.array([[0,0,1],[0,0,1],[0,0,1],[0,0,1.]]))
y_pred = torch.from_numpy(np.array([[0,0,0.9],[0,0,0.1],[1,1,1],[1,1,0]]))
r1 = jaccard_distance_loss(
    y_true,
    y_pred,).numpy()
print('jaccard_distance_loss',r1)
print('jaccard_distance_loss scaled',r1/r1.max())
assert r1[0]<r1[1]
assert r1[1]<r1[2]

# Predictions are the input and labels the target (they were swapped before).
r2 = torch.nn.functional.binary_cross_entropy(
    y_pred,
    y_true,
    reduction='none').mean(-1).numpy()
print('binary_crossentropy',r2)
print('binary_crossentropy_scaled',r2/r2.max())
assert r2[0]<r2[1]
assert r2[1]<r2[2]


# Evaluation Function

- If start_idx pred > end_idx pred: we will take the entire text as selected_text

In [None]:
def jaccard_score(text_token_nopadding_len, start_idx, end_idx, start_pred, end_pred):
    """Token-level Jaccard overlap between the true and the predicted span.

    Both spans are inclusive index ranges. An inverted prediction
    (``start_pred > end_pred``) is replaced by the range
    ``1 .. text_token_nopadding_len - 1``, i.e. the whole text.

    NOTE(review): if the length passed in counts the tokenizer's wrapping
    tokens, ``len - 1`` is the position of the closing token - confirm
    whether ``len - 2`` was intended.

    Args:
        text_token_nopadding_len: token count of the unpadded text.
        start_idx, end_idx: true span.
        start_pred, end_pred: predicted span (argmax of the model logits).

    Returns:
        0 when the spans do not overlap, otherwise intersection / union
        measured in tokens.
    """
    if start_pred > end_pred:
        start_pred, end_pred = 1, text_token_nopadding_len - 1

    overlaps = start_pred <= end_idx and start_idx <= end_pred
    if not overlaps:
        return 0

    covered = max(end_pred, end_idx) - min(start_pred, start_idx) + 1
    shared = min(end_pred, end_idx) - max(start_pred, start_idx) + 1
    return shared / covered
# Scratch comparison of how the Jaccard score and the cross-entropy loss
# react to sequence length. Disabled by default.
Test =False
if Test:
    # Inverted prediction (4 > 2) falls back to the whole text of length 5.
    jaccard_score(5,1,1,4,2) # 0.25
    # jaccard_score(96,1,1,4,2) # 0.0105

    # One sample, five positions; the highest logit is at position 4 while
    # the target is position 1.
    start_logits = torch.tensor([[0,0,0,0,1]]).float() 
    start_idx =torch.tensor([1])
    #start_pred = torch.cat((start_pred, torch.zeros(1,91)),axis=1)

    ce = torch.nn.CrossEntropyLoss()
    ce(start_logits, start_idx)
# when len=5, loss = 1.9048; when len=96, loss = 4.5718


- **Note**: 
1. jaccard_score is sensitive to total length, CrossEntropy is not sensitive.
2. Our jaccard_score function is a fast, close approximation of the true (character-level) Jaccard score used in this competition. Computing the character-level Jaccard would take a bit more computation.

# Training Function

In [None]:
def train_model(model, dataloaders_dict, criterion, optimizer, num_epochs, batch_size, filename):
    """Train and validate ``model`` for ``num_epochs`` epochs, then save it.

    Every epoch runs a ``'train'`` phase (gradients on, optimizer stepped)
    followed by a ``'val'`` phase (gradients off). After each phase the mean
    per-sample loss and the mean token-level Jaccard score are printed.

    Args:
        model: module called as ``model(inputs)`` that returns
            ``(start_logits, end_logits)``.
        dataloaders_dict: mapping with ``'train'`` and ``'val'`` loaders whose
            batches contain ``token_len``, ``start_idx`` and ``end_idx`` next
            to the model inputs.
        criterion: callable ``(start_logits, end_logits, start_idx, end_idx)``
            returning a scalar loss averaged over the batch.
        optimizer: optimizer over the model parameters.
        num_epochs: number of epochs to run.
        batch_size: kept for backward compatibility. The real size of each
            batch is read from the data, so a short final batch is handled.
        filename: path the final ``state_dict`` is written to.
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)

            epoch_loss = 0.0
            epoch_jaccard = 0.0
            n_seen = 0  # samples actually processed in this phase

            with tqdm(dataloaders_dict[phase], unit="batch") as tepoch:
                tepoch.set_description(f"Epoch {epoch+1}")
                for data in tepoch:
                    # Take the label fields out of the batch; what remains is
                    # passed to the model as its inputs.
                    token_len = data.pop('token_len').numpy()
                    start_idx = data.pop('start_idx')
                    end_idx = data.pop('end_idx')

                    # put data in GPU
                    if use_cuda:
                        start_idx = start_idx.cuda()
                        end_idx = end_idx.cuda()
                        for key in data.keys():
                            data[key] = data[key].cuda()

                    optimizer.zero_grad()
                    with torch.set_grad_enabled(is_train):
                        start_logits, end_logits = model(data)
                        loss = criterion(start_logits, end_logits, start_idx, end_idx)
                        if is_train:
                            loss.backward()
                            optimizer.step()

                    # The criterion returns a batch mean, so weight it by the
                    # real batch size before averaging over the epoch.
                    n_batch = start_logits.shape[0]
                    batch_loss = loss.item()
                    epoch_loss += batch_loss * n_batch
                    n_seen += n_batch

                    # Jaccard score on the argmax predictions
                    start_pred = torch.argmax(start_logits, dim=1).cpu().detach().numpy()
                    end_pred = torch.argmax(end_logits, dim=1).cpu().detach().numpy()
                    start_true = start_idx.cpu().detach().numpy()
                    end_true = end_idx.cpu().detach().numpy()

                    # Iterate over the rows really present: the last batch of
                    # a loader without drop_last can be shorter than batch_size.
                    for i in range(n_batch):
                        epoch_jaccard += jaccard_score(
                            token_len[i], start_true[i], end_true[i], start_pred[i], end_pred[i])

                    tepoch.set_postfix(loss=batch_loss)

            # Average over the samples seen; with drop_last this can be fewer
            # than len(dataset).
            denominator = max(n_seen, 1)
            epoch_loss = epoch_loss / denominator
            epoch_jaccard = epoch_jaccard / denominator

            print('Epoch {}/{} | {:^5} | Loss: {:.4f} | Jaccard: {:.4f}'.format(
                epoch + 1, num_epochs, phase, epoch_loss, epoch_jaccard))

    torch.save(model.state_dict(), filename)

# Training

In [None]:
# Settings for the cross-validation run: 5 epochs per fold, batches of 32,
# and a seeded, shuffled, stratified 5-fold splitter.
num_epochs = 5
batch_size = 32
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)


In [None]:
# Release cached, currently unused GPU memory before the training run.
torch.cuda.empty_cache() 

In [None]:
if TRAIN:
    %%time
    # Each fold takes 7* epochs = 35 mins,
    split_fold = False
    if split_fold:
        for fold, (train_idx, val_idx) in enumerate(skf.split(train_df_clean, train_df_clean.sentiment), start=1): 

            print(f'Fold: {fold}')

            model = TweetModel()
            optimizer = optim.AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.999))
            criterion = loss_fn    
            dataloaders_dict = get_train_val_loaders(train_df_clean, train_idx, val_idx, batch_size)
            train_model(
                model, 
                dataloaders_dict,
                criterion, 
                optimizer, 
                num_epochs,
                batch_size,
                f'roberta_fold{fold}.pth')

- We see an increase in validation loss after 2 epochs, so we train for only 2 epochs on the full data.

### run on the full training data

In [None]:
# Release cached, currently unused GPU memory before the training run.
torch.cuda.empty_cache() 

In [None]:
if TRAIN:
    %%time
    num_epochs = 2
    batch_size = 32
    split_fold = False
    if not split_fold:
        sss = StratifiedShuffleSplit(n_splits=1, train_size=776*32, random_state=seed)
        for train_idx, val_idx in sss.split(train_df_clean, train_df_clean.sentiment):
            dataloaders_dict = get_train_val_loaders(train_df_clean, train_idx, val_idx, batch_size)

        model = TweetModel()
        optimizer = optim.AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.999))
        criterion = loss_fn    

        train_model(
            model, 
            dataloaders_dict,
            criterion, 
            optimizer, 
            num_epochs,
            batch_size,
            f'roberta_whole.pth')

In [None]:
if TRAIN:
    # Drop the model left over from the previous training cell before a new
    # one is built.
    del model
    # Train one more epoch with a lower learning rate.
    sss = StratifiedShuffleSplit(n_splits=1, train_size=775*32, random_state=seed)
    train_idx, val_idx = next(sss.split(train_df_clean, train_df_clean.sentiment))
    dataloaders_dict = get_train_val_loaders(train_df_clean, train_idx, val_idx, batch_size)
    num_epochs = 1

    # Load the checkpoint onto whatever device is available, so the cell
    # also works on a CPU-only machine.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TweetModel()
    model.load_state_dict(
        torch.load('../input/tweetextraction/roberta_whole.pth', map_location=device))
    model.to(device)

    # Set explicitly so the cell does not rely on `criterion` left behind by
    # an earlier cell.
    criterion = loss_fn
    optimizer = optim.AdamW(model.parameters(), lr=3e-6, betas=(0.9, 0.999))

    train_model(
        model,
        dataloaders_dict,
        criterion,
        optimizer,
        num_epochs,
        batch_size,
        'roberta_whole2.pth')

# Inference

In [None]:
# For Inference only
# https://huggingface.co/transformers/internal/tokenization_utils.html

# Load the test tweets and wrap them in an ordered DataLoader.
test_df = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
test_df['text'] = test_df['text'].astype(str)
test_loader = get_test_loader(test_df)

# Restore the trained weights on the available device.
# NOTE(review): the GPU branch reads 'roberta_whole1.pth' while the CPU
# branch reads 'roberta_whole.pth' - confirm that two different checkpoints
# are intended.
model = TweetModel()
if torch.cuda.is_available():
    model.cuda()
    state_dict = torch.load('../input/tweetextraction/roberta_whole1.pth')
else:
    state_dict = torch.load('../input/tweetextraction/roberta_whole.pth',
                            map_location=torch.device('cpu'))
model.load_state_dict(state_dict)
model.eval()

In [None]:
%%time
predictions = []
# decode convert token ids to text
tokenizer = RobertaTokenizer(vocab_file='../input/roberta-base/vocab.json', 
                             merges_file='../input/roberta-base/merges.txt') 
with tqdm(test_loader, unit="batch") as tepoch:
    tepoch.set_description("Test:")
    for data in tepoch:
        
        # put data in GPU
        if torch.cuda.is_available():
            for key in data.keys():
                data[key]= data[key].cuda()

        # testing 
        with torch.no_grad():

            start_logits, end_logits = model(data)
            start_pred = torch.argmax(start_logits, dim=1)
            end_pred = torch.argmax(end_logits, dim=1)
            
            for i in range(start_pred.shape[0]): # number of rows in a batch
                if start_pred[i] > end_pred[i]:
                    predictions.append(' ') # those will be replace by text after we build the dataframe
                else:
                    sel_t = tokenizer.decode(data['input_ids'][i][start_pred[i]:end_pred[i]+1])
                    predictions.append(sel_t)


# Submission

In [None]:
# NOTE: `sub_df` is a second name for `test_df`, not a copy, so the column
# added here also appears on `test_df`.
sub_df = test_df # [['textID','text','sentiment']]
sub_df['selected_text'] = predictions
def rep_text(row):
    """Post-process one predicted span before it is written to the submission.

    Rules, applied in order:
      1. A blank prediction (including the ``' '`` placeholder used for
         unusable spans) or one longer than the tweet is replaced by the
         full text.
      2. A one-word prediction gets its repeated punctuation shortened and
         is returned.
      3. For neutral tweets the full text is returned.
      4. Otherwise the prediction is returned unchanged.

    Args:
        row: object with ``selected_text``, ``text`` and ``sentiment``.

    Returns:
        The string to submit for this row.
    """
    rst = row.selected_text
    # Compare by content, not identity: `rst is ' '` only worked because the
    # interpreter happens to reuse one object for that literal.
    if not rst.strip() or len(rst) > len(row.text):
        return row.text
    if len(rst.split()) == 1:
        rst = rst.replace('!!!!', '!')
        # NOTE(review): after '..' -> '.', a run of three dots has already
        # become two, so the '...' rule only fires for longer runs - confirm
        # the intended order.
        rst = rst.replace('..', '.')
        rst = rst.replace('...', '.')
        return rst
    if row.sentiment == 'neutral':
        rst = row.text
    return rst

# Post-process every prediction, then keep only the remaining columns for
# the submission file.
sub_df['selected_text'] = sub_df.apply(rep_text, axis=1)
# In-place drop: because `sub_df` was assigned from `test_df` without a copy,
# the same frame loses its 'text' and 'sentiment' columns under both names.
sub_df.drop(['text','sentiment'], axis=1, inplace=True)
sub_df.to_csv('submission.csv', index=False)
sub_df

In [4]:
# Load the raw training set for the URL exploration in the following cells.
train_df= pd.read_csv('../input/tweet-sentiment-extraction/train.csv')

In [16]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,url
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,0
2,088c60f138,my boss is bullying me...,bullying me,negative,0
3,9642c003ef,what interview! leave me alone,leave me alone,negative,0
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,0
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,0
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,0
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,0
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,0


In [14]:
# Cast both text columns to str so the regex search below can run on every row.
train_df['text'] = train_df['text'].astype(str)
train_df['selected_text'] = train_df['selected_text'].astype(str)

In [20]:
import re
# Matches an http:// or https:// URL up to the next whitespace character.
# Raw string: in a normal string literal "\s" is an invalid escape sequence.
regex = re.compile(r"(?P<url>https?://[^\s]+)")
# r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

def has_url(row):
    """Count in how many of ``text`` / ``selected_text`` a URL appears.

    Args:
        row: object with string attributes ``text`` and ``selected_text``.

    Returns:
        0 when neither contains a URL, 1 when exactly one does, 2 when both do.
    """
    in_text = regex.search(row.text) is not None
    in_selection = regex.search(row.selected_text) is not None
    return int(in_text) + int(in_selection)
      
# Per row: 0 = no URL, 1 = URL in exactly one of text / selected_text,
# 2 = URL in both.
train_df['url'] = train_df.apply(has_url, axis=1)
    

In [25]:
# only text has url
train_df[(train_df['url']==1) & (train_df['sentiment']=='neutral')]

Unnamed: 0,textID,text,selected_text,sentiment,url
35,4f5267ad70,"Thats it, its the end. Tears for Fears vs Eric...","Thats it, its the end. Tears for Fears",neutral,1
57,6086b1f016,will be back later. http://plurk.com/p/rp3k7,will be back later.,neutral,1
289,2dd0cddb70,I want it BACK NOW!: http://bit.ly/PP1WZ,I want it BACK NOW!:,neutral,1
404,ef200df027,The little wormy from labyrinth sadly passed a...,The little wormy from labyrinth sadly passed a...,neutral,1
410,41f4eb92b9,[-O] i wish the birthday massacre would come t...,i wish the birthday massacre would come to aus...,neutral,1
...,...,...,...,...,...
26911,a62400eeb0,"Trending you say? Well, there`s TweetCannon ...","Trending you say? Well, there`s",neutral,1
27036,d2e9eadb57,Ustream replay for Control Your Rankings. htt...,Ustream replay for Control Your Rankings.,neutral,1
27114,556d273874,http://tinyurl.com/cyonct vote for Rob,vote for Rob,neutral,1
27140,c28465b668,Google ... show me apples ... I only want to s...,Google ... show me apples ... I only want to s...,neutral,1


In [26]:
# both text and selected_text has url
train_df[(train_df['url']==2) & (train_df['sentiment']=='neutral')]

Unnamed: 0,textID,text,selected_text,sentiment,url
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral,2
50,a3ae670885,Then you should check out http://twittersucks...,Then you should check out http://twittersucks....,neutral,2
215,2dcd8766a8,yellow for ? http://blip.fm/~5z05g,yellow for ? http://blip.fm/~5z05g,neutral,2
258,251aa82551,Here are 4 FREE twitter tools will get you fol...,Here are 4 FREE twitter tools will get you fol...,neutral,2
341,c20cb786c4,"It`s a Peter & Gordon morning -> And I, go to...","It`s a Peter & Gordon morning -> And I, go to...",neutral,2
...,...,...,...,...,...
27168,cc07ca95d1,http://twitpic.com/2xjoc - can you paint me,http://twitpic.com/2xjoc - can you paint me,neutral,2
27170,b6dd20c81f,if you hit a car .. u should leave a note http...,if you hit a car .. u should leave a note http...,neutral,2
27231,d46f9acd72,just finished watching my copy of the Twilight...,just finished watching my copy of the Twilight...,neutral,2
27282,b5a9c83566,from last Montday`s award ceremony http://twi...,from last Montday`s award ceremony http://twi...,neutral,2
