In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [36]:
train = pd.read_csv("~/UDA_pytorch/data/raw_text/schizo2k_test.csv", sep="|")

In [8]:
unsup = pd.read_csv("~/UDA_pytorch/data/imdb_data_original/imdb_unsup_train.txt")

In [37]:
from sklearn.utils import shuffle
train = shuffle(train)
train.head()

Unnamed: 0.1,Unnamed: 0,sentence,label
1318,1451,"Hell, if i'm talking to myself in my head, I e...",1
345,1143,She pushes me away because she is scared of he...,0
865,247,We started talking a lot more in the new year.,0
1319,2010,We did eventually move in.,0
1405,1310,"At the time, my parents attributed it to a bad...",1


In [6]:
print(train['ori_sent'][10]) # original sentence
print(tokenizer.tokenize(train['ori_sent'][10]))
print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train['ori_sent'][10])))

I've been visiting him at the hospital everyday(which is about an hour commute one way), and is with him for all of visiting hours.
['i', "'", 've', 'been', 'visiting', 'him', 'at', 'the', 'hospital', 'everyday', '(', 'which', 'is', 'about', 'an', 'hour', 'com', '##mute', 'one', 'way', ')', ',', 'and', 'is', 'with', 'him', 'for', 'all', 'of', 'visiting', 'hours', '.']
[1045, 1005, 2310, 2042, 5873, 2032, 2012, 1996, 2902, 10126, 1006, 2029, 2003, 2055, 2019, 3178, 4012, 26746, 2028, 2126, 1007, 1010, 1998, 2003, 2007, 2032, 2005, 2035, 1997, 5873, 2847, 1012]


In [16]:
tokenizer.cls_token, tokenizer.cls_token_id

('[CLS]', 101)

In [17]:
tokenizer.sep_token, tokenizer.sep_token_id

('[SEP]', 102)

In [27]:
print(len(train))
train.dropna(inplace=True)
print(len(train))

41186
41179


In [21]:
# calculating length of the longest text
# ori_input_ids\tori_input_mask\tori_input_type_ids\taug_input_ids\taug_input_mask\taug_input_type_ids

max_len = 0
for text in train['sentence']:
    if text.isnull():
        # print("nan")
        # Tokenize the text and add special tokens i.e `[CLS]` and `[SEP]`
        input_ids = tokenizer.encode(text, add_special_tokens=True)

    max_len = max(max_len, len(input_ids))


print('Max length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (787 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1307 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (744 > 512). Running this sequence through the model will result in indexing errors


Max length:  1309


In [38]:
text = train['sentence'].values
label = train['label'].values

input_ids = []
input_masks = []
input_type_ids = []

# Original Text Encode
for i in range(len(text)):
    ori_encoded = tokenizer.encode_plus(
      text[i],
      add_special_tokens=True,
      max_length=128,
      pad_to_max_length=True,
      return_token_type_ids=True,
      return_attention_mask=True,
      truncation=True
    )
    
    input_ids.append(ori_encoded['input_ids'])
    input_masks.append(ori_encoded['attention_mask'])
    input_type_ids.append(ori_encoded['token_type_ids'])

# # Augmented Text Encode
# for i in range(len(aug_text)):
#     aug_encoded = tokenizer.encode_plus(
#       aug_text[i],
#       add_special_tokens=True,
#       max_length=512,
#       pad_to_max_length=True,
#       return_token_type_ids=True,
#       return_attention_mask=True,
#       truncation=True
#     )
    
#     aug_input_ids.append(aug_encoded['input_ids'])
#     aug_input_masks.append(aug_encoded['attention_mask'])
#     aug_input_type_ids.append(aug_encoded['token_type_ids'])


print('Original text: ',text[10])
print(len(input_ids))
print(input_ids[10])
print(input_masks[10])
print(input_type_ids[10])
# print('Augmented text: ',aug_text[10])
# print(len(aug_input_ids))
# print(aug_input_ids[10])
# print(aug_input_masks[10])
# print(aug_input_type_ids[10])


Original text:  I worry that shes going to end up in the hospital all summer which is really unfortunate.
1631
[101, 1045, 4737, 2008, 2016, 2015, 2183, 2000, 2203, 2039, 1999, 1996, 2902, 2035, 2621, 2029, 2003, 2428, 15140, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [39]:
result_df = pd.DataFrame(columns = ['input_ids', 'input_mask', 'input_type_ids'])
result_df["input_ids"] = input_ids
result_df["input_mask"] = input_masks
result_df["input_type_ids"] = input_type_ids

In [40]:
labels = []
for i in range(len(label)):
    labels.append(label[i])
result_df.insert(3,'label_ids',label)
result_df["label_ids"]

0       1
1       0
2       0
3       0
4       1
       ..
1626    0
1627    0
1628    1
1629    0
1630    1
Name: label_ids, Length: 1631, dtype: int64

In [41]:
result_df

Unnamed: 0,input_ids,input_mask,input_type_ids,label_ids
0,"[101, 3109, 1010, 2065, 1045, 1005, 1049, 3331...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
1,"[101, 2016, 13956, 2033, 2185, 2138, 2016, 200...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
2,"[101, 2057, 2318, 3331, 1037, 2843, 2062, 1999...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,"[101, 2057, 2106, 2776, 2693, 1999, 1012, 102,...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
4,"[101, 2012, 1996, 2051, 1010, 2026, 3008, 7108...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
...,...,...,...,...
1626,"[101, 4066, 1997, 2066, 1037, 5351, 3168, 2828...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1627,"[101, 2065, 1045, 2064, 1005, 1056, 2831, 2000...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1628,"[101, 2016, 2052, 2298, 2012, 2477, 1998, 2360...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
1629,"[101, 2026, 2783, 20992, 2562, 2033, 2013, 210...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


In [42]:
result_df.to_csv("data/schizo_200_train128.csv",index = False)

In [42]:
check = pd.read_csv("data/schizo_processed_test.csv", sep=',')


In [44]:
check.iloc[:, -1].value_counts()

0    316
1     92
Name: label_ids, dtype: int64

In [14]:
less.to_csv("data/schizo_processed_unlabel_small.csv")