In [1]:
%load_ext autoreload
%autoreload 2

In [31]:
import os
from datetime import datetime
import re
import sys
sys.path.append('../src/')

import fasttext
import matplotlib.pyplot as plt
import numpy as np
np.set_printoptions(precision=3)
import pandas as pd
import seaborn as sns
sns.set_theme()
from transformers import BertTokenizer, BertModel

from dataset_processor import DataProcessor

In [3]:
BASE_DATA_PATH = "/data/ddmg/personalizedmentalhealth/reddit_project/data"

# Load Data and Apply Tokenization

In [6]:
data_paths = [os.path.join(BASE_DATA_PATH, "4_all_data.csv")]

In [7]:
df_list = []
for data_path in data_paths:
    df = pd.read_csv(data_path)
    df.drop(columns="Unnamed: 0", inplace=True)
    df_list.append(df)
df = pd.concat(df_list)

In [8]:
df

Unnamed: 0,id,author,data_split,subreddit,text,created_utc
0,2rrm7n,Fearlessfight,train,family,My Mum is acting really weird. This is weird b...,1.420742e+09
1,2rgc78,AstroKate,train,family,Awkward Situations w/ My Dad? Hi Everyone! I'm...,1.420501e+09
2,2rdr14,bigguytx,train,family,Just found out my father isn't my father (afte...,1.420442e+09
3,2rbrfq,staceyastacey,train,family,Our mother left her family A public letter to ...,1.420402e+09
4,2r6afm,itlikesmenot,train,family,"Sister's shit of a ""man"" So my sister has 2 ki...",1.420260e+09
...,...,...,...,...,...,...
1379454,knek0k,90dayfinancee,train,personalfinance,"HELP! ""Large"" savings account, pay down debt o...",1.609374e+09
1379455,kneht6,callmeqws,train,personalfinance,Help me understand my statement balance? [http...,1.609374e+09
1379456,kneh32,DogtorPepper,train,personalfinance,HSA Account Fees I have an old HSA from old em...,1.609374e+09
1379457,knedb8,apenguin7,train,personalfinance,Am I (son) responsible for my deceased father'...,1.609373e+09


In [9]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [36]:
example_batch = list(df['text'].values[0:2])
example_batch

["My Mum is acting really weird. This is weird but I wondered if I could have some help. My mum has been acting strange for a while now. She just doesn't smile anymore and has a blank facial expression. She does suffer with insomnia and she has been very and I mean very stressed. We do have to move house soon and we are lacking money but her behaviour is still really peculiar for her. She doesn't listen to things I say and she has been judging my opinions and also laughing when it's not funny and not laughing when it is funny. It's really worrying me, as I'm only 14. But what do I do? It's so horrible",
 'Awkward Situations w/ My Dad? Hi Everyone! I\'m Jay 13 years old, I have a problem with my dad. I always feel awkward when we\'ll watch tv, or see magazines, etc. For example if "Dancing w/ The Stars" is on the woman will have skimpy clothing on and, I feel my face getting hot, and I get embarrassed. Or what I have a bigger problem with is when we\'re at walmart and going through the 

In [11]:
tokenizer.model_max_length

512

In [37]:
encoded = tokenizer(example_batch)
encoded

{'input_ids': [[101, 2026, 12954, 2003, 3772, 2428, 6881, 1012, 2023, 2003, 6881, 2021, 1045, 4999, 2065, 1045, 2071, 2031, 2070, 2393, 1012, 2026, 12954, 2038, 2042, 3772, 4326, 2005, 1037, 2096, 2085, 1012, 2016, 2074, 2987, 1005, 1056, 2868, 4902, 1998, 2038, 1037, 8744, 13268, 3670, 1012, 2016, 2515, 9015, 2007, 16021, 5358, 6200, 1998, 2016, 2038, 2042, 2200, 1998, 1045, 2812, 2200, 13233, 1012, 2057, 2079, 2031, 2000, 2693, 2160, 2574, 1998, 2057, 2024, 11158, 2769, 2021, 2014, 9164, 2003, 2145, 2428, 14099, 2005, 2014, 1012, 2016, 2987, 1005, 1056, 4952, 2000, 2477, 1045, 2360, 1998, 2016, 2038, 2042, 13325, 2026, 10740, 1998, 2036, 5870, 2043, 2009, 1005, 1055, 2025, 6057, 1998, 2025, 5870, 2043, 2009, 2003, 6057, 1012, 2009, 1005, 1055, 2428, 15366, 2033, 1010, 2004, 1045, 1005, 1049, 2069, 2403, 1012, 2021, 2054, 2079, 1045, 2079, 1029, 2009, 1005, 1055, 2061, 9202, 102], [101, 9596, 8146, 1059, 1013, 2026, 3611, 1029, 7632, 3071, 999, 1045, 1005, 1049, 6108, 2410, 2086, 2214

In [41]:
len(encoded['input_ids'][0])

145

In [28]:
len(encoded['attention_mask'])

145

In [40]:
tokenizer.convert_ids_to_tokens(encoded['input_ids'][1])

['[CLS]',
 'awkward',
 'situations',
 'w',
 '/',
 'my',
 'dad',
 '?',
 'hi',
 'everyone',
 '!',
 'i',
 "'",
 'm',
 'jay',
 '13',
 'years',
 'old',
 ',',
 'i',
 'have',
 'a',
 'problem',
 'with',
 'my',
 'dad',
 '.',
 'i',
 'always',
 'feel',
 'awkward',
 'when',
 'we',
 "'",
 'll',
 'watch',
 'tv',
 ',',
 'or',
 'see',
 'magazines',
 ',',
 'etc',
 '.',
 'for',
 'example',
 'if',
 '"',
 'dancing',
 'w',
 '/',
 'the',
 'stars',
 '"',
 'is',
 'on',
 'the',
 'woman',
 'will',
 'have',
 'ski',
 '##mp',
 '##y',
 'clothing',
 'on',
 'and',
 ',',
 'i',
 'feel',
 'my',
 'face',
 'getting',
 'hot',
 ',',
 'and',
 'i',
 'get',
 'embarrassed',
 '.',
 'or',
 'what',
 'i',
 'have',
 'a',
 'bigger',
 'problem',
 'with',
 'is',
 'when',
 'we',
 "'",
 're',
 'at',
 'wal',
 '##mart',
 'and',
 'going',
 'through',
 'the',
 'check',
 'out',
 'line',
 ',',
 'there',
 "'",
 's',
 'usually',
 '"',
 'maxim',
 '"',
 'magazines',
 'there',
 'which',
 'really',
 'makes',
 'me',
 'embarrassed',
 '/',
 'awkward',


In [46]:
encoded = tokenizer(example_batch, padding='longest', return_tensors='pt')
encoded

{'input_ids': tensor([[  101,  2026, 12954,  2003,  3772,  2428,  6881,  1012,  2023,  2003,
          6881,  2021,  1045,  4999,  2065,  1045,  2071,  2031,  2070,  2393,
          1012,  2026, 12954,  2038,  2042,  3772,  4326,  2005,  1037,  2096,
          2085,  1012,  2016,  2074,  2987,  1005,  1056,  2868,  4902,  1998,
          2038,  1037,  8744, 13268,  3670,  1012,  2016,  2515,  9015,  2007,
         16021,  5358,  6200,  1998,  2016,  2038,  2042,  2200,  1998,  1045,
          2812,  2200, 13233,  1012,  2057,  2079,  2031,  2000,  2693,  2160,
          2574,  1998,  2057,  2024, 11158,  2769,  2021,  2014,  9164,  2003,
          2145,  2428, 14099,  2005,  2014,  1012,  2016,  2987,  1005,  1056,
          4952,  2000,  2477,  1045,  2360,  1998,  2016,  2038,  2042, 13325,
          2026, 10740,  1998,  2036,  5870,  2043,  2009,  1005,  1055,  2025,
          6057,  1998,  2025,  5870,  2043,  2009,  2003,  6057,  1012,  2009,
          1005,  1055,  2428, 15366,  

In [47]:
encoded['input_ids'].shape

torch.Size([2, 382])

In [49]:
encoded['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0

In [32]:
model = BertModel.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [48]:
output = model(**encoded)
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 3.7899e-01, -1.4712e-02,  2.7151e-01,  ..., -3.3954e-02,
           4.5202e-01, -8.3304e-02],
         [ 2.1995e-01,  5.1819e-02, -2.1898e-01,  ...,  3.0398e-01,
           7.8992e-01,  3.4174e-01],
         [ 6.9810e-01,  2.8701e-04,  4.5882e-01,  ..., -3.7120e-01,
           3.5956e-02,  1.5919e-01],
         ...,
         [ 1.4884e-01, -7.5779e-01,  9.9337e-01,  ...,  1.2291e-01,
           1.3551e-01, -4.3027e-01],
         [ 3.0356e-01,  2.1034e-01,  1.6035e-01,  ..., -3.2684e-02,
           1.0484e-01,  2.6159e-01],
         [ 4.1300e-01,  2.2131e-01,  2.3085e-01,  ...,  1.1983e-01,
           9.8253e-02,  7.9160e-02]],

        [[-3.5623e-02, -4.4165e-01,  4.0529e-01,  ..., -4.3983e-01,
           6.4437e-01,  2.9590e-01],
         [-1.4378e+00,  7.0333e-01,  3.4683e-01,  ..., -5.8992e-01,
           3.5098e-01, -1.7803e-01],
         [-1.3149e+00,  1.2391e+00,  6.2656e-02,  ..., -7.4146e-01,
          -2.

In [51]:
idx = 0
for text in df['text'].to_numpy():
    encoded = tokenizer(text)
    if len(encoded['input_ids']) > 512:
        print(idx)
        break
    idx += 1

Token indices sequence length is longer than the specified maximum sequence length for this model (715 > 512). Running this sequence through the model will result in indexing errors


2


In [57]:
text_ex = df['text'].to_numpy()[2]
text_ex

'Just found out my father isn\'t my father (after 28 years) So, yesterday I was helping my mother with her email problem on her laptop when I ran across an email from one of her long-time friends who she recently hasn\'t been seeing eye-to-eye with. I normally don\'t bother reading people\'s email and this is especially the case with my mother. However, when archiving all of her old emails I ran across an emailed that was titled with my name. \n\nI started to read and it was an angry email directed at mother. It started off talking about how cold and calculating she was. It then went on to say that she couldn\'t believe how my mother could lie to her to her own flesh and blood. **What I read next completely paralyzed me. It said that she had been lying to me my entire life about my real father who died and how my real father was mentally ill. It also mentioned that I was mentally ill and how it probably ran in the family.** I was devastated. You mean, the guy i\'ve been calling dad my 

In [62]:
encoded = tokenizer(text_ex)
len(encoded['input_ids'])

715

In [59]:
tokenizer.convert_ids_to_tokens(encoded['input_ids'])

['[CLS]',
 'just',
 'found',
 'out',
 'my',
 'father',
 'isn',
 "'",
 't',
 'my',
 'father',
 '(',
 'after',
 '28',
 'years',
 ')',
 'so',
 ',',
 'yesterday',
 'i',
 'was',
 'helping',
 'my',
 'mother',
 'with',
 'her',
 'email',
 'problem',
 'on',
 'her',
 'laptop',
 'when',
 'i',
 'ran',
 'across',
 'an',
 'email',
 'from',
 'one',
 'of',
 'her',
 'long',
 '-',
 'time',
 'friends',
 'who',
 'she',
 'recently',
 'hasn',
 "'",
 't',
 'been',
 'seeing',
 'eye',
 '-',
 'to',
 '-',
 'eye',
 'with',
 '.',
 'i',
 'normally',
 'don',
 "'",
 't',
 'bother',
 'reading',
 'people',
 "'",
 's',
 'email',
 'and',
 'this',
 'is',
 'especially',
 'the',
 'case',
 'with',
 'my',
 'mother',
 '.',
 'however',
 ',',
 'when',
 'arch',
 '##iving',
 'all',
 'of',
 'her',
 'old',
 'emails',
 'i',
 'ran',
 'across',
 'an',
 'email',
 '##ed',
 'that',
 'was',
 'titled',
 'with',
 'my',
 'name',
 '.',
 'i',
 'started',
 'to',
 'read',
 'and',
 'it',
 'was',
 'an',
 'angry',
 'email',
 'directed',
 'at',
 'mot

In [66]:
t_encoded = tokenizer(text_ex, truncation=True)
len(t_encoded['input_ids'])

512

In [67]:
tokenizer.convert_ids_to_tokens(t_encoded['input_ids'])

['[CLS]',
 'just',
 'found',
 'out',
 'my',
 'father',
 'isn',
 "'",
 't',
 'my',
 'father',
 '(',
 'after',
 '28',
 'years',
 ')',
 'so',
 ',',
 'yesterday',
 'i',
 'was',
 'helping',
 'my',
 'mother',
 'with',
 'her',
 'email',
 'problem',
 'on',
 'her',
 'laptop',
 'when',
 'i',
 'ran',
 'across',
 'an',
 'email',
 'from',
 'one',
 'of',
 'her',
 'long',
 '-',
 'time',
 'friends',
 'who',
 'she',
 'recently',
 'hasn',
 "'",
 't',
 'been',
 'seeing',
 'eye',
 '-',
 'to',
 '-',
 'eye',
 'with',
 '.',
 'i',
 'normally',
 'don',
 "'",
 't',
 'bother',
 'reading',
 'people',
 "'",
 's',
 'email',
 'and',
 'this',
 'is',
 'especially',
 'the',
 'case',
 'with',
 'my',
 'mother',
 '.',
 'however',
 ',',
 'when',
 'arch',
 '##iving',
 'all',
 'of',
 'her',
 'old',
 'emails',
 'i',
 'ran',
 'across',
 'an',
 'email',
 '##ed',
 'that',
 'was',
 'titled',
 'with',
 'my',
 'name',
 '.',
 'i',
 'started',
 'to',
 'read',
 'and',
 'it',
 'was',
 'an',
 'angry',
 'email',
 'directed',
 'at',
 'mot

In [70]:
import torch
torch.cuda.is_available()

False

In [71]:
full_encoded = tokenizer(list(df['text'][:10]), truncation=True)

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.