# Roberta

In [None]:
!pip uninstall fastai -y

Found existing installation: fastai 1.0.61
Uninstalling fastai-1.0.61:
  Successfully uninstalled fastai-1.0.61


In [None]:
!pip install fastai==1.0.61

Collecting fastai==1.0.61
  Using cached fastai-1.0.61-py3-none-any.whl.metadata (14 kB)
Using cached fastai-1.0.61-py3-none-any.whl (239 kB)
Installing collected packages: fastai
Successfully installed fastai-1.0.61


In [None]:
from fastai.text import *
from fastai.metrics import *
from transformers import RobertaTokenizer

In [None]:
# Creating a config object to store task specific information
class Config(dict):
    """Dictionary of settings whose entries can also be read and written as attributes.

    The mapping itself is the single source of truth: `cfg.key`, `cfg['key']`
    and `cfg.set('key', value)` all operate on the same storage, so the
    attribute view and the item view can never drift apart.

    Note: a key that collides with a `dict` method name (e.g. ``items``) is
    only reachable through item access, because the method wins attribute lookup.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __getattr__(self, name):
        # Only invoked when regular attribute lookup fails, so dict methods
        # keep working; unknown names must raise AttributeError (not KeyError)
        # so that hasattr(), copy and pickle behave normally.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    def __setattr__(self, name, value):
        # Attribute assignment writes through to the mapping.
        self[name] = value

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

    def set(self, key, val):
        """Store `val` under `key` (equivalent to `self[key] = val`)."""
        self[key] = val

# Task-specific settings; entries are readable as attributes (config.bs) or keys (config['bs']).
config = Config(
    testing=False,  # when True, a later cell keeps only the first 240 rows
    seed = 2019,  # passed to split_by_rand_pct for the train/validation split
    roberta_model_name='roberta-base', # can also be exchanged with roberta-large
    max_lr=1e-5,
    epochs=1,
    use_fp16=False,
    bs=4,  # batch size handed to .databunch()
    max_seq_len=256,  # token budget per text, including the start/end tokens
    # NOTE(review): the label-inspection cell further down prints 5 distinct
    # 'Sentiment' values, so 2 looks inconsistent with this dataset. Confirm
    # against whatever code consumes num_labels (not visible in this section).
    num_labels = 2,
    hidden_dropout_prob=.05,
    hidden_size=768, # 1024 for roberta-large
    start_tok = "<s>",  # prepended to every token sequence by FastAiRobertaTokenizer
    end_tok = "</s>",  # appended to every token sequence by FastAiRobertaTokenizer
)

## Importamos los datos

In [None]:
import pandas as pd
# NOTE(review): despite its name, "test.csv" is the frame that is later split
# into training and validation sets via split_by_rand_pct.
# The path is relative to the notebook's working directory.
df = pd.read_csv("test.csv")

In [None]:
# Length of every post in *characters* (not tokens). The tokenizer's
# max_seq_len limit is applied to tokens, so the two numbers are not
# directly comparable.
content_lengths = df['Content'].str.len()
max_length = content_lengths.max()

print(f"The maximum length of the content is: {max_length}")

The maximum length of the content is: 6607


In [None]:
# In testing mode work on the first 240 rows only, to keep iterations fast.
if config.testing:
    df = df.iloc[:240]

print(df.shape)

(1200, 20)


In [None]:
# Preview the first rows to inspect the available columns.
# NOTE(review): the rendered output contains forum usernames and post text;
# consider clearing it before sharing the notebook.
df.head()

Unnamed: 0,Author,Content,Date,Linked Messages,Message ID,ThreadTitle,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,annotation_id,annotator,created_at,id,lead_time,sentiment,updated_at
0,EscapeFromLife,Is getting married before CTB a good idea? Per...,"Jul 8, 2024 at 6:12 PM",,2589889,Marriage before CTB,,,,,,,,7,1,2024-10-02T03:21:09.448199Z,1,6.880.799.999.999.990,"{""choices"":[""1 wish to be death?"",""2 thoughts ...",2024-10-02T03:21:09.448199Z
1,Traveller12724,"Bro you must be trolling, I mean who in their ...","Jul 8, 2024 at 10:05 PM",2589889.0,2590106,Marriage before CTB,,,,,,,,8,1,2024-10-02T03:21:25.669006Z,2,10.03,no enough information,2024-10-02T03:21:25.669006Z
2,Myforevercharlie,I think putting someone through that is fuckin...,"Jul 8, 2024 at 10:22 PM",,2590118,Marriage before CTB,,,,,,,,9,1,2024-10-02T03:26:36.227363Z,3,304.751,no enough information,2024-10-02T03:26:36.227363Z
3,rozeske,There are better ways for recovery that don't ...,"Jul 9, 2024 at 1:50 AM",,2590218,Marriage before CTB,,,,,,,,10,1,2024-10-02T03:26:55.136421Z,4,13.271,no enough information,2024-10-02T03:26:55.136421Z
4,who doesn't matter,"Oh no, never that. My first rule has been to n...","Jul 9, 2024 at 2:19 AM",,2590231,Marriage before CTB,,,,,,,,11,1,2024-10-02T03:27:13.489896Z,5,12.969,no enough information,2024-10-02T03:27:13.489896Z


In [None]:
# Column(s) holding the input text fed to the model.
feat_cols = ['Content']  # Replace with the correct column name if different
# 'Sentiment' (capital S) is the derived label column created further down by
# applying determine_sentiment to the raw lowercase 'sentiment' annotation column.
label_cols = ['Sentiment']

## Configuramos el Tokenizer

In [None]:
class FastAiRobertaTokenizer(BaseTokenizer):
    """Wrapper around RobertaTokenizer to be compatible with fastai

    Holds a pretrained tokenizer and a maximum sequence length; `tokenizer()`
    returns the sub-word tokens of a text wrapped in the start/end tokens
    taken from the notebook-level `config`.
    """
    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int=128, **kwargs):
        # Extra keyword arguments are accepted for signature compatibility and ignored.
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len
    def __call__(self, *args, **kwargs):
        # This object is passed as `tok_func`, which fastai is expected to *call*
        # to obtain a tokenizer; returning self lets the configured instance act
        # as its own factory.
        return self
    def tokenizer(self, t:str) -> List[str]:
        """Adds Roberta bos and eos tokens and limits the maximum sequence length"""
        # Two positions are reserved for the start/end tokens. Clamp at 0 so that a
        # max_seq_len below 2 yields no content tokens instead of a negative slice
        # bound (which would merely drop the last tokens and exceed the limit).
        content_budget = max(self.max_seq_len - 2, 0)
        content_tokens = self._pretrained_tokenizer.tokenize(t)[:content_budget]
        return [config.start_tok] + content_tokens + [config.end_tok]

In [None]:
# create fastai tokenizer for roberta
# Load the checkpoint named in the config (instead of a hard-coded string) so that
# switching `config.roberta_model_name` really switches the tokenizer as well.
roberta_tok = RobertaTokenizer.from_pretrained(config.roberta_model_name)

# Empty pre/post rule lists are passed explicitly so that the text reaches the
# Roberta tokenizer without any additional fastai rules applied.
fastai_tokenizer = Tokenizer(tok_func=FastAiRobertaTokenizer(roberta_tok, max_seq_len=config.max_seq_len),
                             pre_rules=[], post_rules=[])



In [None]:
# create fastai vocabulary for roberta
path = Path()
roberta_tok.save_vocabulary(path)

# Read the file from the directory it was saved to, and decode it explicitly as
# UTF-8: the vocabulary contains non-ASCII byte-level BPE symbols, so relying on
# the platform default encoding can fail.
with open(path/'vocab.json', 'r', encoding='utf-8') as f:
    roberta_vocab_dict = json.load(f)

# vocab.json maps token -> id. The fastai Vocab is built from a list in which the
# *position* of each token is its id, so order the tokens by id explicitly rather
# than trusting the key order of the saved file (which may be alphabetical).
fastai_roberta_vocab = Vocab(sorted(roberta_vocab_dict, key=roberta_vocab_dict.get))

In [None]:
# Setting up pre-processors
class RobertaTokenizeProcessor(TokenizeProcessor):
    """`TokenizeProcessor` configured not to add its own bos/eos markers."""

    def __init__(self, tokenizer):
        # Both flags are switched off here; the start/end tokens are added by
        # the tokenizer wrapper itself.
        super().__init__(include_eos=False, include_bos=False, tokenizer=tokenizer)

class RobertaNumericalizeProcessor(NumericalizeProcessor):
    """`NumericalizeProcessor` that stores the numericalized texts in an object array.

    The numericalized texts have different lengths. Building a regular NumPy
    array from such a ragged list raises
    "ValueError: setting an array element with a sequence ... inhomogeneous shape"
    on recent NumPy releases, which is the error recorded when the databunch is
    created in this notebook. `process` therefore fills a 1-D `dtype=object`
    array element by element.

    NOTE(review): `process` mirrors what `NumericalizeProcessor.process` is
    understood to do in fastai 1.0.61 (vocab handling + per-item `process_one`);
    re-check if the pinned fastai version changes.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def process(self, ds):
        """Numericalize every item of `ds` in place and attach the vocab to `ds`."""
        import numpy as np  # local import keeps this block self-contained

        # Same vocabulary handling as the parent: build one only if none was given.
        if self.vocab is None:
            self.vocab = Vocab.create(ds.items, self.max_vocab, self.min_freq)
        ds.vocab = self.vocab

        # One slot per text; each slot holds that text's own id array, so the
        # differing lengths never have to fit a rectangular shape.
        encoded = np.empty(len(ds.items), dtype=object)
        for position, item in enumerate(ds.items):
            encoded[position] = self.process_one(item)
        ds.items = encoded


def get_roberta_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Build the two-step preprocessing pipeline used for Roberta.

    Step 1 tokenizes with `tokenizer` without adding bos/eos markers (the
    tokenizer wrapper adds the start/end tokens itself); step 2 numericalizes
    with the supplied `vocab` so that ids line up with the pretrained model.

    Returns a list `[tokenize_processor, numericalize_processor]`.
    """
    tokenize_step = RobertaTokenizeProcessor(tokenizer=tokenizer)
    numericalize_step = RobertaNumericalizeProcessor(vocab=vocab)
    return [tokenize_step, numericalize_step]

## Configurando el DataBunch

In [None]:
# Creating a Roberta specific DataBunch class
class RobertaDataBunch(TextDataBunch):
    "`TextDataBunch` variant used to batch Roberta inputs for classification"
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs:int=None, pad_idx=1,
               pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False,
               dl_tfms:Optional[Collection[Callable]]=None, **dl_kwargs) -> DataBunch:
        """Turn the given datasets into a `DataBunch` for classification.

        The training loader uses a `SortishSampler` keyed on item length and drops
        the last incomplete batch; every other dataset gets a `SortSampler` keyed
        on item length with batch size `val_bs` (defaults to `bs`). Batches are
        padded by `pad_collate` using `pad_idx`, `pad_first` and `backwards`.
        `**dl_kwargs` are forwarded to every `DataLoader()`.
        """
        all_sets = cls._init_ds(train_ds, valid_ds, test_ds)
        train_set, eval_sets = all_sets[0], all_sets[1:]
        val_bs = ifnone(val_bs, bs)

        def _train_item_len(idx):
            # Length of the idx-th training input.
            return len(train_set[idx][0].data)

        def _eval_loader(eval_set):
            # Loader for a validation/test set, ordered by item length.
            sizes = [len(item) for item in eval_set.x.items]
            ordered = SortSampler(eval_set.x, key=sizes.__getitem__)
            return DataLoader(eval_set, batch_size=val_bs, sampler=ordered, **dl_kwargs)

        sortish = SortishSampler(train_set.x, key=_train_item_len, bs=bs)
        loaders = [DataLoader(train_set, batch_size=bs, sampler=sortish, drop_last=True, **dl_kwargs)]
        for eval_set in eval_sets:
            loaders.append(_eval_loader(eval_set))

        collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        return cls(*loaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)

In [None]:
class RobertaTextList(TextList):
    """`TextList` subclass wired to `RobertaDataBunch`."""
    # DataBunch class associated with this list — presumably the class that
    # fastai's data-block `.databunch()` call instantiates; confirm against the
    # pinned fastai version.
    _bunch = RobertaDataBunch
    # Default label class. NOTE(review): the data-block call in this notebook
    # passes label_cls=CategoryList explicitly, so this default is presumably
    # not the one in effect there — confirm.
    _label_cls = TextList

Hay que dividir la parte de sentimientos.

In [None]:
def determine_sentiment(annotation_data):
    """Collapse one raw annotation value into a single sentiment label.

    Parameters
    ----------
    annotation_data : str or missing value
        Either the plain text "no enough information" or a JSON object string
        of the form ``{"choices": [...]}``.

    Returns
    -------
    str
        The first label, in priority order, that is present among the choices:
        "high risk", "medium risk", "low risk", "method", "supportive",
        "suicide supportive". Falls back to "no enough information" when the
        value is missing, is not a string, mentions "no enough information",
        is not valid JSON, has no usable ``choices`` entry, or contains none of
        the known labels.
    """
    default_label = "no enough information"
    # Highest-priority label first.
    label_priority = (
        "high risk",
        "medium risk",
        "low risk",
        "method",
        "supportive",
        "suicide supportive",
    )

    # Covers NaN/None as well as any other non-text cell value.
    if not isinstance(annotation_data, str):
        return default_label
    # Handle direct mention (also when it appears inside a JSON payload).
    if default_label in annotation_data.lower():
        return default_label

    try:
        parsed = json.loads(annotation_data)
    except (json.JSONDecodeError, TypeError):
        return default_label  # Handle non-JSON cases

    # A payload that is not an object, or has no 'choices' key, carries no label.
    if not isinstance(parsed, dict):
        return default_label
    choices = parsed.get('choices')

    # A single choice given as a bare string must be matched as a whole:
    # substring matching would report "supportive" for "suicide supportive".
    if isinstance(choices, str):
        choices = [choices]
    elif not isinstance(choices, (list, tuple)):
        return default_label

    for label in label_priority:
        if label in choices:
            return label

    return default_label

# Derive the single-label 'Sentiment' column from the raw 'sentiment' annotations.
raw_annotations = df['sentiment']
df['Sentiment'] = raw_annotations.apply(determine_sentiment)

# Show the derived label next to the raw annotation for a quick sanity check.
preview_columns = ['Sentiment', 'sentiment']
print(df[preview_columns].head())

               Sentiment                                          sentiment
0              high risk  {"choices":["1 wish to be death?","2 thoughts ...
1  no enough information                              no enough information
2  no enough information                              no enough information
3  no enough information                              no enough information
4  no enough information                              no enough information


In [None]:
# Preview the frame again, now including the derived 'Sentiment' column.
# NOTE(review): the rendered output contains forum usernames and post text;
# consider clearing it before sharing the notebook.
df.head()

Unnamed: 0,Author,Content,Date,Linked Messages,Message ID,ThreadTitle,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 6,...,Unnamed: 8,Unnamed: 9,annotation_id,annotator,created_at,id,lead_time,sentiment,updated_at,Sentiment
0,EscapeFromLife,Is getting married before CTB a good idea? Per...,"Jul 8, 2024 at 6:12 PM",,2589889,Marriage before CTB,,,,,...,,,7,1,2024-10-02T03:21:09.448199Z,1,6.880.799.999.999.990,"{""choices"":[""1 wish to be death?"",""2 thoughts ...",2024-10-02T03:21:09.448199Z,high risk
1,Traveller12724,"Bro you must be trolling, I mean who in their ...","Jul 8, 2024 at 10:05 PM",2589889.0,2590106,Marriage before CTB,,,,,...,,,8,1,2024-10-02T03:21:25.669006Z,2,10.03,no enough information,2024-10-02T03:21:25.669006Z,no enough information
2,Myforevercharlie,I think putting someone through that is fuckin...,"Jul 8, 2024 at 10:22 PM",,2590118,Marriage before CTB,,,,,...,,,9,1,2024-10-02T03:26:36.227363Z,3,304.751,no enough information,2024-10-02T03:26:36.227363Z,no enough information
3,rozeske,There are better ways for recovery that don't ...,"Jul 9, 2024 at 1:50 AM",,2590218,Marriage before CTB,,,,,...,,,10,1,2024-10-02T03:26:55.136421Z,4,13.271,no enough information,2024-10-02T03:26:55.136421Z,no enough information
4,who doesn't matter,"Oh no, never that. My first rule has been to n...","Jul 9, 2024 at 2:19 AM",,2590231,Marriage before CTB,,,,,...,,,11,1,2024-10-02T03:27:13.489896Z,5,12.969,no enough information,2024-10-02T03:27:13.489896Z,no enough information


In [None]:
# Inspect the Python type and length of the first entries of each feature column.
def _describe_entry(value):
    """Return `(type, length)`; length is None for anything but a list or str."""
    size = len(value) if isinstance(value, (list, str)) else None
    return (type(value), size)

for col in feat_cols:
    print(f"Column: {col}")
    summary = df[col].apply(_describe_entry)
    print(summary.head(10))


Column: Content
0    (<class 'str'>, 592)
1    (<class 'str'>, 149)
2    (<class 'str'>, 109)
3     (<class 'str'>, 91)
4    (<class 'str'>, 206)
5    (<class 'str'>, 448)
6    (<class 'str'>, 155)
7     (<class 'str'>, 92)
8    (<class 'str'>, 254)
9    (<class 'str'>, 528)
Name: Content, dtype: object


In [None]:
# Convert lists to strings (if applicable)
def _as_text(value):
    """Return `value` as plain text.

    Lists are joined with single spaces. Missing values (None / float NaN)
    become the empty string, so the placeholder replacement in the next cell
    can handle them, instead of turning into the literal text "nan" or "None".
    Anything else goes through `str()`.
    """
    if isinstance(value, list):
        return ' '.join(value)
    # `value != value` is only true for NaN.
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return str(value)

for col in feat_cols:
    df[col] = df[col].apply(_as_text)


In [None]:
# Give every empty feature text a placeholder so that no input is blank.
placeholder_text = 'No content'
for feature_col in feat_cols:
    df[feature_col] = df[feature_col].replace('', placeholder_text)


In [None]:
# List the distinct values found in every label column.
for label_col in label_cols:
    print(f"Label Column: {label_col}")
    distinct_values = df[label_col].unique()
    print(distinct_values)

Label Column: Sentiment
['high risk' 'no enough information' 'medium risk' 'low risk' 'method']


In [None]:
# loading the tokenizer and vocab processors
processor = get_roberta_processor(tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)

# Check if feat_cols and label_cols are correctly defined
if not all(col in df.columns for col in feat_cols):
    raise ValueError(f"Feature columns {feat_cols} are not in the DataFrame.")
if not all(col in df.columns for col in label_cols):
    raise ValueError(f"Label columns {label_cols} are not in the DataFrame.")

# Create the databunch
# Padding must use the tokenizer's own <pad> id. For RoBERTa that id is 1, while
# id 0 is the <s> start token, so a hard-coded pad_idx=0 would fill every batch
# with start tokens. pad_first=False puts the padding after the text.
data = (RobertaTextList.from_df(df, ".", cols=feat_cols, processor=processor)
        .split_by_rand_pct(seed=config.seed)
        .label_from_df(cols=label_cols, label_cls=CategoryList)
        .databunch(bs=config.bs, pad_first=False, pad_idx=roberta_tok.pad_token_id)
       )

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (960,) + inhomogeneous part.

In [None]:
# Check for NaN values
print(df[label_cols].isnull().sum())

# Rows without a label cannot be used for supervised training, so drop them.
# (Filling the gaps with "" first, as was done before, turned the drop into a
# no-op and introduced a spurious empty-string class.)
df = df.dropna(subset=label_cols)  # Drop rows with NaNs in label_cols

Sentiment    0
dtype: int64
