In [None]:
import torch
import random
import numpy as np

RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
import re
import html

def spec_add_spaces(t: str) -> str:
    "Add spaces around / and # in `t`. \n"
    return re.sub(r"([/#\n])", r" \1 ", t)

def rm_useless_spaces(t: str) -> str:
    "Remove multiple spaces in `t`."
    return re.sub(" {2,}", " ", t)

def replace_multi_newline(t: str) -> str:
    return re.sub(r"(\n(\s)*){2,}", "\n", t)

def fix_html(x: str) -> str:
    "List of replacements from html strings in `x`."
    re1 = re.compile(r"  +")
    x = (
        x.replace("#39;", "'")
        .replace("amp;", "&")
        .replace("#146;", "'")
        .replace("nbsp;", " ")
        .replace("#36;", "$")
        .replace("\\n", "\n")
        .replace("quot;", "'")
        .replace("<br />", "\n")
        .replace('\\"', '"')
        .replace(" @.@ ", ".")
        .replace(" @-@ ", "-")
        .replace(" @,@ ", ",")
        .replace("\\", " \\ ")
    )
    return re1.sub(" ", html.unescape(x))

def clean_text(input_text):
    text = fix_html(input_text)
    text = replace_multi_newline(text)
    text = spec_add_spaces(text)
    text = rm_useless_spaces(text)
    text = text.strip()
    return text

In [None]:
import pandas as pd
import nltk
nltk.download('punkt')
from tqdm import tqdm

df = pd.read_csv("KickstarterData.csv")
df = df.dropna(axis = 0)
df["tokenized"] = df["name"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))
df = df[(df["state"] == "successful") |(df["state"] == "failed")]
df["hit"] = np.where(df["state"] == "successful", 1, 0)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [None]:
print(df[df["hit"] == 1].shape)
print(df[df["hit"] == 0].shape)

(133851, 17)
(197611, 17)


In [None]:
from collections import Counter
PADDING_VALUE = 0
UNK_VALUE     = 1

def split_train_val_test(df, props=[.8, .1, .1]):
    assert round(sum(props), 2) == 1 and len(props) >= 2
    train_df, test_df, val_df = None, None, None

    first_partition = int(df.shape[0] * props[0])
    second_partition = int(df.shape[0] * (props[0] + props[1]))

    train_df = df.iloc[:first_partition,:]
    val_df = df.iloc[first_partition:second_partition,:]
    test_df = df.iloc[second_partition:,:]
    
    return train_df, val_df, test_df

def generate_vocab_map(df, cutoff=2):
    vocab          = {"": PADDING_VALUE, "UNK": UNK_VALUE}
    reversed_vocab = None

    cnt = {}
    for tokens in df["tokenized"]:
      for word in tokens:
        if (word in cnt):
          cnt[word] = cnt[word] + 1
        else:
          cnt[word] = 1

    reversed_vocab = {}
    reversed_vocab[PADDING_VALUE] = ""
    reversed_vocab[UNK_VALUE] = "UNK"
    int_ID = 2
    for word in cnt:
      if (cnt[word] > cutoff):
        vocab[word] = int_ID
        reversed_vocab[int_ID] = word
        int_ID = int_ID + 1
    
    return vocab, reversed_vocab

In [None]:
#tst = df[["name","hit"]]

In [None]:
#tst.columns = ["text", "labels"]

In [None]:
#df = tst.copy()
#df = df.sample(frac = 0.1)

In [None]:
#train_df = df[:int(0.8*len(df))]

In [None]:
#test_df = df[int(0.8*len(df)):]

In [None]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,tokenized,hit
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,"[the, songs, of, adelaide, &, abullah]",0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,"[greeting, from, earth, :, zgac, arts, capsule...",0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,"[where, is, hank, ?]",0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,"[toshicapital, rekordz, needs, help, to, compl...",0
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0,"[monarch, espresso, bar]",1


In [None]:
df["cat"] = df.category.astype('category').cat.codes
df["cat_bigger"] = df.main_category.astype('category').cat.codes

In [None]:
df                         = df.sample(frac=0.2)
train_df, val_df, test_df  = split_train_val_test(df, props=[.8, .1, .1])
train_vocab, reverse_vocab = generate_vocab_map(train_df)

In [None]:
from torch.utils.data import Dataset

class WordDataset(Dataset):
    
    def __init__(self, vocab, df, max_length=50):
        self.vocab = vocab
        self.df = df
        self.max_length = max_length

        return
    
    # return the length of the dataframe instance variable
    def __len__(self):      
        df_len = self.df.shape[0]
        return df_len

    def __getitem__(self, index: int):
        self.df = self.df.reset_index(drop=True)
        for i in range(len(self.df["tokenized"][index])):
          if not(self.df["tokenized"][index][i] in self.vocab):
            self.df["tokenized"][index][i] = "UNK"

        mapped = []
        for word in self.df["tokenized"][index]:
          if(len(mapped) < self.max_length):
            mapped.append(self.vocab[word])
        mapped.append(self.df["cat"][index]) #added
        mapped.append(self.df["cat_bigger"][index])
        tokenized_word_tensor = torch.LongTensor(mapped)
        tokenized_word_tensor = tokenized_word_tensor.to(torch.device(device))

        curr_label = self.df["hit"][index]

        return tokenized_word_tensor, curr_label


In [None]:
from torch.utils.data import RandomSampler

train_dataset = WordDataset(train_vocab, train_df)
val_dataset   = WordDataset(train_vocab, val_df)
test_dataset  = WordDataset(train_vocab, test_df)
 
train_sampler = RandomSampler(train_dataset)
val_sampler   = RandomSampler(val_dataset)
test_sampler  = RandomSampler(test_dataset)

In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch, padding_value=PADDING_VALUE):
    padded_tokens, y_labels = None, None

    ptlist = [] # padded_tokens list version
    ylist = []
    for tup in batch:
      ptlist.append(tup[0])
      ylist.append(tup[1])
    padded_tokens = pad_sequence(ptlist, batch_first = True, padding_value = padding_value)
    y_labels = torch.LongTensor(ylist)
    
    return padded_tokens, y_labels

In [None]:
from torch.utils.data import DataLoader
BATCH_SIZE = 16

train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
val_iterator   = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn)
test_iterator  = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

In [None]:
import torch.nn as nn

class NBOW(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lin_sig = nn.Sequential(
            nn.Linear(embedding_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        embedded = self.embedding_layer(x)
        mean_embed = torch.mean(embedded, axis = 1)
        x_res = self.lin_sig(mean_embed)

        return x_res

In [None]:
model = NBOW(vocab_size = len(train_vocab.keys()), embedding_dim = 300).to(device)

In [None]:
from torch.optim import Adam

criterion = nn.BCELoss().to(torch.device(device))
optimizer = Adam(model.parameters(), lr = 0.003)

In [None]:
def train_loop(model, criterion, optim, iterator):
    model.train()
    total_loss = 0
    for x, y in tqdm(iterator):

        optim.zero_grad()
        loss = criterion(model.forward(x).reshape(y.shape), y.float().to(torch.device(device)))
        total_loss += loss
        loss.backward()
        optim.step()

    return total_loss

def val_loop(model, iterator):
    true, pred = [], []

    for x, y in tqdm(iterator):
      for tv in y:
        if (tv == 0):
          true.append(False)
        elif (tv == 1):
          true.append(True)
      p = model.forward(x)
      for pv in p:
        if (pv <= 0.5):
          pred.append(False)
        else:
          pred.append(True)

    return true, pred

In [None]:
def accuracy(true, pred):
    acc = None

    same = 0
    for i in range(len(true)):
      if(true[i] == pred[i]):
        same += 1
    acc = same / len(true)

    return acc

def binary_f1(true, pred, selected_class=True):
    f1 = None

    tp = 0 # True Positive
    fp = 0 # False Positive. Actually negative, but predicted to positive
    fn = 0 # False Negative. Actually positive, but predicted to negative
    tn = 0 # True Negative.

    for i in range(len(true)):
      if(true[i] == True) and (pred[i] == True):
        tp = tp + 1
      elif(true[i] == True) and (pred[i] == False):
        fn = fn + 1
      elif(true[i] == False) and (pred[i] == True):
        fp = fp + 1
      elif(true[i] == False) and (pred[i] == False):
        tn = tn + 1
    
    if(selected_class == True):
      precision = tp / (tp + fp + 1)
      recall = tp / (tp + fn + 1)
    elif(selected_class == False):
      precision = tn / (tn + fn + 1)
      recall = tn / (tn + fp + 1)

    f1 = 2 * (precision*recall) / (precision + recall + 1e-10) #prevent zerodivision

    return f1

def binary_macro_f1(true, pred):
    averaged_macro_f1 = 0.5*(binary_f1(true, pred, selected_class=True) + binary_f1(true, pred, selected_class=False))
    return averaged_macro_f1

In [None]:
TOTAL_EPOCHS = 5
for epoch in range(TOTAL_EPOCHS):
    train_loss = train_loop(model, criterion, optimizer, train_iterator)
    true, pred = val_loop(model, val_iterator)
    print(f"EPOCH: {epoch}")
    print(f"TRAIN LOSS: {train_loss}")
    print(f"VAL F-1: {binary_macro_f1(true, pred)}")
    print(f"VAL ACC: {accuracy(true, pred)}")

100%|██████████| 3315/3315 [06:28<00:00,  8.53it/s]
100%|██████████| 415/415 [00:09<00:00, 43.64it/s]


EPOCH: 0
TRAIN LOSS: 2065.3154296875
VAL F-1: 0.6195548392874008
VAL ACC: 0.6595263237290693


100%|██████████| 3315/3315 [06:39<00:00,  8.30it/s]
100%|██████████| 415/415 [00:09<00:00, 43.31it/s]


EPOCH: 1
TRAIN LOSS: 1921.59033203125
VAL F-1: 0.6320852542013189
VAL ACC: 0.6605822899381506


100%|██████████| 3315/3315 [06:16<00:00,  8.80it/s]
100%|██████████| 415/415 [00:09<00:00, 43.55it/s]


EPOCH: 2
TRAIN LOSS: 1848.453369140625
VAL F-1: 0.6279432041308404
VAL ACC: 0.662090813093981


100%|██████████| 3315/3315 [06:05<00:00,  9.06it/s]
100%|██████████| 415/415 [00:09<00:00, 44.20it/s]


EPOCH: 3
TRAIN LOSS: 1801.611083984375
VAL F-1: 0.6207871814713869
VAL ACC: 0.6550007542615779


  5%|▌         | 175/3315 [00:19<05:45,  9.08it/s]


KeyboardInterrupt: ignored

In [None]:
df[df["hit"] == 1].shape

(26802, 18)

In [None]:
df.shape

(33146, 17)

# LSTM model

In [None]:
class RecurrentModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, \
                 num_layers=1, bidirectional=True):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = num_layers, bidirectional = bidirectional)
        self.lin_sig = nn.Sequential(
            nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = torch.transpose(embedded, 0, 1)

        outputs, (h, c) = self.lstm(embedded)

        predictions = self.lin_sig(outputs)
        predictions = torch.mean(predictions, axis = 0)
        predictions = predictions.reshape((-1,))

        return predictions
    

In [None]:
train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
val_iterator   = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn)
test_iterator  = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

In [None]:
lstm_model = RecurrentModel(vocab_size    = len(train_vocab.keys()),
                            embedding_dim = 300,
                            hidden_dim    = 300,
                            num_layers    = 1,
                            bidirectional = True).to(device)

In [None]:
lstm_criterion = nn.BCELoss().to(torch.device(device))
lstm_optimizer = Adam(lstm_model.parameters(), lr=0.003)

In [None]:
TOTAL_EPOCHS = 30
for epoch in range(TOTAL_EPOCHS):
    train_loss = train_loop(lstm_model, lstm_criterion, lstm_optimizer, train_iterator)
    true, pred = val_loop(lstm_model, val_iterator)
    print(f"EPOCH: {epoch}")
    print(f"TRAIN LOSS: {train_loss}")
    print(f"VAL F-1: {binary_macro_f1(true, pred)}")
    print(f"VAL ACC: {accuracy(true, pred)}")

100%|██████████| 829/829 [00:22<00:00, 37.00it/s]
100%|██████████| 104/104 [00:01<00:00, 103.97it/s]


EPOCH: 0
TRAIN LOSS: 202.73963928222656
VAL F-1: 0.5612465571150241
VAL ACC: 0.5890162945081473


100%|██████████| 829/829 [00:21<00:00, 38.58it/s]
100%|██████████| 104/104 [00:01<00:00, 103.96it/s]


EPOCH: 1
TRAIN LOSS: 176.99790954589844
VAL F-1: 0.5570932224552927
VAL ACC: 0.5757392878696439


100%|██████████| 829/829 [00:21<00:00, 38.13it/s]
100%|██████████| 104/104 [00:00<00:00, 105.33it/s]


EPOCH: 2
TRAIN LOSS: 148.0076141357422
VAL F-1: 0.5913347117337466
VAL ACC: 0.6089318044659022


100%|██████████| 829/829 [00:21<00:00, 38.83it/s]
100%|██████████| 104/104 [00:01<00:00, 103.34it/s]


EPOCH: 3
TRAIN LOSS: 116.02542114257812
VAL F-1: 0.570062569416818
VAL ACC: 0.5853952926976463


100%|██████████| 829/829 [00:21<00:00, 38.67it/s]
100%|██████████| 104/104 [00:00<00:00, 104.28it/s]


EPOCH: 4
TRAIN LOSS: 114.57599639892578
VAL F-1: 0.5692616393572696
VAL ACC: 0.5835847917923959


100%|██████████| 829/829 [00:21<00:00, 38.25it/s]
100%|██████████| 104/104 [00:01<00:00, 103.04it/s]


EPOCH: 5
TRAIN LOSS: 109.88056945800781
VAL F-1: 0.5627677338951279
VAL ACC: 0.5757392878696439


100%|██████████| 829/829 [00:21<00:00, 38.22it/s]
100%|██████████| 104/104 [00:01<00:00, 103.62it/s]


EPOCH: 6
TRAIN LOSS: 96.72039031982422
VAL F-1: 0.5564831419772354
VAL ACC: 0.5709112854556427


100%|██████████| 829/829 [00:21<00:00, 38.45it/s]
100%|██████████| 104/104 [00:00<00:00, 104.07it/s]


EPOCH: 7
TRAIN LOSS: 88.50750732421875
VAL F-1: 0.5560377820600144
VAL ACC: 0.5691007845503923


100%|██████████| 829/829 [00:21<00:00, 38.82it/s]
100%|██████████| 104/104 [00:00<00:00, 104.70it/s]


EPOCH: 8
TRAIN LOSS: 88.19039154052734
VAL F-1: 0.5724203019245465
VAL ACC: 0.5817742908871455


100%|██████████| 829/829 [00:21<00:00, 38.36it/s]
100%|██████████| 104/104 [00:00<00:00, 104.17it/s]


EPOCH: 9
TRAIN LOSS: 94.32628631591797
VAL F-1: 0.5618495186917283
VAL ACC: 0.5775497887748944


100%|██████████| 829/829 [00:22<00:00, 37.34it/s]
100%|██████████| 104/104 [00:01<00:00, 102.53it/s]


EPOCH: 10
TRAIN LOSS: 86.54547882080078
VAL F-1: 0.5536117620522354
VAL ACC: 0.5727217863608932


100%|██████████| 829/829 [00:21<00:00, 38.44it/s]
100%|██████████| 104/104 [00:00<00:00, 104.17it/s]


EPOCH: 11
TRAIN LOSS: 82.20784759521484
VAL F-1: 0.5611278392756858
VAL ACC: 0.5781532890766445


100%|██████████| 829/829 [00:21<00:00, 38.67it/s]
100%|██████████| 104/104 [00:00<00:00, 104.70it/s]


EPOCH: 12
TRAIN LOSS: 77.23480987548828
VAL F-1: 0.5519436505417814
VAL ACC: 0.5654797827398914


100%|██████████| 829/829 [00:21<00:00, 38.04it/s]
100%|██████████| 104/104 [00:00<00:00, 105.33it/s]


EPOCH: 13
TRAIN LOSS: 79.37205505371094
VAL F-1: 0.5680330307491273
VAL ACC: 0.5872057936028968


100%|██████████| 829/829 [00:21<00:00, 38.52it/s]
100%|██████████| 104/104 [00:00<00:00, 104.49it/s]


EPOCH: 14
TRAIN LOSS: 73.75206756591797
VAL F-1: 0.5802852323449981
VAL ACC: 0.592033796016898


100%|██████████| 829/829 [00:21<00:00, 38.48it/s]
100%|██████████| 104/104 [00:00<00:00, 104.59it/s]


EPOCH: 15
TRAIN LOSS: 77.7711181640625
VAL F-1: 0.570062569416818
VAL ACC: 0.5853952926976463


100%|██████████| 829/829 [00:21<00:00, 38.70it/s]
100%|██████████| 104/104 [00:00<00:00, 104.46it/s]


EPOCH: 16
TRAIN LOSS: 71.74169921875
VAL F-1: 0.5715324488644089
VAL ACC: 0.5793602896801449


100%|██████████| 829/829 [00:21<00:00, 38.71it/s]
100%|██████████| 104/104 [00:00<00:00, 105.22it/s]


EPOCH: 17
TRAIN LOSS: 68.90348815917969
VAL F-1: 0.5630341416363337
VAL ACC: 0.5908267954133977


100%|██████████| 829/829 [00:21<00:00, 38.54it/s]
100%|██████████| 104/104 [00:00<00:00, 104.80it/s]


EPOCH: 18
TRAIN LOSS: 76.56021881103516
VAL F-1: 0.564257626608835
VAL ACC: 0.5805672902836452


100%|██████████| 829/829 [00:21<00:00, 38.98it/s]
100%|██████████| 104/104 [00:00<00:00, 105.22it/s]


EPOCH: 19
TRAIN LOSS: 70.5243911743164
VAL F-1: 0.5687935252652673
VAL ACC: 0.5811707905853953


100%|██████████| 829/829 [00:21<00:00, 38.73it/s]
100%|██████████| 104/104 [00:00<00:00, 104.42it/s]


EPOCH: 20
TRAIN LOSS: 80.88124084472656
VAL F-1: 0.5747989987774398
VAL ACC: 0.5866022933011467


100%|██████████| 829/829 [00:21<00:00, 38.66it/s]
100%|██████████| 104/104 [00:01<00:00, 103.35it/s]


EPOCH: 21
TRAIN LOSS: 71.0270004272461
VAL F-1: 0.5678670580816212
VAL ACC: 0.587809293904647


100%|██████████| 829/829 [00:24<00:00, 34.01it/s]
100%|██████████| 104/104 [00:01<00:00, 101.54it/s]


EPOCH: 22
TRAIN LOSS: 60.01771545410156
VAL F-1: 0.5571721469524127
VAL ACC: 0.5847917923958962


100%|██████████| 829/829 [00:25<00:00, 32.24it/s]
100%|██████████| 104/104 [00:01<00:00, 100.07it/s]


EPOCH: 23
TRAIN LOSS: 67.66627502441406
VAL F-1: 0.5714256956875071
VAL ACC: 0.5829812914906457


100%|██████████| 829/829 [00:25<00:00, 31.93it/s]
100%|██████████| 104/104 [00:01<00:00, 95.49it/s]


EPOCH: 24
TRAIN LOSS: 70.61997985839844
VAL F-1: 0.5621489267072921
VAL ACC: 0.5835847917923959


100%|██████████| 829/829 [00:24<00:00, 33.71it/s]
100%|██████████| 104/104 [00:01<00:00, 100.56it/s]


EPOCH: 25
TRAIN LOSS: 71.51390075683594
VAL F-1: 0.5713693444191751
VAL ACC: 0.5859987929993965


100%|██████████| 829/829 [00:24<00:00, 33.96it/s]
100%|██████████| 104/104 [00:01<00:00, 100.85it/s]


EPOCH: 26
TRAIN LOSS: 69.64594268798828
VAL F-1: 0.563822548654362
VAL ACC: 0.5793602896801449


100%|██████████| 829/829 [00:24<00:00, 34.19it/s]
100%|██████████| 104/104 [00:01<00:00, 101.93it/s]


EPOCH: 27
TRAIN LOSS: 64.70840454101562
VAL F-1: 0.5647992529845645
VAL ACC: 0.5739287869643935


100%|██████████| 829/829 [00:24<00:00, 33.99it/s]
100%|██████████| 104/104 [00:01<00:00, 102.03it/s]


EPOCH: 28
TRAIN LOSS: 64.3019027709961
VAL F-1: 0.5607618424877509
VAL ACC: 0.5805672902836452


100%|██████████| 829/829 [00:24<00:00, 34.09it/s]
100%|██████████| 104/104 [00:01<00:00, 102.04it/s]

EPOCH: 29
TRAIN LOSS: 67.53488159179688
VAL F-1: 0.5501870155121271
VAL ACC: 0.5654797827398914





In [None]:
true, pred = val_loop(lstm_model, test_iterator)
print()
print(f"TEST F-1: {binary_macro_f1(true, pred)}")
print(f"TEST ACC: {accuracy(true, pred)}")

100%|██████████| 104/104 [00:01<00:00, 90.21it/s]


TEST F-1: 0.5677108202195298
TEST ACC: 0.5916767189384801





# Transformer

In [None]:
!pip install wandb
!pip install simpletransformers



In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [None]:
model_args = ClassificationArgs(num_train_epochs=5)

# Create a ClassificationModel
model = ClassificationModel(
    "roberta", "roberta-base", args=model_args
)

# Train the model
model.train_model(train_df)

# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(test_df)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/2652 [00:00<?, ?it/s]



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/332 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/332 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/332 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/332 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/332 [00:00<?, ?it/s]

  0%|          | 0/663 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/83 [00:00<?, ?it/s]

In [None]:
result

{'mcc': 0.2036990814017142,
 'tp': 165,
 'tn': 236,
 'fp': 148,
 'fn': 114,
 'auroc': 0.6203843712664278,
 'auprc': 0.5543358636748549,
 'eval_loss': 1.3236708993832749}