In [2]:
import numpy as np
import pandas as pd

In [25]:
data = pd.concat([pd.read_csv('data/roasts.csv'), pd.read_csv('data/toasts.csv')])

In [26]:
data['sub'].value_counts()

ToastMe    26748
RoastMe    18121
Name: sub, dtype: int64

In [32]:
(data['Text'].apply(lambda x: len(x.split()))>100).sum()

1474

In [41]:
data = data.loc[~(data['Text'].apply(lambda x: len(x.split()))>100)]

In [18]:
len(data['Text'].apply(lambda x: x.split('. ')))

44869

In [22]:
data2['Text'] = data['Text'].apply(lambda x: x.split('. '))

In [24]:
data.explode('Text')['sub'].value_counts()

ToastMe    66744
RoastMe    23451
Name: sub, dtype: int64

In [42]:
data2 = data.copy()

In [60]:
data2['Text'] = data2['Text'].apply(lambda x: x.split('. ')).apply(lambda xy: [x.split('\n') for x in xy])

In [61]:
data2 = data2.explode('Text')

In [62]:
data2.dropna(inplace=True)

In [63]:
data2[data2['sub']=='ToastMe']

Unnamed: 0,Text,Score,sub
1,[That is one of my favorite hairstyles],2032,ToastMe
1,"[It’s simple, sophisticated and timeless]",2032,ToastMe
1,"[, , Your skin is flawless]",2032,ToastMe
1,"[, , I’m sorry that you are getting bullied]",2032,ToastMe
1,[It DOES get better after high school.],2032,ToastMe
...,...,...,...
26743,[You’re absolutely gorgeous],-16,ToastMe
26744,"[I respect you, but Covid 19 is fake]",-16,ToastMe
26745,[50 days of what? You look like you're in 6th grade.],-18,ToastMe
26746,[Is this a r/roast me or not? Wtf?],-20,ToastMe


In [64]:
df = pd.concat([data2[data2['sub']=='RoastMe'], data2[data2['sub']=='ToastMe'].sample(25000)])

In [65]:
df['sub'].value_counts()

ToastMe    25000
RoastMe    22486
Name: sub, dtype: int64

In [66]:
df['sub'] = (df['sub'] == 'RoastMe').astype('int')

In [83]:
df['Text'] = df['Text'].apply(lambda x: x[0])

In [84]:
X = df['Text']
y = df['sub']

In [69]:
from sklearn.model_selection import train_test_split

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [71]:
from tensorflow import keras as kr

In [85]:
tok = kr.preprocessing.text.Tokenizer()

In [95]:
tok.fit_on_texts(X_train)

In [96]:
vocsize = len(tok.word_index)

In [97]:
seqlen = X.apply(lambda x: len(x.split())).max()

In [98]:
seqlen

100

In [99]:
vec = kr.layers.TextVectorization(max_tokens=vocsize+1, output_sequence_length=100)

In [100]:
vec.adapt(X_train)

In [103]:
X_train = vec(X_train)
X_test = vec(X_test)

In [104]:
model = kr.models.Sequential()

model.add(kr.layers.Embedding(vocsize+1, seqlen, input_length=seqlen))
model.add(kr.layers.GlobalAveragePooling1D())
model.add(kr.layers.Dropout(0.5))
model.add(kr.layers.Dense(seqlen*2, activation=kr.activations.relu))
model.add(kr.layers.Dropout(0.5))
model.add(kr.layers.Dense(seqlen*3, activation=kr.activations.relu))
model.add(kr.layers.Dropout(0.5))
model.add(kr.layers.Dense(1, activation=kr.activations.sigmoid))

model.compile(optimizer=kr.optimizers.Adam(learning_rate=3e-4), loss=kr.losses.binary_crossentropy,
              metrics=kr.metrics.binary_accuracy)

In [105]:
early = kr.callbacks.EarlyStopping(monitor='val_binary_accuracy', mode='max', patience=20, restore_best_weights=True)

In [106]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          5098400   
                                                                 
 global_average_pooling1d (G  (None, 100)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 200)               20200     
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense_1 (Dense)             (None, 300)               60300     
                                                        

In [116]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=200, callbacks=[early])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200


In [117]:
y_hat = model.predict(X_test)



In [276]:
print('Accuracy: {}'.format(round((y_hat.reshape(-1).round() == y_test).mean()*100, 2)))

Accuracy: 83.9


In [216]:
model.save('model2.h5', save_format='h5')