In [None]:
import pandas as pd

In [None]:

df = pd.read_csv('data/reddit_200k_train.csv', encoding='ISO-8859-1') 
df

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.body.duplicated(keep=False).sum()

In [None]:

df.loc[df.body.duplicated(keep=False)].head(30)

In [None]:

df['parent_id.x'].duplicated(keep=False).sum()

In [None]:

df['id'].duplicated(keep=False).sum()

In [None]:
df.REMOVED.value_counts()

In [None]:

pd.set_option('max_colwidth', 400)
df.head(30)

In [None]:

# Correlate only numeric/boolean columns. Text columns such as `body` make
# DataFrame.corr() raise in pandas >= 2.0, and selecting the dtypes explicitly
# works the same way on older pandas versions.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
df.isna().sum()

In [None]:
df_clean = df.drop(['Unnamed: 0', 'score.x', 'parent_id.x', 'id', 'created_utc.x', 'retrieved_on'], axis=1)
df_clean.head()

In [None]:
df_clean.body = df_clean.body.str.lower()
df_clean.head()

In [None]:
from nltk.tokenize import RegexpTokenizer

# Token pattern: runs of two or more word characters between word boundaries,
# so single-character tokens are not produced.
pattern = r"(?u)\b\w\w+\b"
tokenizer = RegexpTokenizer(pattern)
# Store the token list of every comment body in a new `tokenized` column.
df_clean['tokenized'] = df_clean['body'].apply(tokenizer.tokenize)
df_clean.head()

In [None]:
from nltk import FreqDist
import matplotlib.pyplot as plt


def visualize_top_10(freq_dist, title):
    """Draw a bar chart of the ten most frequent tokens of a distribution.

    Parameters
    ----------
    freq_dist : object exposing ``most_common(n)`` (e.g. ``nltk.FreqDist``)
        Token counts to summarise.
    title : str
        Title placed above the chart.

    Returns
    -------
    None
        The figure is created through matplotlib as a side effect.
    """
    top_items = freq_dist.most_common(10)

    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.set_ylabel("Count")

    # An empty distribution has nothing to unzip; leave an empty, titled chart
    # instead of failing with an IndexError.
    if top_items:
        tokens, counts = zip(*top_items)
        ax.bar(tokens, counts)

    ax.tick_params(axis="x", rotation=90)
    


visualize_top_10(FreqDist(df_clean['tokenized'].explode()), 'word freq, train set')

In [None]:
def clean_input(input_df):
    """Return a cleaned copy of a raw comment frame; the input is not modified.

    Steps:
      1. drop the metadata columns that are not used for modelling,
      2. lower-case the ``body`` text,
      3. remove rows whose body contains the boilerplate text
         'your submission has been removed for the following',
      4. add a ``tokenized`` column with the word tokens of each body.
    """
    unused_columns = ['Unnamed: 0', 'score.x', 'parent_id.x', 'id', 'created_utc.x', 'retrieved_on']
    removal_notice = 'your submission has been removed for the following'

    cleaned = input_df.drop(columns=unused_columns)
    cleaned['body'] = cleaned['body'].str.lower()

    # Drop the boilerplate rows by index label.
    is_notice = cleaned['body'].str.contains(removal_notice)
    cleaned = cleaned.drop(index=cleaned.loc[is_notice].index)

    # Tokens are runs of two or more word characters.
    word_tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")
    cleaned['tokenized'] = cleaned['body'].apply(word_tokenizer.tokenize)

    return cleaned

train = clean_input(pd.read_csv('data/reddit_200k_train.csv', encoding='ISO-8859-1'))
test = clean_input(pd.read_csv('data/reddit_200k_test.csv', encoding='ISO-8859-1'))

In [None]:
train.head()

In [None]:
test.head()

In [None]:


df.loc[df['body'].str.contains('your submission has been removed for the following')]

In [None]:


# Features are the lower-cased comment bodies; the target is the REMOVED flag.
X_train = train['body']
y_train = train['REMOVED']
X_test = test['body']
y_test = test['REMOVED']

from sklearn.feature_extraction.text import TfidfVectorizer

# Baseline vectorizer limited to 10 features (max_features=10).
tfidf = TfidfVectorizer(max_features=10)

# Fit the vocabulary on the training text only and transform it in one step.
X_train_vectorized = tfidf.fit_transform(X_train)

In [None]:
pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names_out())

In [None]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
baseline_model = MultinomialNB()


baseline_cv = cross_val_score(baseline_model, X_train_vectorized, y_train)
baseline_cv

In [None]:

y_train.value_counts(normalize=True)

In [None]:


from nltk.corpus import stopwords

stopwords_list = stopwords.words('english')

def remove_stopwords(token_list, stopwords=None):
    """Return the tokens of ``token_list`` that are not stop words.

    Parameters
    ----------
    token_list : iterable of str
        Tokens of one document.
    stopwords : iterable of str, optional
        Words to discard. Defaults to the notebook-level ``stopwords_list``.

    Returns
    -------
    list of str
        The kept tokens, in their original order.
    """
    # Hash-based membership instead of scanning the whole list for every token.
    excluded = set(stopwords_list if stopwords is None else stopwords)
    return [token for token in token_list if token not in excluded]



x_wo_stop = df_clean['tokenized'].apply(remove_stopwords)

visualize_top_10(FreqDist(x_wo_stop.explode()), 'top 10 w/o stop')

In [None]:


tfidf = TfidfVectorizer(
    max_features=10,
    stop_words=stopwords_list
)

X_train_vectorized = tfidf.fit_transform(X_train)

base_remove_stop = MultinomialNB()

remove_stop_cv = cross_val_score(base_remove_stop, X_train_vectorized, y_train)
remove_stop_cv

In [None]:
print(baseline_cv.mean())
print(remove_stop_cv.mean())

In [None]:


from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language="english")

def stem_and_tokenize(document):
    """Tokenize ``document`` with the notebook-level ``tokenizer`` and return
    the Snowball stem of every token, in order."""
    stems = []
    for word in tokenizer.tokenize(document):
        stems.append(stemmer.stem(word))
    return stems

stemmed_stopwords = [stemmer.stem(word) for word in stopwords_list]

tfidf = TfidfVectorizer(
    max_features=10, 
    stop_words=stemmed_stopwords, 
    tokenizer=stem_and_tokenize)

X_train_vectorized = tfidf.fit_transform(X_train)

In [None]:

base_stem = MultinomialNB()

stemmed_cv = cross_val_score(base_stem, X_train_vectorized, y_train)
stemmed_cv

In [None]:
print(baseline_cv.mean())
print(remove_stop_cv.mean())
print(stemmed_cv.mean())

In [None]:


from nltk.tokenize import sent_tokenize
train['sent_token'] = train['body'].apply(lambda x: len(sent_tokenize(x)))

In [None]:
X_train_sent = train.drop('REMOVED', axis=1)
X_train_sent

In [None]:

# Larger vocabulary (500 features) built from stemmed tokens, with the stemmed
# stop words removed.
tfidf = TfidfVectorizer(
    max_features=500,
    stop_words=stemmed_stopwords,
    tokenizer=stem_and_tokenize
)

X_train_vectorized = tfidf.fit_transform(X_train_sent["body"])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported accessor and is already used earlier in this notebook.
X_train_vectorized_df = pd.DataFrame(
    X_train_vectorized.toarray(),
    columns=tfidf.get_feature_names_out(),
)


In [None]:
# `train` keeps its original row labels (with gaps where clean_input dropped
# rows), while X_train_vectorized_df has a fresh 0..n-1 index. Assigning the
# Series directly aligns on labels, which puts values on the wrong rows and
# leaves NaNs at the end. Both objects have one row per training comment in the
# same order, so assign by position instead.
X_train_vectorized_df['sent_token'] = train['sent_token'].to_numpy()
X_train_vectorized_df

In [None]:
X_train_vectorized_df.loc[X_train_vectorized_df['sent_token'].isna(), 'sent_token'] = 0

In [None]:
final_NB = MultinomialNB()

final_NB.fit(X_train_vectorized_df, y_train)
final_NB.score(X_train_vectorized_df, y_train)

In [None]:

train.loc[train['body'].str.contains('has been removed because')]

In [None]:
import gensim
import pandas as pd
pd.set_option('max_colwidth', 400)
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
import numpy as np
#nltk.download('wordnet')

In [None]:


stopwords_list = stopwords.words('english')

def remove_stopwords(input_list, stopwords=None):
    """Return the words of ``input_list`` that are not stop words.

    Parameters
    ----------
    input_list : iterable of str
        Tokens of one document.
    stopwords : iterable of str, optional
        Words to discard. Defaults to the notebook-level ``stopwords_list``.

    Returns
    -------
    list of str
        The kept words, in their original order.
    """
    # Hash-based membership instead of scanning the whole list for every word.
    excluded = set(stopwords_list if stopwords is None else stopwords)
    return [word for word in input_list if word not in excluded]

def clean_input(input_df):
    """Return a cleaned, feature-enriched copy of a raw comment frame.

    The input frame is not modified. Processing steps:
      1. drop the metadata columns that are not used for modelling,
      2. lower-case the ``body`` text,
      3. remove rows whose body contains either boilerplate phrase
         ('your submission has been removed for the following',
         'has been removed because'),
      4. add ``tokenized`` (word tokens), ``no_stop`` (tokens without stop
         words), ``lemma`` (WordNet lemmas) and ``clean_string`` (lemmas joined
         with single spaces).
    """
    unused_columns = ['Unnamed: 0', 'score.x', 'parent_id.x', 'id', 'created_utc.x', 'retrieved_on']
    boilerplate_phrases = (
        'your submission has been removed for the following',
        'has been removed because',
    )

    cleaned = input_df.drop(columns=unused_columns)
    cleaned['body'] = cleaned['body'].str.lower()

    # Remove the boilerplate rows one phrase at a time, by index label.
    for phrase in boilerplate_phrases:
        has_phrase = cleaned['body'].str.contains(phrase)
        cleaned = cleaned.drop(index=cleaned.loc[has_phrase].index)

    # Tokens are runs of two or more word characters.
    word_tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    cleaned['tokenized'] = cleaned['body'].apply(word_tokenizer.tokenize)
    cleaned['no_stop'] = cleaned['tokenized'].apply(remove_stopwords)
    cleaned['lemma'] = cleaned['no_stop'].apply(
        lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
    )
    cleaned['clean_string'] = cleaned['lemma'].apply(lambda lemmas: ' '.join(lemmas))

    return cleaned

train = clean_input(pd.read_csv('data/reddit_200k_train.csv', encoding='ISO-8859-1'))
test = clean_input(pd.read_csv('data/reddit_200k_test.csv', encoding='ISO-8859-1'))

In [None]:
train

In [None]:
#train.to_csv('data/train_clean.csv')
#test.to_csv('data/test_clean.csv')

In [None]:

w2v_train = train['clean_string'].copy()

In [None]:

# One list of whitespace-separated words per cleaned training comment.
w2v_train_list = [text.split() for text in w2v_train]
    
    


#w2v_train_bi = []
#for sent in w2v_train_list:
#    sent_bi = list(zip(sent, sent[1:]))
#    #print(sent_bi)
#    output_sent = []
#    for gram in sent_bi:
#        output_gram = ' '.join(gram)
#        output_sent.append(output_gram)
#    w2v_train_bi.append(output_sent)
    
#check_idx = 300

#print(w2v_train_list[check_idx])
#print(w2v_train_bi[check_idx])



In [None]:
print(len(w2v_train_list))
#print(len(w2v_train_bi))


#w2v_train_final = w2v_train_list.copy()
#w2v_train_final.extend(w2v_train_bi)
#print(len(w2v_train_list))
#print(len(w2v_train_bi))
#print(len(w2v_train_final))

#print(w2v_train_final[-8492])
#print(w2v_train_bi[-8492])
print(w2v_train_list[50:100])

In [None]:

#x = w2v_train_list[145]
#print(x)
#y = list(zip(x, x[1:], x[2:]))
#print(y)
#output_sent = []
#for gram in y:
#    output = ' '.join(gram)
#    print(output)
#    output_sent.append(output)
#print(output_sent)

#print('before', w2v_train_list[1])


#get_bigrams = gensim.models.phrases.Phrases(w2v_train_list, 
#                                            delimiter=" ".encode(), min_count=5, threshold=10)
#bigrams_detector = gensim.models.phrases.Phraser(get_bigrams)

#get_trigrams = gensim.models.phrases.Phrases(bigrams_detector[w2v_train_list], 
#                                                  delimiter=" ".encode(), min_count=5, threshold=10)
#trigrams_detector = gensim.models.phrases.Phraser(get_trigrams)




#print('after', w2v_train_list[1])




In [None]:

#nlp = gensim.models.word2vec.Word2Vec(w2v_train_list, size=300,   
#            window=8, min_count=1, sg=1, iter=30)

#nlp.save('data/w2v2')

#nlp = gensim.models.word2vec.Word2Vec.load('data/w2v1') 
nlp = gensim.models.word2vec.Word2Vec.load('data/w2v2') 

In [None]:
word = "data"
# Look the vector up through the KeyedVectors in `.wv`; indexing the Word2Vec
# model object itself was deprecated in gensim 3 and removed in gensim 4.
nlp.wv[word].shape


In [None]:

import tensorflow.keras.preprocessing

# Keras tokenizer mapping words to integer ids; "NaN" is the out-of-vocabulary token.
# NOTE: this rebinds the notebook-level name `tokenizer`, which previously held
# the RegexpTokenizer that stem_and_tokenize() calls.
tokenizer = tensorflow.keras.preprocessing.text.Tokenizer(lower=True, split=' ', 
                                                          oov_token="NaN", 
                                                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
# The vocabulary is built from the training token lists only.
tokenizer.fit_on_texts(w2v_train_list)
dic_vocabulary = tokenizer.word_index


# Replace every token with its integer id.
sequence_list= tokenizer.texts_to_sequences(w2v_train_list)


# Pad or truncate at the end so every sequence has exactly 15 ids.
# This rebinds X_train, which earlier held the raw `body` Series.
X_train = tensorflow.keras.preprocessing.sequence.pad_sequences(sequence_list, 
                    maxlen=15, padding="post", truncating="post")

In [None]:
import seaborn as sns
sns.heatmap(X_train==0, vmin=0, vmax=1, cbar=False)
plt.show()

In [None]:

w2v_test = test['clean_string'].copy()

# One list of whitespace-separated words per cleaned test comment.
w2v_test_list = [text.split() for text in w2v_test]

# Encode with the tokenizer fitted on the training data, then pad or truncate
# at the end to 15 ids per comment.
sequence_list = tokenizer.texts_to_sequences(w2v_test_list)
X_test = tensorflow.keras.preprocessing.sequence.pad_sequences(
    sequence_list,
    maxlen=15,
    padding="post",
    truncating="post",
)

In [None]:
sns.heatmap(X_test==0, vmin=0, vmax=1, cbar=False)
plt.show()

In [None]:

i = 0


len_txt = len(train["clean_string"].iloc[i].split())
print("from: ", train["clean_string"].iloc[i], "| len:", len_txt)


len_tokens = len(X_train[i])
print("to: ", X_train[i], "| len:", len(X_train[i]))


print("check: ", train["clean_string"].iloc[i].split()[0], 
      " -- idx in vocabulary -->", 
      dic_vocabulary[train["clean_string"].iloc[i].split()[0]])

print("vocabulary: ", dict(list(dic_vocabulary.items())[0:5]), "... (padding element, 0)")

In [None]:


# Row i holds the word2vec vector of the word whose tokenizer id is i. Row 0
# (the padding id) and words missing from the word2vec vocabulary stay all-zero.
embeddings = np.zeros((len(dic_vocabulary)+1, 300))

for word, idx in dic_vocabulary.items():
    # Explicit membership test instead of a bare `except: pass`: the bare except
    # also swallowed unrelated errors (for example, gensim 4 no longer allows
    # indexing the model directly) and would silently leave the matrix at zero.
    if word in nlp.wv:
        embeddings[idx] = nlp.wv[word]

In [None]:

word = "active"
print("dic[word]:", dic_vocabulary[word], "|idx")
print("embeddings[idx]:", embeddings[dic_vocabulary[word]].shape, 
      "|vector")

In [None]:


import tensorflow.keras.models as models
import tensorflow.keras.layers as layers


def attention_layer(inputs, neurons):
    """Weight ``inputs`` element-wise by softmax scores computed from them.

    The input is permuted with ``(2, 1)``, passed through a softmax ``Dense``
    layer with ``neurons`` units, permuted back (that layer is named
    "attention"), and multiplied with the original ``inputs``.
    # assumes `neurons` equals the sequence length so the shapes match — TODO confirm
    """
    transposed = layers.Permute((2,1))(inputs)
    scores = layers.Dense(neurons, activation="softmax")(transposed)
    attention_weights = layers.Permute((2,1), name="attention")(scores)
    return layers.multiply([inputs, attention_weights])

# Input: one padded sequence of 15 word ids per comment.
x_in = layers.Input(shape=(15,))


# Embedding layer initialised from the `embeddings` matrix and kept frozen
# (trainable=False).
x = layers.Embedding(input_dim=embeddings.shape[0],  
                     output_dim=embeddings.shape[1], 
                     weights=[embeddings],
                     input_length=15, trainable=False)(x_in)


# The attention layer is currently disabled.
#x = attention_layer(x, neurons=15)


# Two stacked bidirectional LSTMs; the first returns the full sequence so the
# second can consume it.
x = layers.Bidirectional(layers.LSTM(units=15, dropout=0.2, 
                         return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(units=15, dropout=0.2))(x)


# Dense head ending in a 2-unit softmax (one output per class).
x = layers.Dense(64, activation='relu')(x)
y_out = layers.Dense(2, activation='softmax')(x)


# Integer class labels are expected, hence the sparse categorical loss.
model = models.Model(x_in, y_out)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:

y_train = train['REMOVED'].values
y_test = test['REMOVED'].values
# dic_y_mapping: integer class id -> original label (labels taken from y_train).
dic_y_mapping = {n:label for n,label in 
                 enumerate(np.unique(y_train))}
# inverse_dic: original label -> integer class id.
inverse_dic = {v:k for k,v in dic_y_mapping.items()}
# Only y_train is re-encoded to integer ids here; y_test keeps its original values.
y_train = np.array([inverse_dic[y] for y in y_train])

In [None]:
y_train

In [None]:

#y_train = train['REMOVED'].values
#y_test = test['REMOVED'].values

#dic_y_mapping = {n:label for n,label in 
#                 enumerate(np.unique(y_train))}
#inverse_dic = {v:k for k,v in dic_y_mapping.items()}
#y_train = np.array([inverse_dic[y] for y in y_train])


training = model.fit(x=X_train, y=y_train, batch_size=256, 
                     epochs=10, shuffle=True, verbose=1, 
                     validation_split=0.3)


model.save('data/lstm2')



In [None]:

metrics = [k for k in training.history.keys() if ("loss" not in k) and ("val" not in k)]
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True)
ax[0].set(title="Training")
ax11 = ax[0].twinx()
ax[0].plot(training.history['loss'], color='black')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Loss', color='black')
for metric in metrics:
    ax11.plot(training.history[metric], label=metric)
ax11.set_ylabel("Score", color='steelblue')
ax11.legend()
ax[1].set(title="Validation")
ax22 = ax[1].twinx()
ax[1].plot(training.history['val_loss'], color='black')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Loss', color='black')
for metric in metrics:
     ax22.plot(training.history['val_'+metric], label=metric)
ax22.set_ylabel("Score", color="steelblue")
plt.show()

In [None]:
model = models.load_model('data/lstm2')
model.summary()

In [None]:
from lime import lime_text
explainer = lime_text.LimeTextExplainer(verbose=True)
explainer

In [None]:
X_test

In [None]:
np.array([X_test.tolist()[0]])

In [None]:
#select_idx = 200
print(model.predict(np.array([X_test.tolist()[11]])))



In [None]:
y_test

In [None]:
test.head(20)

In [None]:

# LimeTextExplainer perturbs a raw text string and hands classifier_fn a list of
# strings, so it needs the cleaned comment text (not the padded id array) and a
# wrapper that repeats the tokenise -> pad steps before calling the model.
def _predict_from_text(texts):
    """Return the model's class scores for a list of whitespace-separated texts."""
    sequences = tokenizer.texts_to_sequences([text.split() for text in texts])
    padded = tensorflow.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=15, padding="post", truncating="post")
    return model.predict(padded)


# Row 11 of X_test was built from the 12th cleaned test comment.
explanation = explainer.explain_instance(test['clean_string'].iloc[11],
                                         classifier_fn=_predict_from_text)

In [None]:
import gensim
import pandas as pd
pd.set_option('max_colwidth', 400)
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
import numpy as np
from lime import lime_text
import tensorflow.keras.preprocessing
import seaborn as sns
import tensorflow.keras.models as models
import tensorflow.keras.layers as layers

In [None]:
train = pd.read_csv('data/train_clean.csv')
test = pd.read_csv('data/test_clean.csv')
train

In [None]:


# Discard rows without a cleaned string, then print the remaining number of
# missing values (expected to be 0) for each split.
train = train.dropna(subset=['clean_string'])
print(train['clean_string'].isna().sum())

test = test.dropna(subset=['clean_string'])
print(test['clean_string'].isna().sum())

In [None]:
y_train = train['REMOVED'].astype(int).to_numpy()
y_test = test['REMOVED'].astype(int).to_numpy()
y_train

In [None]:


# Split every cleaned training comment on single spaces: one token list per comment.
train_list = train['clean_string'].to_list()
train_list_tokenized = [string.split(' ') for string in train_list]

print(train_list_tokenized[0:10])

# Same treatment for the test comments.
test_list = test['clean_string'].to_list()
test_list_tokenized = [string.split(' ') for string in test_list]
    



tokenizer = tensorflow.keras.preprocessing.text.Tokenizer(lower=False,
                                                          split=' ', 
                                                          oov_token="NaN", 
                                                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(train_list_tokenized)
vocabulary = tokenizer.word_index
#print(vocabulary)



train_sequence = tokenizer.texts_to_sequences(train_list_tokenized)

test_sequence = tokenizer.texts_to_sequences(test_list_tokenized)



train_seq_padded = tensorflow.keras.preprocessing.sequence.pad_sequences(train_sequence, 
                                                                         maxlen=15, 
                                                                         padding="post", 
                                                                         truncating="post")

test_seq_padded = tensorflow.keras.preprocessing.sequence.pad_sequences(test_sequence,
                                                                        maxlen=15, 
                                                                        padding='post', 
                                                                        truncating='post')

In [None]:


to_check = train_seq_padded
#to_check = test_seq_padded

sns.heatmap(to_check==0, vmin=0, vmax=1, cbar=False)
plt.show()

In [None]:


w2v = gensim.models.word2vec.Word2Vec.load('data/w2v2') 


# Row i holds the word2vec vector of the word whose tokenizer id is i. Row 0
# (the padding id) and words missing from the word2vec vocabulary stay all-zero.
embed_matrix = np.zeros((len(vocabulary)+1, 300))

for word, index in vocabulary.items():
    # Explicit membership test instead of a bare `except: pass`: the bare except
    # also swallowed unrelated errors (for example, gensim 4 no longer allows
    # indexing the model directly) and would silently leave the matrix at zero.
    if word in w2v.wv:
        embed_matrix[index] = w2v.wv[word]

In [None]:

word = "movie"
print("word index in vocab:", vocabulary[word])
print("embed matrix at index:\n", embed_matrix[vocabulary[word]])

In [None]:


def lime_predict(text):
    """Classifier function for LIME: turn text variants into model predictions.

    Parameters
    ----------
    text : list of str
        Text variants to score, each a space-separated string.

    Returns
    -------
    numpy.ndarray
        The output of ``lstm_tuned.predict``: one row of class scores per
        variant.

    Notes
    -----
    Relies on the notebook-level ``tokenizer`` and ``lstm_tuned``; both must be
    defined before this function is called.
    """
    # Same preprocessing as the training data: split on single spaces, map the
    # tokens to ids, then pad or truncate every sequence at the end to 15 ids.
    token_lists = [text_variant.split(' ') for text_variant in text]
    sequences = tokenizer.texts_to_sequences(token_lists)
    padded = tensorflow.keras.preprocessing.sequence.pad_sequences(sequences,
                                                                   maxlen=15,
                                                                   padding='post',
                                                                   truncating='post')

    # Predict on one 2-D array for the whole batch (rather than a nested Python
    # list), and do not print every perturbed sample and its prediction.
    return lstm_tuned.predict(padded)

In [None]:

model = models.load_model('data/lstm2')
model.summary()

In [None]:



pred_test_string = train['clean_string'][97]
#print(pred_test_string)

#print(lime_predict(pred_test_string))

In [None]:

explainer = lime_text.LimeTextExplainer()
explanation = explainer.explain_instance(pred_test_string, lime_predict, num_samples=20)

In [None]:
explanation.show_in_notebook()

In [None]:

# Switch between loading a saved model (False) and training a new one (True).
# NOTE: lime_predict(), defined and called in earlier cells, uses `lstm_tuned`,
# which is first created here.
train_model = False

if not train_model:
    # NOTE(review): this loads 'data/lstm4' while the training branch below
    # saves to 'data/lstm6' — confirm which saved model is intended.
    lstm_tuned = models.load_model('data/lstm4')

else:
    lstm_tuned = models.Sequential()
    lstm_tuned.add(layers.Input(shape=(15,)))
    # Frozen embedding layer initialised from embed_matrix.
    lstm_tuned.add(layers.Embedding(input_dim=embed_matrix.shape[0],
                                    output_dim=embed_matrix.shape[1],
                                    weights=[embed_matrix],
                                    input_length=15,
                                    trainable=False))
    
    # Two stacked bidirectional LSTMs; the first returns the full sequence.
    lstm_tuned.add(layers.Bidirectional(layers.LSTM(units=15,
                                                    dropout=0.2,
                                                    return_sequences=True)))
    
    lstm_tuned.add(layers.Bidirectional(layers.LSTM(units=15,
                                                    dropout=0.2)))
    
    # Dense head: 64 -> 32 -> 2-unit softmax.
    lstm_tuned.add(layers.Dense(64, activation='relu'))
    lstm_tuned.add(layers.Dense(32, activation='relu'))
    
    lstm_tuned.add(layers.Dense(2, activation='softmax'))
    
    lstm_tuned.compile(loss='sparse_categorical_crossentropy',
                       optimizer='sgd',
                       metrics=['accuracy'])
    
    lstm_tuned.summary()
    
    # 30% of the training data is held out for validation during fitting.
    results_lstm_tuned = lstm_tuned.fit(train_seq_padded, y_train,
                                        epochs=16,
                                        batch_size=128,
                                        validation_split=0.3,
                                        verbose=1)
    lstm_tuned.save('data/lstm6')

In [None]:
lstm_tuned.evaluate(test_seq_padded, y_test)

In [None]:
lstm_tuned.evaluate(test_seq_padded, y_test)

In [None]:

# Load the GloVe vectors from the text file into a {word: vector} dictionary.
nlp = dict()

with open('data/glove.6B/glove.6B.300d.txt', 'r', encoding='utf8') as f:
    # Iterate over the file lazily instead of materialising it with readlines(),
    # and number the lines so the failure message reports the real position
    # (the previous counter was never incremented and always printed 0).
    for w, line in enumerate(f):
        line = line.split(' ')

        try:
            nlp[line[0]] = np.array(line[1:], dtype=float)
        except ValueError:
            # A field could not be parsed as a number: report the line and skip it.
            print('failed on line', w)

print(nlp['candidate'])

In [None]:
embed_matrix_glove = np.zeros((len(vocabulary)+1, 300))

for word, index in vocabulary.items():
    try:
        embed_matrix[index] =  nlp[word]
    except:
        pass

In [None]:

train_model = True

if not train_model:
    lstm_glove = models.load_model('data/lstm_glove1')

else:
    lstm_glove = models.Sequential()
    lstm_glove.add(layers.Input(shape=(15,)))
    lstm_glove.add(layers.Embedding(input_dim=embed_matrix.shape[0],
                                    output_dim=embed_matrix.shape[1],
                                    weights=[embed_matrix],
                                    input_length=15,
                                    trainable=False))
    
    lstm_glove.add(layers.Bidirectional(layers.LSTM(units=15,
                                                    dropout=0.2,
                                                    return_sequences=True)))
    
    lstm_glove.add(layers.Bidirectional(layers.LSTM(units=15,
                                                    dropout=0.2)))
    
    lstm_glove.add(layers.Dense(64, activation='relu'))
    lstm_glove.add(layers.Dense(32, activation='relu'))
    
    lstm_glove.add(layers.Dense(2, activation='softmax'))
    
    lstm_glove.compile(loss='sparse_categorical_crossentropy',
                       optimizer='sgd',
                       metrics=['accuracy'])
    
    lstm_glove.summary()
    
    results_lstm_glove = lstm_glove.fit(train_seq_padded, y_train,
                                        epochs=16,
                                        batch_size=128,
                                        validation_split=0.15,
                                        verbose=1)
    lstm_glove.save('data/lstm_glove1')

In [None]:
lstm_glove.evaluate(test_seq_padded, y_test)

In [None]:

print(len(train))
print(len(test))

# Stack both splits into a single frame.
data_cleaned_all = pd.concat([train, test])

print(len(data_cleaned_all))

# Renumber the rows 0..n-1 without keeping the old labels, and drop the index
# column that was stored in the CSV files.
data_cleaned_all = (
    data_cleaned_all
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0'])
)
data_cleaned_all

In [None]:

print(data_cleaned_all.isna().sum())
(data_cleaned_all.loc[data_cleaned_all['clean_string'].duplicated(keep=False)])

In [None]:

#data_cleaned_all.to_csv('data/cleaned_all.csv')

In [None]:
from transformers import AutoTokenizer
tf_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
clean_all = pd.read_csv('data/cleaned_all.csv')
#something = tf_tokenizer(clean_all['clean_string'].to_list(), truncation=True)
#print(type(something))
clean_all.drop(['Unnamed: 0', 'body', 'lemma', 'tokenized', 'no_stop'], axis=1, inplace=True)
clean_all['label'] = clean_all['REMOVED'].astype(int)
clean_all

In [None]:
from datasets import Dataset
ds_test = Dataset.from_pandas(clean_all)
ds_test

In [None]:
#print(type(tf_input_ids))
#print(tf_input_ids)
#print(something[3])

def tf_preprocess(input_dataset):
    """Run the notebook-level ``tf_tokenizer`` over the ``clean_string`` field
    of ``input_dataset`` with truncation enabled and return its output."""
    texts = input_dataset['clean_string']
    encoded = tf_tokenizer(texts, truncation=True)
    return encoded

ds_tokenized = ds_test.map(tf_preprocess, batched=True)
ds_tokenized

In [None]:
from transformers import DataCollatorWithPadding

collator = DataCollatorWithPadding(tokenizer=tf_tokenizer, return_tensors='tf')

In [None]:
# Hold out 10% of the tokenized data for validation. The fixed seed keeps the
# split, and therefore the reported metrics, reproducible across runs.
dict_or_smth = ds_tokenized.train_test_split(test_size=0.1, shuffle=True, seed=42)

# Training batches are reshuffled; the collator pads each batch of 16 examples.
tf_train_set = dict_or_smth["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    shuffle=True,
    batch_size=16,
    collate_fn=collator,
)

# Validation batches keep a fixed order.
tf_validation_set = dict_or_smth["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    shuffle=False,
    batch_size=16,
    collate_fn=collator,
)

In [None]:
for elem in tf_train_set.take(1):
  print (elem)

In [None]:

from transformers import create_optimizer

import tensorflow as tf

batch_size = 16

num_epochs = 5

batches_per_epoch = len(dict_or_smth["train"]) // batch_size

total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [None]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

In [None]:
import tensorflow as tf

model.compile(optimizer=optimizer)

In [None]:
# Train for the number of epochs the learning-rate schedule was built for
# (`num_epochs`, which sets total_train_steps). With a hard-coded 3 epochs
# training stopped before the schedule had finished decaying.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs)

In [None]:
#from sklearn.model_selection import train_test_split

#break into train,test,val splits
#tf_X_train_val, tf_X_test, tf_y_train_val, tf_y_test = train_test_split(clean_all['clean_string'], clean_all['REMOVED'], 
#                                                                        random_state=8492, test_size=0.20)
#tf_X_train, tf_X_val, tf_y_train, tf_y_val = train_test_split(tf_X_train_val, tf_y_train_val,
#                                                             random_state=1228, test_size=0.125)

#print(len(tf_X_test))
#print(len(tf_X_val))
#print(len(tf_X_train))

In [None]:

#def tf_process(input_series):
#    result = tf_tokenizer(input_series.to_list(), truncation=True)
#    return result

#tf_X_train_processed = tf_process(tf_X_train)

#tf_X_val_processed = tf_process(tf_X_val)

#tf_X_test_processed = tf_process(tf_X_test)
