### Offensive Language Model training

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!cd /content/gdrive/MyDrive/devshouse24/

In [None]:
!ls

gdrive	sample_data


In [55]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import string

In [13]:
text_df = pd.read_csv("/content/gdrive/MyDrive/devshouse24/OLID/olid-training-v1.0.tsv",delimiter="\t")
df = text_df[["id","tweet","subtask_a"]]
df.head()

Unnamed: 0,id,tweet,subtask_a
0,86426,@USER She should ask a few native Americans wh...,OFF
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF
2,16820,Amazon is investigating Chinese employees who ...,NOT
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13240 entries, 0 to 13239
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         13240 non-null  int64 
 1   tweet      13240 non-null  object
 2   subtask_a  13240 non-null  object
dtypes: int64(1), object(2)
memory usage: 310.4+ KB


In [15]:
print(df['subtask_a'].value_counts())

NOT    8840
OFF    4400
Name: subtask_a, dtype: int64


In [107]:
def clean_tweets(df):
    punctuations = string.punctuation
    df.loc[:, 'tweet'] = df.tweet.str.replace('@USER', '') #Remove mentions (@USER)
    df.loc[:, 'tweet'] = df.tweet.str.replace('URL', '') #Remove URLs
    df.loc[:, 'tweet'] = df.tweet.str.replace('&amp', 'and') #Replace ampersand (&) with and
    df.loc[:, 'tweet'] = df.tweet.str.replace('&lt','') #Remove &lt
    df.loc[:, 'tweet'] = df.tweet.str.replace('&gt','') #Remove &gt
    df.loc[:, 'tweet'] = df.tweet.str.replace('\d+','') #Remove numbers
    df.loc[:, 'tweet'] = df.tweet.str.lower() #Lowercase
    #Remove punctuations
    for punctuation in punctuations:
        df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')

    df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
    #Remove emojis
    df.loc[:, 'tweet'] = df.tweet.str.strip() #Trim leading and trailing whitespaces

In [17]:
clean_tweets(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'tweet'] = df.tweet.str.replace('@USER', '') #Remove mentions (@USER)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'tweet'] = df.tweet.str.replace('URL', '') #Remove URLs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'tweet'] = df.tweet.str.replace('&amp', 'and') #R

In [18]:
print(df)

          id                                              tweet subtask_a
0      86426  she should ask a few native americans what the...       OFF
1      90194                    go home youre drunk  maga trump       OFF
2      16820  amazon is investigating chinese employees who ...       NOT
3      62688  someone shouldvetaken this piece of shit to a ...       OFF
4      43605  obama wanted liberals and illegals to move int...       NOT
...      ...                                                ...       ...
13235  95338  sometimes i get strong vibes from people and t...       OFF
13236  67210  benidorm   creamfields   maga    not too shabb...       NOT
13237  82921   and why report this garbage  we dont give a crap       OFF
13238  27429                                              pussy       OFF
13239  46552  spanishrevenge vs justice humanrights and free...       NOT

[13240 rows x 3 columns]


In [46]:
train_olid = df.drop(['id'],axis=1)
train_olid.head()

Unnamed: 0,tweet,subtask_a
0,she should ask a few native americans what the...,OFF
1,go home youre drunk maga trump,OFF
2,amazon is investigating chinese employees who ...,NOT
3,someone shouldvetaken this piece of shit to a ...,OFF
4,obama wanted liberals and illegals to move int...,NOT


In [47]:
train = pd.get_dummies(train_olid,columns=['subtask_a'])

In [27]:
train

Unnamed: 0,tweet,subtask_a_NOT,subtask_a_OFF
0,she should ask a few native americans what the...,0,1
1,go home youre drunk maga trump,0,1
2,amazon is investigating chinese employees who ...,1,0
3,someone shouldvetaken this piece of shit to a ...,0,1
4,obama wanted liberals and illegals to move int...,1,0
...,...,...,...
13235,sometimes i get strong vibes from people and t...,0,1
13236,benidorm creamfields maga not too shabb...,1,0
13237,and why report this garbage we dont give a crap,0,1
13238,pussy,0,1


In [142]:
path = "/content/gdrive/MyDrive/devshouse24/toxicity/"
EMBEDDING_FILE=f'{path}glove.6B.50d.txt'

In [200]:
embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 #

In [229]:
list_sentences_train = train["tweet"].fillna("_na_").values
list_classes = ["OFF_NOT","OFF_OFF"]
y = train[list_classes].values.astype("int64")

In [230]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

In [231]:
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

In [232]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))


In [233]:
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.020940498, 0.6441043)

In [234]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [235]:
def create_model():
  model = tf.keras.Sequential([
      Input(shape=(maxlen,)),
      Embedding(max_features, embed_size, weights=[embedding_matrix]),
      Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
      GlobalMaxPool1D(),
      Dense(50, activation="relu"),
      Dropout(0.1),
      Dense(2, activation="sigmoid"),
  ])

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model

In [236]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=1)

In [237]:
model = create_model()
model.fit(X_t, y, batch_size=32, epochs=10, validation_split=0.1,callbacks=[callback]);

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [92]:
train_solid = pd.read_csv("/content/gdrive/MyDrive/devshouse24/OLID/test_a_tweets_all.tsv",delimiter="\t")
train_solid.head()

Unnamed: 0,id,tweet
0,B0,Has @USER quit? I've not heard of any #knifecr...
1,B1,"In celebration of Emancipation Day, we urge yo..."
2,B2,@USER @USER It’d be a literal dream come true ...
3,B3,Brilliant news to read that Hoggy has signed a...
4,B4,@USER She speaks of the truth 😌


In [109]:
clean_tweets(train_solid)

  df.loc[:, 'tweet'] = df.tweet.str.replace('\d+','') #Remove numbers
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')


In [110]:
train_solid

Unnamed: 0,id,tweet
0,B0,has quit ive not heard of any knifecrime today
1,B1,in celebration of emancipation day we urge you...
2,B2,itd be a literal dream come true for me to w...
3,B3,brilliant news to read that hoggy has signed a...
4,B4,she speaks of the truth
...,...,...
5990,BC2101,nothing about trump is human or normal unles...
5991,BC2102,oh shit that sounds like a cool time tho
5992,BC2103,i gotta say that shit to you one time so i c...
5993,BC2104,trump does everything he can to destroy the f...


In [111]:
solid_labels = pd.read_csv("/content/gdrive/MyDrive/devshouse24/OLID/test_a_labels_all.csv")

In [113]:
solid_labels.rename(columns = {'BC0':'id'}, inplace = True)

In [116]:
train_df_solid = train_solid.merge(solid_labels,on="id")

In [125]:
train_df_solid.columns[2]

'OFF'

In [127]:
train_df_solid = pd.get_dummies(train_df_solid,columns=["OFF"])

In [128]:
train_df_solid

Unnamed: 0,id,tweet,OFF_NOT,OFF_OFF
0,B0,has quit ive not heard of any knifecrime today,1,0
1,B1,in celebration of emancipation day we urge you...,1,0
2,B2,itd be a literal dream come true for me to w...,1,0
3,B3,brilliant news to read that hoggy has signed a...,1,0
4,B4,she speaks of the truth,1,0
...,...,...,...,...
5987,BC2101,nothing about trump is human or normal unles...,0,1
5988,BC2102,oh shit that sounds like a cool time tho,0,1
5989,BC2103,i gotta say that shit to you one time so i c...,0,1
5990,BC2104,trump does everything he can to destroy the f...,0,1


In [129]:
train.head()

Unnamed: 0,tweet,subtask_a_NOT,subtask_a_OFF
0,she should ask a few native americans what the...,0,1
1,go home youre drunk maga trump,0,1
2,amazon is investigating chinese employees who ...,1,0
3,someone shouldvetaken this piece of shit to a ...,0,1
4,obama wanted liberals and illegals to move int...,1,0


In [130]:
train.rename(columns = {'subtask_a_NOT':'OFF_NOT','subtask_a_OFF':'OFF_OFF'}, inplace = True)

In [131]:
train

Unnamed: 0,tweet,OFF_NOT,OFF_OFF
0,she should ask a few native americans what the...,0,1
1,go home youre drunk maga trump,0,1
2,amazon is investigating chinese employees who ...,1,0
3,someone shouldvetaken this piece of shit to a ...,0,1
4,obama wanted liberals and illegals to move int...,1,0
...,...,...,...
13235,sometimes i get strong vibes from people and t...,0,1
13236,benidorm creamfields maga not too shabb...,1,0
13237,and why report this garbage we dont give a crap,0,1
13238,pussy,0,1


In [132]:
train = pd.concat([train,train_df_solid])

In [133]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19232 entries, 0 to 5991
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   tweet    19232 non-null  object
 1   OFF_NOT  19232 non-null  uint8 
 2   OFF_OFF  19232 non-null  uint8 
 3   id       5992 non-null   object
dtypes: object(2), uint8(2)
memory usage: 488.3+ KB


In [137]:
train = train.drop('id',axis=1)

In [138]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19232 entries, 0 to 5991
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   tweet    19232 non-null  object
 1   OFF_NOT  19232 non-null  uint8 
 2   OFF_OFF  19232 non-null  uint8 
dtypes: object(1), uint8(2)
memory usage: 338.1+ KB


In [140]:
train = train.drop_duplicates(subset=['tweet'])

In [141]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18710 entries, 0 to 5991
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   tweet    18710 non-null  object
 1   OFF_NOT  18710 non-null  uint8 
 2   OFF_OFF  18710 non-null  uint8 
dtypes: object(1), uint8(2)
memory usage: 328.9+ KB


In [217]:
model.save_weights('./final_checkpoints/my_checkpoint')

In [219]:
!zip -r final_checkpoints.zip final_checkpoints

  adding: final_checkpoints/ (stored 0%)
  adding: final_checkpoints/my_checkpoint.index (deflated 62%)
  adding: final_checkpoints/my_checkpoint.data-00000-of-00001 (deflated 5%)
  adding: final_checkpoints/checkpoint (deflated 49%)


In [161]:
def predict_probs(model,x:str):
  sample_token = tokenizer.texts_to_sequences([x])
  sample_X_t = pad_sequences(sample_token, maxlen=maxlen)
  y = model.predict(sample_X_t,verbose=0)
  print(y)

In [216]:
predict_probs(model,"The project failed because Jane didnt do her part")

[[0.99704057 0.00275337]]


In [170]:
path = "/content/gdrive/MyDrive/devshouse24/toxicity/"
EMBEDDING_FILE=f'{path}glove.6B.50d.txt'
TRAIN_DATA_FILE=f'{path}train.csv'
TEST_DATA_FILE=f'{path}test.csv'

In [182]:
new_dataset = pd.read_csv(TRAIN_DATA_FILE)
new_dataset.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [183]:
new_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [184]:
new_dataset = new_dataset[['comment_text','toxic']]

In [185]:
new_dataset.head()

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [186]:
new_dataset.rename(columns = {'comment_text':'tweet','toxic':'OFF'}, inplace = True)

In [187]:
def clean_tweets(df):
    punctuations = string.punctuation
    df.loc[:, 'tweet'] = df.tweet.str.replace('@USER', '') #Remove mentions (@USER)
    df.loc[:, 'tweet'] = df.tweet.str.replace('URL', '') #Remove URLs
    df.loc[:, 'tweet'] = df.tweet.str.replace('&amp', 'and') #Replace ampersand (&) with and
    df.loc[:, 'tweet'] = df.tweet.str.replace('&lt','') #Remove &lt
    df.loc[:, 'tweet'] = df.tweet.str.replace('&gt','') #Remove &gt
    df.loc[:, 'tweet'] = df.tweet.str.replace('\d+','') #Remove numbers
    df.loc[:, 'tweet'] = df.tweet.str.replace('\n',' ') #Remove numbers
    df.loc[:, 'tweet'] = df.tweet.str.lower() #Lowercase
    #Remove punctuations
    for punctuation in punctuations:
        df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')

    df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
    #Remove emojis
    df.loc[:, 'tweet'] = df.tweet.str.strip() #Trim leading and trailing whitespaces

In [188]:
clean_tweets(new_dataset)

  df.loc[:, 'tweet'] = df.tweet.str.replace('\d+','') #Remove numbers
  df.loc[:, 'tweet'] = df.tweet.str.replace(punctuation, '')


In [189]:
new_dataset.head()

Unnamed: 0,tweet,OFF
0,explanation why the edits made under my userna...,0
1,daww he matches this background colour im seem...,0
2,hey man im really not trying to edit war its j...,0
3,more i cant make any real suggestions on impr...,0
4,you sir are my hero any chance you remember wh...,0


In [194]:
new_dataset

Unnamed: 0,tweet,OFF_0,OFF_1
0,explanation why the edits made under my userna...,1,0
1,daww he matches this background colour im seem...,1,0
2,hey man im really not trying to edit war its j...,1,0
3,more i cant make any real suggestions on impr...,1,0
4,you sir are my hero any chance you remember wh...,1,0
...,...,...,...
159566,and for the second time of asking when your vi...,1,0
159567,you should be ashamed of yourself that is a ...,1,0
159568,spitzer umm theres no actual article for pro...,1,0
159569,and it looks like it was actually you who put ...,1,0


In [195]:
new_dataset.rename(columns = {'OFF_0':'OFF_NOT','OFF_1':'OFF_OFF'}, inplace = True)

In [196]:
train = pd.concat([new_dataset,train])

In [197]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178281 entries, 0 to 5991
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   tweet    178281 non-null  object
 1   OFF_NOT  178281 non-null  uint8 
 2   OFF_OFF  178281 non-null  uint8 
dtypes: object(1), uint8(2)
memory usage: 3.1+ MB


In [214]:
new_dataset['OFF_OFF'].value_counts()

0    144277
1     15294
Name: OFF_OFF, dtype: int64

In [221]:
train.to_csv(path+'data.csv')

In [224]:
count_class_0, count_class_1 = train['OFF_OFF'].value_counts()

# Divide by class
df_class_0 = train[train['OFF_OFF'] == 0]
df_class_1 = train[train['OFF_OFF'] == 1]

In [225]:
df_class_0_under = df_class_0.sample(count_class_1)

In [228]:
train = pd.concat([df_class_0_under, df_class_1], axis=0)

In [238]:
model.save('offense_classification_v3.keras')