In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored there can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import re
import string
import nltk


# Read Data

In [3]:
# Load the tweet dataset from the mounted Drive folder into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/Racist Detection/data.csv')

In [None]:
# Preview the first rows of the raw dataset.
data.head()

In [None]:
# Display the raw 'tweet' column.
data['tweet']

In [None]:
# Count how many rows carry each value of 'label'.
data['label'].value_counts()

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
data.info()

# Preprocessing

In [4]:
def remove_pattern(text, pattern):
    """Return ``text`` with every match of the regex ``pattern`` removed.

    Args:
        text: The input string to clean.
        pattern: A regular expression (string or compiled pattern) describing
            the substrings to delete.

    Returns:
        A new string in which each non-overlapping match of ``pattern`` has
        been replaced by the empty string. ``text`` is returned unchanged when
        nothing matches.
    """
    # Substitute the pattern itself rather than feeding each matched substring
    # back into re.sub as a new regex: matched text may contain regex
    # metacharacters (e.g. "(" or "+") that would be misinterpreted or raise
    # re.error, and a short match such as a bare "@" would otherwise strip
    # characters from unrelated parts of the text.
    return re.sub(pattern, "", text)

In [5]:
# Remove Twitter handles (@user)

# Pattern for a handle: '@' followed by any run of word characters.
# A raw string is used so that "\w" reaches the regex engine verbatim instead
# of being parsed as an (invalid) Python string escape.
adsign_user_pattern = r"@[\w]*"

# Strip the handles from every tweet. Series.apply is used instead of
# np.vectorize so the cell also works on an empty frame.
data['clean_tweet'] = data['tweet'].apply(remove_pattern, args=(adsign_user_pattern,))

data.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause th...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation


In [6]:
# Remove special characters, numbers and punctuation

# Anything that is not an ASCII letter or '#' (kept so hashtags survive).
punctuations_pattern = "[^a-zA-Z#]"

# Replace every matching character with a space. regex=True must be explicit:
# newer pandas versions treat the pattern as a literal string by default,
# which would leave the tweets untouched.
data['clean_tweet'] = data['clean_tweet'].str.replace(punctuations_pattern, " ", regex=True)

data.head()

  data['clean_tweet'] = data['clean_tweet'].str.replace(punctuations_pattern, " ")


Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can t use cause th...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide society now #motivation


In [7]:
# Drop short words: keep only whitespace-separated words longer than 3 characters
data['clean_tweet'] = data['clean_tweet'].apply(
    lambda tweet: " ".join(filter(lambda word: len(word) > 3, tweet.split()))
)

data.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when father dysfunctional selfish drags kids i...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit cause they offer wheelchai...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model love take with time
4,5,0,factsguide: society now #motivation,factsguide society #motivation


In [8]:
# Split every cleaned tweet on whitespace so each word becomes one token
tokenized_tweet = data['clean_tweet'].apply(str.split)
tokenized_tweet.head()

0    [when, father, dysfunctional, selfish, drags, ...
1    [thanks, #lyft, credit, cause, they, offer, wh...
2                              [bihday, your, majesty]
3                     [#model, love, take, with, time]
4                   [factsguide, society, #motivation]
Name: clean_tweet, dtype: object

In [9]:
# Reduce every token to its Porter stem
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda sentence: list(map(stemmer.stem, sentence)))
tokenized_tweet.head()

0    [when, father, dysfunct, selfish, drag, kid, i...
1    [thank, #lyft, credit, caus, they, offer, whee...
2                              [bihday, your, majesti]
3                     [#model, love, take, with, time]
4                         [factsguid, societi, #motiv]
Name: clean_tweet, dtype: object

In [10]:
# Combine the tokens of each tweet back into a single space-separated string.
# A vectorised apply replaces the positional `tokenized_tweet[i] = ...` loop,
# which relied on the Series having a default 0..n-1 index. Entries that are
# already strings are left alone so re-running the cell does not split them
# into individual characters.
tokenized_tweet = tokenized_tweet.apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)

data['clean_tweet'] = tokenized_tweet
data.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when father dysfunct selfish drag kid into dys...
1,2,0,@user @user thanks for #lyft credit i can't us...,thank #lyft credit caus they offer wheelchair ...
2,3,0,bihday your majesty,bihday your majesti
3,4,0,#model i love u take with u all the time in ...,#model love take with time
4,5,0,factsguide: society now #motivation,factsguid societi #motiv


# Prepare the data for the model

In [11]:
import nltk
# Fetch the NLTK resources requested by this notebook, one after another.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)
from nltk.corpus import stopwords

# NOTE: from here on `stopwords` is the set of English stop words, no longer
# the corpus reader imported above.
english_stop_words = stopwords.words('english')
stopwords = set(english_stop_words)
print(stopwords)

{'its', 'why', 'these', 'wasn', 'ourselves', 'yourselves', 'she', 'what', 'did', "shouldn't", 'of', 'y', 'between', 'until', "doesn't", 'isn', "hadn't", 'wouldn', 're', 'had', 'or', "mustn't", 'my', "you'd", 'from', 'his', 'over', 'not', 'theirs', 'a', 'to', 'yours', 'too', 'needn', 'himself', 'for', "isn't", 'where', 'up', 'don', 't', 'because', 'further', 'in', 'out', 'each', 'ma', 'can', 'herself', 'are', 'during', 'those', 'such', 'all', 'which', 'aren', 'doing', 'll', 'other', 'it', 'nor', 'no', 'her', "mightn't", 'i', 'them', 'the', 'if', 'whom', 'themselves', 'just', 'he', "couldn't", 'do', 'yourself', 'you', 'some', 'here', 'both', 'very', 'below', 'off', 'few', 'hadn', 'while', "shan't", 'under', "she's", 'they', 'hers', 'our', 'again', 'him', 'and', 'that', 'against', 'after', 's', 'so', "should've", 'm', "won't", "that'll", 'then', 'when', "you're", "needn't", 'into', "haven't", 'your', 'been', 'any', "wasn't", 'should', 'who', 'own', 'haven', 'am', 'weren', "you'll", 'now',

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Text normalisers used when building the model dataset below.
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Rebinds `stemmer` to a fresh PorterStemmer (one was already created in the
# preprocessing section).
stemmer = PorterStemmer()
# WordNet-based lemmatizer.
lem = WordNetLemmatizer()

In [13]:
from nltk import word_tokenize


def _normalize_tweet(text):
    """Tokenize ``text``, drop stop words, stem and lemmatize, then re-join.

    Args:
        text: One cleaned tweet as a string.

    Returns:
        The processed tokens joined by single spaces. Any '#' inside a token
        is replaced by a space, exactly as in the original per-row loop.
    """
    # tokenize the words
    tokens = word_tokenize(text)
    # remove stopwords (`stopwords` is the set built in the previous section)
    tokens = [word for word in tokens if word not in stopwords]
    # stem the words
    tokens = [stemmer.stem(word) for word in tokens]
    # lemmatize the words and blank out hashtag markers
    tokens = [lem.lemmatize(word).replace('#', ' ') for word in tokens]
    return ' '.join(tokens)


# Build the whole frame in one step instead of appending one row at a time
# with `dataset.loc[index] = ...`, which reallocates on every row. The index
# of `data` is preserved, so rows stay aligned with the source frame.
dataset = pd.DataFrame(
    {
        'clean_tweet': data['clean_tweet'].map(_normalize_tweet),
        'label': data['label'],
    },
    columns=['clean_tweet', 'label'],
)

In [14]:
# Fit a Keras tokenizer on the processed tweets, then encode every tweet as a
# list of integer word indices
tweet_texts = dataset['clean_tweet']
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(tweet_texts)
sequences = tokenizer.texts_to_sequences(tweet_texts)

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Targets for training
labels = dataset['label']
# Pad every sequence at the end up to the length of the longest one
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_length)

# Training the model 

In [19]:
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU


# Embedding -> LSTM -> sigmoid binary classifier
embedding_dim = 100
# One more than the number of distinct words known to the tokenizer
vocab_size = 1 + len(tokenizer.word_index)

# NOTE(review): sequences are post-padded but the Embedding layer does not
# mask the padding value — consider whether mask_zero=True is wanted.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy objective, Adam optimiser, accuracy reported per epoch
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [20]:
# Save the model with the best validation accuracy seen so far. The
# checkpoint needs a real target path (an empty string gives the callback
# nowhere valid to write); the '.keras' suffix selects the native Keras format.
# The path is relative to the current working directory.
checkpoint_path = 'best_model.keras'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, monitor='val_accuracy', mode='max', save_best_only=True)

# Stop once the validation loss has not improved for 3 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Train the model, holding out the last 25% of the samples for validation.
history = model.fit(padded_sequences, labels, epochs=20, batch_size=15, validation_split=0.25, callbacks=[early_stopping, model_checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
