In [21]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khale\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Load the dataset
reddit_Data = pd.read_csv('Reddit_Data.csv') 

# Print the column names
print(reddit_Data.columns)

Index(['clean_comment', 'category'], dtype='object')


In [23]:
from nltk.tokenize import word_tokenize

def get_dict(words):
    sorted_words = sorted(set(words))  # Sort words and remove duplicates
    word2Ind = {word: i for i, word in enumerate(sorted_words)}
    Ind2word = {i: word for i, word in enumerate(sorted_words)}
    return word2Ind, Ind2word  # Manually retyped to remove U+00A0

In [24]:
import emoji
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))  # Load stopwords

def clean_comment(reddit_Data):
    if not isinstance(reddit_Data, str):  # Check if data is not a string
        reddit_Data = ""  # Replace NaN or non-string values with an empty string
    reddit_Data = reddit_Data.lower()  # Convert to lowercase
    reddit_Data = re.sub(r'http\S+|www\S+', '', reddit_Data)
    reddit_Data = emoji.replace_emoji(reddit_Data, replace='')  # Remove emojis
    reddit_Data = re.sub(r'[^a-zA-Z0-9\s]', '', reddit_Data)  # Remove special characters (except spaces)
    reddit_Data = re.sub(r'\s+', ' ', reddit_Data).strip()  # Remove extra spaces
    return reddit_Data


print(f'After cleaning:  {reddit_Data}')

After cleaning:                                             clean_comment  category
0       family mormon have never tried explain them t...         1
1      buddhism has very much lot compatible with chr...         1
2      seriously don say thing first all they won get...        -1
3      what you have learned yours and only yours wha...         0
4      for your own benefit you may want read living ...         1
...                                                  ...       ...
37244                                              jesus         0
37245  kya bhai pure saal chutiya banaya modi aur jab...         1
37246              downvote karna tha par upvote hogaya          0
37247                                         haha nice          1
37248             facebook itself now working bjp’ cell          0

[37249 rows x 2 columns]


In [25]:
# Extract text from the correct column
train_texts = reddit_Data['clean_comment'].tolist()
train_labels = reddit_Data['category'].tolist()  # Assuming 'category' is the sentiment label

# Print first few samples to verify
print(train_texts[:5])

print(train_labels[:5])

[' family mormon have never tried explain them they still stare puzzled from time time like some kind strange creature nonetheless they have come admire for the patience calmness equanimity acceptance and compassion have developed all the things buddhism teaches ', 'buddhism has very much lot compatible with christianity especially considering that sin and suffering are almost the same thing suffering caused wanting things shouldn want going about getting things the wrong way christian this would mean wanting things that don coincide with god will and wanting things that coincide but without the aid jesus buddhism could also seen proof god all mighty will and omnipotence certainly christians are lucky have one such christ there side but what about everyone else well many christians believe god grace salvation and buddhism god way showing grace upon others would also help study the things jesus said and see how buddha has made similar claims such rich man getting into heaven joke basica

In [26]:
# Print column names to check correct names
print(reddit_Data.columns)

# Ensure 'clean_comment' column exists and contains only strings
reddit_Data['clean_comment'] = reddit_Data['clean_comment'].fillna("").astype(str)

# Split dataset into train and test sets (if not already split)
train_texts = reddit_Data['clean_comment'].tolist()  # ✅ Train set
test_texts = reddit_Data['clean_comment'].tolist()   # ✅ Test set (if not separately defined)
reddit_texts = reddit_Data['clean_comment'].tolist()  # ✅ Twitter texts

# Apply cleaning function
train_texts = [clean_comment(text) for text in train_texts]
test_texts = [clean_comment(text) for text in test_texts]
reddit_texts = [clean_comment(text) for text in reddit_texts]

# Print samples to check correctness
print(train_texts[:5])


Index(['clean_comment', 'category'], dtype='object')
['family mormon have never tried explain them they still stare puzzled from time time like some kind strange creature nonetheless they have come admire for the patience calmness equanimity acceptance and compassion have developed all the things buddhism teaches', 'buddhism has very much lot compatible with christianity especially considering that sin and suffering are almost the same thing suffering caused wanting things shouldn want going about getting things the wrong way christian this would mean wanting things that don coincide with god will and wanting things that coincide but without the aid jesus buddhism could also seen proof god all mighty will and omnipotence certainly christians are lucky have one such christ there side but what about everyone else well many christians believe god grace salvation and buddhism god way showing grace upon others would also help study the things jesus said and see how buddha has made similar c

In [27]:
# Import necessary modules
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define vocabulary size and max sequence length
VOCAB_SIZE = 20000  # Max number of words in vocabulary
MAX_LENGTH = 200  # Max length of sequences (truncated or padded)

# Ensure texts are loaded
if 'train_texts' not in locals() or 'test_texts' not in locals() or 'reddit_texts' not in locals():
    print("Error: One or more text datasets (train_texts, test_texts, reddit_texts) are not defined!")
else:
    # Tokenizer: Fit on combined dataset
    tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
    tokenizer.fit_on_texts(train_texts + reddit_texts)

    # Convert text to sequences
    train_sequences = tokenizer.texts_to_sequences(train_texts)
    test_sequences = tokenizer.texts_to_sequences(test_texts)
    reddit_sequences = tokenizer.texts_to_sequences(reddit_texts)

    # Pad sequences to ensure uniform shape
    train_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
    test_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
    twitter_padded = pad_sequences(reddit_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')

    print("✅ Tokenization & Padding Complete!")


✅ Tokenization & Padding Complete!


In [28]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Model parameters
EMBEDDING_DIM = 128  # Size of word embeddings
RNN_UNITS = 64  # Number of RNN units

# Ensure training data exists
if 'train_padded' not in locals() or 'train_labels' not in locals():
    print("Error: Training data (train_padded) or labels (train_labels) are not defined!")
else:
    # Define RNN model
    rnn_model = Sequential([
        Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_LENGTH, trainable=True),
        SimpleRNN(RNN_UNITS, activation='tanh', return_sequences=False),
        Dense(1, activation='sigmoid')  # Binary classification
    ])

    # Compile model
    rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Model summary
    rnn_model.summary()

    print("✅ RNN Model Created Successfully!")



✅ RNN Model Created Successfully!


In [30]:
from sklearn.model_selection import train_test_split

# Extract text and labels
reddit_texts = reddit_Data['clean_comment'].tolist()
reddit_labels = reddit_Data['category'].values  # Adjust this column name if necessary


# Split data: 80% training, 20% testing
train_texts, test_texts, train_labels, test_labels = train_test_split(
    reddit_texts, reddit_labels, test_size=0.2, random_state=42
)

# Convert labels to NumPy arrays for model compatibility
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [31]:
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [33]:
print(reddit_Data.columns)
print(reddit_Data['category'].unique())

Index(['clean_comment', 'category'], dtype='object')
[ 1 -1  0]


In [34]:
print(f"train_padded shape: {train_padded.shape}")
print(f"train_labels shape: {train_labels.shape}")
print(f"test_padded shape: {test_padded.shape}")
print(f"test_labels shape: {test_labels.shape}")

train_padded shape: (37249, 200)
train_labels shape: (29799,)
test_padded shape: (37249, 200)
test_labels shape: (7450,)


In [35]:
print(f"Total dataset size: {len(reddit_Data)}")
print(f"Unique labels: {reddit_Data['category'].unique()}")

Total dataset size: 37249
Unique labels: [ 1 -1  0]


In [36]:

from sklearn.model_selection import train_test_split

# Convert text & labels to lists
texts = reddit_Data['clean_comment'].tolist()
labels = reddit_Data['category'].values  # Ensure labels match texts

# Correct train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

print(f"Train size: {len(train_texts)}, Test size: {len(test_texts)}")
print(f"Train labels: {len(train_labels)}, Test labels: {len(test_labels)}")

Train size: 29799, Test size: 7450
Train labels: 29799, Test labels: 7450


In [37]:
# Tokenizer
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts + test_texts)

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad sequences
train_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')

# Check new sizes
print(f"train_padded shape: {train_padded.shape}, train_labels shape: {len(train_labels)}")
print(f"test_padded shape: {test_padded.shape}, test_labels shape: {len(test_labels)}")

train_padded shape: (29799, 200), train_labels shape: 29799
test_padded shape: (7450, 200), test_labels shape: 7450


In [38]:
print("Unique values in train_labels:", np.unique(train_labels))
print("Unique values in test_labels:", np.unique(test_labels))

# Check for NaN values
print("NaN in train_labels:", np.isnan(train_labels).sum())
print("NaN in test_labels:", np.isnan(test_labels).sum())



# Remove NaN labels
valid_indices = ~np.isnan(train_labels)
train_labels = train_labels[valid_indices]
train_padded = train_padded[valid_indices]

Unique values in train_labels: [-1  0  1]
Unique values in test_labels: [-1  0  1]
NaN in train_labels: 0
NaN in test_labels: 0


In [39]:
train_labels = np.where(train_labels == -1, 0, train_labels)
test_labels = np.where(test_labels == -1, 0, test_labels)

In [40]:
print("Unique values in train_labels after cleaning:", np.unique(train_labels))
print("Unique values in test_labels after cleaning:", np.unique(test_labels))

Unique values in train_labels after cleaning: [0 1]
Unique values in test_labels after cleaning: [0 1]


In [41]:
train_padded = np.array(train_padded, dtype=np.float32)
test_padded = np.array(test_padded, dtype=np.float32)

In [42]:
from tensorflow.keras.optimizers import Adam

rnn_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

In [43]:
rnn_model.fit(train_padded, train_labels, epochs=5, batch_size=32, validation_data=(test_padded, test_labels))


Epoch 1/5
[1m932/932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 38ms/step - accuracy: 0.6059 - loss: 0.6548 - val_accuracy: 0.6372 - val_loss: 0.6282
Epoch 2/5
[1m932/932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 39ms/step - accuracy: 0.7709 - loss: 0.5005 - val_accuracy: 0.7101 - val_loss: 0.5592
Epoch 3/5
[1m932/932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 39ms/step - accuracy: 0.8474 - loss: 0.3824 - val_accuracy: 0.7538 - val_loss: 0.5248
Epoch 4/5
[1m932/932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 39ms/step - accuracy: 0.8878 - loss: 0.3036 - val_accuracy: 0.7667 - val_loss: 0.5422
Epoch 5/5
[1m932/932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 39ms/step - accuracy: 0.9098 - loss: 0.2598 - val_accuracy: 0.7709 - val_loss: 0.5578


<keras.src.callbacks.history.History at 0x19e6b0530d0>