In [10]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.utils import pad_sequences #Use keras.utils.pad_sequences instead of keras.preprocessing.sequence
from tensorflow.keras.preprocessing.text import Tokenizer #Import Tokenizer from tensorflow.keras.preprocessing.text
import nltk

# Step 1: Load Data
data = pd.read_csv('cleaned_youtube_comments.csv')  # Replace with your actual file path


NameError: name 'nltk' is not defined

In [11]:
data.isnull().sum()

Comments    40
dtype: int64

In [12]:
# Check for null values before removing
print(f"Null values before removing: {data['Comments'].isnull().sum()}")

# Remove rows with null values in the 'Comments' column
data.dropna(subset=['Comments'], inplace=True)

# Check for null values after removing
print(f"Null values after removing: {data['Comments'].isnull().sum()}")

# Save the data without null values back to CSV
#data.to_csv('/content/sample_data/cleaned_youtube_comments_without_nulls.csv', index=False)

#print("Data without null values has been saved to 'cleaned_youtube_comments_without_nulls.csv'.")


Null values before removing: 40
Null values after removing: 0


In [None]:
# Check existing columns
print("Original Columns:", data.columns)

Original Columns: Index(['https://bit.ly/30JSSPr Geo is the hub for all your Pakistani entertainment needs! Hit the bell icon and subscribe to become a part of our growing community, now hitting the USA'], dtype='object')


In [None]:
# Check current column name (to verify)
print("Original column names:", data.columns)

# Rename the column (replace the long name with 'Comments')
data.rename(columns={'https://bit.ly/30JSSPr Geo is the hub for all your Pakistani entertainment needs! Hit the bell icon and subscribe to become a part of our growing community, now hitting the USA': 'Comments'}, inplace=True)

# Print the updated DataFrame to confirm the column name change
print("Updated column names:", data.columns)
print(data.head())

Original column names: Index(['https://bit.ly/30JSSPr Geo is the hub for all your Pakistani entertainment needs! Hit the bell icon and subscribe to become a part of our growing community, now hitting the USA'], dtype='object')
Updated column names: Index(['Comments'], dtype='object')
                                            Comments
0  People slit their wrists when stupidly crazy i...
1  Wahaj ali and yumna jaldi ki cute jodi  hain❤❤...
2  Fucking drama,\nShe Don't know muntasir loves ...
3  Jis aurat mein inkar ki jarurat nhi hoti vo au...
4                         Kon kon dubara dekh rha h😂


In [13]:
# Remove the specific URL and message from the 'Comments' column
data['Comments'] = data['Comments'].str.replace(r'https://bit.ly/30JSSPr Geo is the hub for all your Pakistani entertainment needs! Hit the bell icon and subscribe to become a part of our growing community, now hitting the USA', '', regex=True)

# Print the cleaned data to confirm
print(data.head())

                                            Comments
0  People slit their wrists when stupidly crazy i...
1  Wahaj ali and yumna jaldi ki cute jodi  hain❤❤...
2  Fucking drama,\nShe Don't know muntasir loves ...
3  Jis aurat mein inkar ki jarurat nhi hoti vo au...
4                         Kon kon dubara dekh rha h😂


In [None]:
# Save the cleaned data back to a CSV file
data.to_csv('/content/sample_data/cleaned_youtube_comments.csv', index=False)

print("Cleaned data has been saved to 'cleaned_youtube_comments.csv'.")


Cleaned data has been saved to 'cleaned_youtube_comments.csv'.


In [14]:
data

Unnamed: 0,Comments
0,People slit their wrists when stupidly crazy i...
1,Wahaj ali and yumna jaldi ki cute jodi hain❤❤...
2,"Fucking drama,\nShe Don't know muntasir loves ..."
3,Jis aurat mein inkar ki jarurat nhi hoti vo au...
4,Kon kon dubara dekh rha h😂
...,...
154249,Lo ji Zafar saab... Am waiting from India 😊
154250,❤️❤️
154251,Waiting waiting waiting 😃
154252,yeah kre ga dhamaka inshaAllah trending hai. y...


In [15]:
data['Comments'] = data['Comments'].astype(str)  # Ensure comments are strings

In [16]:
data1 = pd.DataFrame (data, columns = ['Comment'])

In [17]:
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Aleena\AppData\Roaming\nltk_data...


True

In [18]:
# Step 2: Text Preprocessing
def text_processing(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'@\w+|#\w+|http\S+', '', text)  # Remove mentions, hashtags, and URLs
    text = re.sub('[%s]' % re.escape(string.punctuation), "", text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text

data["Processed_Comment"] = data["Comments"].apply(text_processing)

In [19]:
# Step 3: Check Sentiment Counts
sentiments = SentimentIntensityAnalyzer()
data['Sentiment_Score'] = data['Processed_Comment'].apply(lambda x: sentiments.polarity_scores(x)['compound'])

In [20]:
# Categorizing Sentiments
data['Sentiment'] = np.where(data['Sentiment_Score'] >= 0.05, 'Positive',
                             np.where(data['Sentiment_Score'] <= -0.05, 'Negative', 'Neutral'))

# Display sentiment counts
sentiment_counts = data['Sentiment'].value_counts()
print("Sentiment Counts:")
print(sentiment_counts)

Sentiment Counts:
Sentiment
Neutral     81979
Positive    63979
Negative     8256
Name: count, dtype: int64


In [21]:
# Step 4: Sampling to Balance Data
df_positive = data[data['Sentiment'] == 'Positive']
df_negative = data[data['Sentiment'] == 'Negative']
df_neutral = data[data['Sentiment'] == 'Neutral']

# Upsample minority classes
df_negative_upsampled = resample(df_negative, replace=True, n_samples=len(df_positive), random_state=42)
df_neutral_upsampled = resample(df_neutral, replace=True, n_samples=len(df_positive), random_state=42)

# Combine majority class with upsampled minority classes
balanced_data = pd.concat([df_positive, df_negative_upsampled, df_neutral_upsampled])

# Display balanced sentiment counts
balanced_sentiment_counts = balanced_data['Sentiment'].value_counts()
print("Balanced Sentiment Counts:")
print(balanced_sentiment_counts)

Balanced Sentiment Counts:
Sentiment
Positive    63979
Negative    63979
Neutral     63979
Name: count, dtype: int64


In [22]:
# Step 5: Tokenization and Padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(balanced_data["Processed_Comment"])
X = tokenizer.texts_to_sequences(balanced_data["Processed_Comment"])

max_len = 100  # Maximum length of sequences
X_pad = pad_sequences(X, maxlen=max_len)
y = balanced_data['Sentiment'].map({'Positive': 2, 'Negative': 0, 'Neutral': 1}).values  # Mapping labels to integers

In [23]:
# Step 6: Split Data
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.3, random_state=42)

In [24]:
# Step 7: Build the LSTM Model
# Define your model
model = Sequential()

# Updated Embedding layer without `input_length`
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128))  # Removed `input_length`

# Other layers
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(3, activation='softmax'))
  # Adjust based on the number of sentiment classes





In [25]:

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])





In [26]:

# Step 8: Train the Model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# Step 9: Evaluate the Model
y_pred = np.argmax(model.predict(X_test), axis=-1)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     19139
           1       0.99      0.99      0.99     19393
           2       0.99      0.99      0.99     19050

    accuracy                           0.99     57582
   macro avg       0.99      0.99      0.99     57582
weighted avg       0.99      0.99      0.99     57582



In [27]:


# Optional: Save the Model
model.save('sentimentk.keras')


In [28]:

# Optional: Save the Model
model.save('sentimentk.h5')

  saving_api.save_model(


In [29]:
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the model
model = load_model('sentiment_model.keras')

# Assuming you have a tokenizer saved, load it (you may need to implement this part)
# tokenizer = load_tokenizer('path_to_your_tokenizer.pkl')

# Sample new comments for prediction
new_comments = [
    "I love this product! It's fantastic!",
    "This is the worst experience I've ever had.",
    "It's okay, neither good nor bad."
]

# Preprocess the comments
def text_processing(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'@\w+|#\w+|http\S+', '', text)  # Remove mentions, hashtags, and URLs
    text = re.sub('[%s]' % re.escape(string.punctuation), "", text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text

# Apply preprocessing
processed_comments = [text_processing(comment) for comment in new_comments]

# Tokenize and pad sequences
sequences = tokenizer.texts_to_sequences(processed_comments)
padded_sequences = pad_sequences(sequences, maxlen=100)  # Adjust maxlen as needed

# Make predictions
predictions = model.predict(padded_sequences)
predicted_classes = np.argmax(predictions, axis=1)

# Map class indices to labels
sentiment_labels = []
for pred in predicted_classes:
    if pred == 0:
        sentiment_labels.append('Negative')
    elif pred == 1:
        sentiment_labels.append('Neutral')
    else:
        sentiment_labels.append('Positive')

# Print results
for comment, label in zip(new_comments, sentiment_labels):
    print(f"Comment: {comment} - Sentiment: {label}")


TypeError: Error when deserializing class 'InputLayer' using config={'batch_shape': [None, 100], 'dtype': 'float32', 'sparse': False, 'name': 'input_layer'}.

Exception encountered: Unrecognized keyword arguments: ['batch_shape']

In [32]:
import numpy as np
import re
import string
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the model
model = load_model('sentimentk.keras')

# Load your tokenizer if you saved it
# You can use joblib or pickle to load your tokenizer, for example:
# import joblib
# tokenizer = joblib.load('path_to_your_tokenizer.pkl')

# Sample new comments for prediction
new_comments = [
    "I love this product! It's fantastic!",
    "This is the worst experience I've ever had.",
    "what about that."
]

# Preprocess the comments
def text_processing(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'@\w+|#\w+|http\S+', '', text)  # Remove mentions, hashtags, and URLs
    text = re.sub('[%s]' % re.escape(string.punctuation), "", text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text

# Apply preprocessing
processed_comments = [text_processing(comment) for comment in new_comments]

# Tokenize and pad sequences
sequences = tokenizer.texts_to_sequences(processed_comments)
padded_sequences = pad_sequences(sequences, maxlen=100)  # Adjust maxlen as needed

# Make predictions
predictions = model.predict(padded_sequences)
predicted_classes = np.argmax(predictions, axis=1)

# Map class indices to labels
sentiment_labels = []
for pred in predicted_classes:
    if pred == 0:
        sentiment_labels.append('Negative')
    elif pred == 1:
        sentiment_labels.append('Neutral')
    else:
        sentiment_labels.append('Positive')

# Print results
for comment, label in zip(new_comments, sentiment_labels):
    print(f"Comment: {comment} - Sentiment: {label}")


Comment: I love this product! It's fantastic! - Sentiment: Positive
Comment: This is the worst experience I've ever had. - Sentiment: Negative
Comment: what about that. - Sentiment: Neutral


In [33]:
import pickle
# Saving the tokenizer used in training
with open('tokenizerk.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
