In [1]:
!pip install tensorflow




[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [3]:
import tensorflow as tf
import numpy as np
from tensorflow import keras

In [4]:
import pandas as pd

# Define column names
column_names = ['no.', 'name', 'result', 'review']

# Load the CSV file with column names
df = pd.read_csv('twitter_training.csv', names=column_names)

# Display the DataFrame
df.head()

Unnamed: 0,no.,name,result,review
0,N0,Name,Result,Review
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [13]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to C:\Users\Kevin Shibu
[nltk_data]     John\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Kevin Shibu
[nltk_data]     John\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Kevin Shibu
[nltk_data]     John\AppData\Roaming\nltk_data...


True

In [16]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Step 1: Load the tweet data
df = pd.read_csv('twitter_training.csv', names=column_names)  # Assuming you have a CSV file containing the tweets

# Step 2: Preprocess the tweet text
import re

def preprocess_text(text):
    # Check if the input is a valid string
    if not isinstance(text, str):
        return ""

    # Remove mentions and hashtags
    text = re.sub(r"@\w+|\#\w+", "", text)
 
    # Remove non-alphanumeric characters and convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text).lower()

    return text.strip()


    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and perform lemmatization
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Join the tokens back into a single string
    text = " ".join(tokens)

    return text

# Apply the preprocessing function to the tweet text column
df['preprocessed_text'] = df['review'].apply(preprocess_text)

# Step 3: Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed_text'], df['result'], test_size=0.2, random_state=42)

# Step 4: Tokenize and pad the sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

max_length = 100  # Maximum sequence length
X_train = pad_sequences(X_train, maxlen=max_length)
X_test = pad_sequences(X_test, maxlen=max_length)

# Step 5: Convert labels to one-hot encoding
labels = df['result'].unique()
num_classes = len(labels)

y_train = pd.Categorical(y_train, categories=labels)
y_train = pd.get_dummies(y_train)
y_test = pd.Categorical(y_test, categories=labels)
y_test = pd.get_dummies(y_test)

# Step 6: Build the CNN model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 100, input_length=max_length))
model.add(tf.keras.layers.Conv1D(64, 5, activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=4))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 7: Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32)

# Step 8: Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

# Step 9: Make predictions on new data
new_reviews = ["I love this product!", "This movie is terrible."]
new_reviews = [preprocess_text(review) for review in new_reviews]
sequences_new = tokenizer.texts_to_sequences(new_reviews)
X_new = pad_sequences(sequences_new, maxlen=max_length)
predictions = model.predict(X_new)

for review, prediction in zip(new_reviews, predictions):
    sentiment = labels[prediction.argmax()]
    print(f"Review: {review}")
    print(f"Sentiment: {sentiment}")
    
    print()


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss: 0.6813551783561707
Accuracy: 0.8035750389099121
Review: i love this product
Sentiment: Positive

Review: this movie is terrible
Sentiment: Negative

