In [1]:
import tensorflow_datasets as tfds
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
# Step 1: Fetch the IMDb reviews corpus (as supervised pairs) together with its metadata
imdb_dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)
# Pull the two splits used below out of the returned mapping
train_data = imdb_dataset['train']
test_data = imdb_dataset['test']

def get_texts_and_labels(dataset):
    """Materialise a dataset of (text, label) pairs into Python/NumPy containers.

    Args:
        dataset: Object accepted by ``tfds.as_numpy`` whose elements unpack
            into a UTF-8 encoded bytes text and its label.

    Returns:
        A ``(texts, labels)`` tuple where ``texts`` is a list of decoded
        ``str`` objects and ``labels`` is a NumPy array holding the labels
        in the same order.
    """
    pairs = list(tfds.as_numpy(dataset))
    texts = [str(raw, 'utf-8') for raw, _ in pairs]
    labels = np.array([target for _, target in pairs])
    return texts, labels

# Step 2: Turn both splits into in-memory texts and label arrays
# (the training split is processed first, then the test split)
(X_train, y_train), (X_test, y_test) = (
    get_texts_and_labels(split) for split in (train_data, test_data)
)

2024-02-09 19:17:31.241651: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-09 19:17:31.573221: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-09 19:17:31.573302: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-09 19:17:31.638608: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-09 19:17:31.774591: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-09 19:17:31.776084: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [3]:
# Step 3: Build TF-IDF features
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=5,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [4]:
# Step 4: Train the LogisticRegression model
# Fitted on the TF-IDF features of the training split.
# NOTE: the name `model` is rebound to a Keras Sequential network in a later
# cell of this notebook, so this classifier is only reachable until then.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

In [5]:
# Step 5: Score the fitted classifier on the held-out test features
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(
    y_true=y_test,
    y_pred=predictions,
)
print(f'Accuracy: {accuracy}')

Accuracy: 0.87964


In [6]:
import xgboost as xgb


# Gradient-boosted tree baseline trained on the same TF-IDF features as the
# logistic regression above, so the two accuracies are directly comparable.
# `use_label_encoder` is no longer passed: it is a deprecated, ignored
# parameter in current XGBoost releases and only produces a warning there.
xgb_clf = xgb.XGBClassifier(eval_metric='logloss')
xgb_clf.fit(X_train_tfidf, y_train)

# Evaluate on the held-out test split
xgb_predictions = xgb_clf.predict(X_test_tfidf)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print(f'XGBoost Accuracy: {xgb_accuracy}')

XGBoost Accuracy: 0.84976


In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the word index from the training texts only
tokenizer = Tokenizer(num_words=10000)  # Keep only the top 10,000 most frequently occurring words
tokenizer.fit_on_texts(X_train)

# Encode both splits (train first, then test) as sequences of integers
X_train_seq, X_test_seq = map(tokenizer.texts_to_sequences, (X_train, X_test))

In [4]:
# Bring every encoded review to one common length so they can be batched
maxlen = 100  # Maximum length of sequences
X_train_padded, X_test_padded = (
    pad_sequences(encoded, maxlen=maxlen)
    for encoded in (X_train_seq, X_test_seq)
)

In [5]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Embedding -> LSTM -> single sigmoid unit, for binary classification.
# NOTE: this rebinds `model`, which an earlier cell used for the
# LogisticRegression classifier.
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=maxlen),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1329473 (5.07 MB)
Trainable params: 1329473 (5.07 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [6]:
# Train the network on the padded training sequences; `validation_split`
# sets aside part of that data for validation instead of training.
history = model.fit(
    X_train_padded,
    y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [7]:
# Evaluate the trained network on the padded test sequences.
# NOTE: `accuracy` is rebound here; an earlier cell used the same name for
# the logistic-regression score.
loss, accuracy = model.evaluate(x=X_test_padded, y=y_test)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.8352000117301941


In [9]:
# Persist the trained network to 'my_model.keras' (path is relative to the
# kernel's working directory).
model.save('my_model.keras')

In [10]:
import pickle

# Persist the fitted tokenizer next to the saved model so that new text can
# later be encoded with the same word index.
# NOTE: pickle files should only ever be loaded back from a trusted source.
with open('tokenizer.pickle', 'wb') as pickle_file:
    pickle.dump(tokenizer, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)


In [11]:
from keras.preprocessing.sequence import pad_sequences


def preprocess_text(text, tokenizer, maxlen=100):
    """Encode a single piece of text the same way the training data was encoded.

    Args:
        text: The raw text to encode.
        tokenizer: Object exposing ``texts_to_sequences``; it is called with
            a one-element list containing ``text``.
        maxlen: Target sequence length forwarded to ``pad_sequences``.

    Returns:
        Whatever ``pad_sequences`` returns for the one encoded sequence.
    """
    # Wrap the text in a list: the tokenizer works on batches of texts.
    return pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=maxlen)

In [12]:
# Smoke test: run one hand-written sentence through the same preprocessing
# and the trained network, then show the raw model output.
# NOTE(review): the probe sentence is abusive; consider swapping it for a
# neutral negative review (the recorded output below would need re-running).
text = "Ugly cunts need to die!"
processed_text = preprocess_text(text, tokenizer)
prediction = model.predict(processed_text)
print("Prediction:", prediction)


Prediction: [[0.00449693]]


In [13]:
# Smoke test with a second hand-written sentence, same pipeline as above.
# NOTE(review): "Beatiful" is misspelt, so it is presumably not in the
# tokenizer's vocabulary and may not contribute to the score — confirm, and
# re-run the cell if the spelling is corrected.
text = "Beatiful and Terrific!"
processed_text = preprocess_text(text, tokenizer)
prediction = model.predict(processed_text)
print("Prediction:", prediction)


Prediction: [[0.8655527]]
