In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
# For data manipulation and analysis using DataFrames
import pandas as pd
# For splitting data into training and testing sets
from sklearn.model_selection import train_test_split
# For converting text data into numerical features (Bag of Words)
from sklearn.feature_extraction.text import CountVectorizer
# For Naive Bayes classifier, a common choice for text classification
from sklearn.naive_bayes import MultinomialNB
# For various metrics used to evaluate the model's performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# For converting text data into TF-IDF (Term Frequency-Inverse Document Frequency) features
from sklearn.feature_extraction.text import TfidfVectorizer
# For Support Vector Machine classifier, another option for text classification
from sklearn.svm import SVC
# For data visualization
import matplotlib.pyplot as plt
# Enhanced data visualization based on matplotlib
import seaborn as sns
# For creating a sequential neural network model
from keras.models import Sequential
# For adding layers to the neural network
from keras.layers import Dense
# For regular expressions, used in text preprocessing
import re
# For working with PyTorch, a deep learning framework
import torch
# For loading and managing data for PyTorch models
from torch.utils.data import DataLoader, TensorDataset
# For using pre-trained BERT model for sequence classification
from transformers import BertTokenizer, BertForSequenceClassification
# Natural Language Toolkit for text processing
import nltk
# List of common stopwords
from nltk.corpus import stopwords
# For lemmatizing words
from nltk.stem import WordNetLemmatizer
# For tokenizing text into words
from nltk.tokenize import word_tokenize


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Downloading the NLTK tokenizer models, necessary for word tokenization.
nltk.download('punkt')

# Downloading the list of common stopwords from NLTK library.
nltk.download('stopwords')

# Downloading the WordNet lexical database, which is used for lemmatization.
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khans\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khans\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Read the TXT file
with open("train.ft.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Split lines into labels and texts
data = [line.strip().split(" ", 1) for line in lines]

# Separate labels and texts, handling the "__label__" prefix
labels = [line[0].replace("__label__", "") for line in data]
texts = [line[1] for line in data]

# Create a DataFrame
df_train = pd.DataFrame({'label': labels, 'text': texts})

# Save the DataFrame as CSV
df_train.to_csv("train_dataset.csv", index=False)

In [5]:

# Read the TXT file
with open("test.ft.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Split lines into labels and texts
data = [line.strip().split(" ", 1) for line in lines]

# Separate labels and texts, handling the "__label__" prefix
labels = [line[0].replace("__label__", "") for line in data]
texts = [line[1] for line in data]

# Create a DataFrame
df_test = pd.DataFrame({'label': labels, 'text': texts})

# Save the DataFrame as CSV
df_test.to_csv("test_dataset.csv", index=False)

In [6]:
df_train

Unnamed: 0,label,text
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."
...,...,...
3599995,1,Don't do it!!: The high chair looks great when...
3599996,1,"Looks nice, low functionality: I have used thi..."
3599997,1,"compact, but hard to clean: We have a small ho..."
3599998,1,what is it saying?: not sure what this book is...


In [7]:
df_test

Unnamed: 0,label,text
0,2,Great CD: My lovely Pat has one of the GREAT v...
1,2,One of the best game music soundtracks - for a...
2,1,Batteries died within a year ...: I bought thi...
3,2,"works fine, but Maha Energy is better: Check o..."
4,2,Great for the non-audiophile: Reviewed quite a...
...,...,...
399995,1,Unbelievable- In a Bad Way: We bought this Tho...
399996,1,"Almost Great, Until it Broke...: My son reciev..."
399997,1,Disappointed !!!: I bought this toy for my son...
399998,2,Classic Jessica Mitford: This is a compilation...


In [8]:
# Load the training dataset
train_df = pd.read_csv("train_dataset.csv")

# Load the testing dataset
test_df = pd.read_csv("test_dataset.csv")

# Use around 20000 samples from the training and testing datasets
train_samples = 20000
test_samples = 20000

# Limit the number of samples
train_df = train_df.sample(train_samples, random_state=42,replace='True')
test_df = test_df.sample(test_samples, random_state=42,replace='True')

# Preprocessing: Clean and tokenize text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = text.split()
    return ' '.join(tokens)

train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)
test_df['cleaned_text'] = test_df['text'].apply(preprocess_text)

# Preprocessing: TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['cleaned_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['cleaned_text'])

# Convert labels to integers
y_train = train_df['label'].astype(int)
y_test = test_df['label'].astype(int)

# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

# Predict
y_pred = classifier.predict(X_test_tfidf)

# Calculate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)


Classification Report:
               precision    recall  f1-score   support

           1       0.84      0.85      0.84      9962
           2       0.85      0.84      0.84     10038

    accuracy                           0.84     20000
   macro avg       0.84      0.84      0.84     20000
weighted avg       0.84      0.84      0.84     20000



In [9]:
import pandas as pd
import re
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Load the data
train_df = pd.read_csv("train_dataset.csv")
test_df = pd.read_csv("test_dataset.csv")

# Advanced preprocessing: Tokenization and padding
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['text'])
X_train_tokens = tokenizer.texts_to_sequences(train_df['text'])
X_test_tokens = tokenizer.texts_to_sequences(test_df['text'])

max_sequence_length = max(max(len(sequence) for sequence in X_train_tokens),
                          max(len(sequence) for sequence in X_test_tokens))
X_train_padded = pad_sequences(X_train_tokens, maxlen=max_sequence_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_tokens, maxlen=max_sequence_length, padding='post', truncating='post')

# Convert labels to numpy arrays
y_train = np.array(train_df['label'])
y_test = np.array(test_df['label'])

# Build the ANN model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_sequence_length))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, y_train, epochs=5, batch_size=64, validation_split=0.2)

# Evaluate the model
y_pred_prob = model.predict(X_test_padded)
y_pred = np.round(y_pred_prob).astype(int)

# Calculate classification report and confusion matrix
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Classification Report:\n", class_report)
print("Confusion Matrix:\n", conf_matrix)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

           1       0.50      1.00      0.67    200000
           2       0.00      0.00      0.00    200000

    accuracy                           0.50    400000
   macro avg       0.25      0.50      0.33    400000
weighted avg       0.25      0.50      0.33    400000

Confusion Matrix:
 [[200000      0]
 [200000      0]]


In [10]:
# Load the training dataset
train_df = pd.read_csv("train_dataset.csv")

# Load the testing dataset
test_df = pd.read_csv("test_dataset.csv")

# Use around 20000 samples from the training and testing datasets
train_samples = 20000
test_samples = 20000

# Limit the number of samples
train_df = train_df.sample(train_samples, random_state=42,replace='True')
test_df = test_df.sample(test_samples, random_state=42,replace='True')

# Preprocessing: Clean text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)
test_df['cleaned_text'] = test_df['text'].apply(preprocess_text)

# Preprocessing: Tokenization, stopwords removal, and lemmatization
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text_advanced(text):
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

train_df['processed_text'] = train_df['cleaned_text'].apply(preprocess_text_advanced)
test_df['processed_text'] = test_df['cleaned_text'].apply(preprocess_text_advanced)

# Preprocessing: TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['processed_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['processed_text'])

# Convert labels to integers
y_train = train_df['label'].astype(int)
y_test = test_df['label'].astype(int)

# Train an SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)

# Predict
y_pred = svm_classifier.predict(X_test_tfidf)

# Calculate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           1       0.86      0.86      0.86      9962
           2       0.86      0.86      0.86     10038

    accuracy                           0.86     20000
   macro avg       0.86      0.86      0.86     20000
weighted avg       0.86      0.86      0.86     20000

