In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/fake_reviews_dataset.csv')
df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


# **Text Preprocessing**

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
try:
    # Explicitly download 'punkt_tab' as indicated by the error message
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    processed_tokens = [
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    ]
    return ' '.join(processed_tokens)

df['preprocessed_review_text'] = df['text_'].apply(preprocess_text)

print(df[['text_', 'preprocessed_review_text']].head())

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                               text_  \
0  Love this!  Well made, sturdy, and very comfor...   
1  love it, a great upgrade from the original.  I...   
2  This pillow saved my back. I love the look and...   
3  Missing information on how to use it, but it i...   
4  Very nice set. Good quality. We have had the s...   

                            preprocessed_review_text  
0  love well made sturdy comfortable love itvery ...  
1   love great upgrade original ive mine couple year  
2            pillow saved back love look feel pillow  
3        missing information use great product price  
4                nice set good quality set two month  


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

X_text_features = tfidf_vectorizer.fit_transform(df['preprocessed_review_text'])

print(X_text_features.shape)

print(X_text_features[:5].toarray())

(40432, 46248)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [9]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# **Task 1**

In [10]:
X = X_text_features
y = df['category']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X,y_encoded, test_size=0.2, random_state=42)



In [11]:
class CustomDataset(Dataset):

  def __init__(self, features, labels):

    self.features = torch.tensor(features.toarray(), dtype=torch.float32).to(device)
    self.labels = torch.tensor(labels, dtype=torch.long).to(device)

  def __len__(self):

    return len(self.features)

  def __getitem__(self, index):

    return self.features[index], self.labels[index]

In [None]:
train_dataset = CustomDataset(X_train, y_train)

In [None]:
test_dataset = CustomDataset(X_test, y_test)

In [None]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
class MyNN(nn.Module):

  def __init__(self, num_features):

    super().__init__()
    self.model = nn.Sequential(
        nn.Linear(num_features, 128),
        nn.ReLU(),
        nn.Linear(128, 64),
        nn.ReLU(),
        nn.Linear(64, 10),
        nn.Softmax(dim=1)
    )

  def forward(self, x):

    return self.model(x)

In [16]:
epochs = 100
learning_rate = 0.1

In [None]:
model = MyNN(X_train.shape[1]).to(device)

criterion = nn.CrossEntropyLoss()

optimizer = optim.SGD(model.parameters(), lr= learning_rate)

In [None]:
for epoch in range(epochs):

  total_epoch_loss = 0

  for batch_features, batch_labels in train_loader:

    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    outputs = model(batch_features)

    loss = criterion(outputs, batch_labels)

    optimizer.zero_grad()
    loss.backward()

    optimizer.step()

    total_epoch_loss = total_epoch_loss + loss.item()

  avg_loss = total_epoch_loss/len(train_loader)
  print(f'Epoch: {epoch + 1} , Loss: {avg_loss}')

Epoch: 1 , Loss: 2.3025715487747798
Epoch: 2 , Loss: 2.302226287574165
Epoch: 3 , Loss: 2.3018881011857344
Epoch: 4 , Loss: 2.301573870681491
Epoch: 5 , Loss: 2.3012653985042344
Epoch: 6 , Loss: 2.3009319140505884
Epoch: 7 , Loss: 2.3006255640813955
Epoch: 8 , Loss: 2.3002415966139482
Epoch: 9 , Loss: 2.2997815806875117
Epoch: 10 , Loss: 2.299134894793213
Epoch: 11 , Loss: 2.2979805639138804
Epoch: 12 , Loss: 2.295435478564779
Epoch: 13 , Loss: 2.288070178785814
Epoch: 14 , Loss: 2.266399923049414
Epoch: 15 , Loss: 2.240172380986421
Epoch: 16 , Loss: 2.22537504919904
Epoch: 17 , Loss: 2.216307942103963
Epoch: 18 , Loss: 2.206928984449786
Epoch: 19 , Loss: 2.1926834244030737
Epoch: 20 , Loss: 2.1752776977101806
Epoch: 21 , Loss: 2.161098935858534
Epoch: 22 , Loss: 2.151698087515096
Epoch: 23 , Loss: 2.144968232147307
Epoch: 24 , Loss: 2.1394035448669917
Epoch: 25 , Loss: 2.132845640653678
Epoch: 26 , Loss: 2.1233136055497783
Epoch: 27 , Loss: 2.110540406977235
Epoch: 28 , Loss: 2.097799

In [None]:
model.eval()

MyNN(
  (model): Sequential(
    (0): Linear(in_features=46248, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=10, bias=True)
    (5): Softmax(dim=1)
  )
)

In [None]:
total = 0
correct = 0

with torch.no_grad():

  for batch_features, batch_labels in test_loader:

    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    outputs = model(batch_features)

    _, predicted = torch.max(outputs, 1)

    total = total + batch_labels.shape[0]

    correct = correct + (predicted == batch_labels).sum().item()

print('Accuracy:')
print(correct/total)

Accuracy:
0.5958946457277111


# **Task 2**

In [12]:
X = X_text_features
y = df['label']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X,y_encoded, test_size=0.2, random_state=42)

In [19]:
class MyNN_Binary(nn.Module):

  def __init__(self, num_features):

    super().__init__()
    self.model = nn.Sequential(
        nn.Linear(num_features, 128),
        nn.ReLU(),
        nn.Linear(128, 64),
        nn.ReLU(),
        nn.Linear(64, 1),
        nn.Sigmoid()
    )

  def forward(self, x):

    return self.model(x)

In [20]:
class CustomDataset(Dataset):

  def __init__(self, features, labels):

    self.features = torch.tensor(features.toarray(), dtype=torch.float32).to(device)
    # For binary classification, labels need to be float32 and reshaped to (batch_size, 1)
    self.labels = torch.tensor(labels, dtype=torch.float32).to(device).unsqueeze(1)

  def __len__(self):

    return len(self.features)

  def __getitem__(self, index):

    return self.features[index], self.labels[index]

In [21]:
model = MyNN_Binary(X_train.shape[1]).to(device)

criterion = nn.BCELoss()

optimizer = optim.SGD(model.parameters(), lr= learning_rate)

In [23]:
train_dataset_1 = CustomDataset(X_train_1, y_train_1)
test_dataset_1 = CustomDataset(X_test_1, y_test_1)

train_loader_1 = DataLoader(train_dataset_1, batch_size=64, shuffle=True)
test_loader_1 = DataLoader(test_dataset_1, batch_size=64, shuffle=False)

In [24]:
for epoch in range(epochs):

  total_epoch_loss = 0

  for batch_features, batch_labels in train_loader_1:

    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    outputs = model(batch_features)

    loss = criterion(outputs, batch_labels)

    optimizer.zero_grad()
    loss.backward()

    optimizer.step()

    total_epoch_loss = total_epoch_loss + loss.item()

  avg_loss = total_epoch_loss/len(train_loader_1)
  print(f'Epoch: {epoch + 1} , Loss: {avg_loss}')

Epoch: 1 , Loss: 0.6927388910012754
Epoch: 2 , Loss: 0.6893710199552091
Epoch: 3 , Loss: 0.6358637653085083
Epoch: 4 , Loss: 0.4404823963232191
Epoch: 5 , Loss: 0.37874047393384186
Epoch: 6 , Loss: 0.3410692724726888
Epoch: 7 , Loss: 0.3138420886614106
Epoch: 8 , Loss: 0.29624782679344824
Epoch: 9 , Loss: 0.2863330152990083
Epoch: 10 , Loss: 0.2562886918692485
Epoch: 11 , Loss: 0.2621472831741859
Epoch: 12 , Loss: 0.2469413351606239
Epoch: 13 , Loss: 0.23394475158730985
Epoch: 14 , Loss: 0.23806942567936046
Epoch: 15 , Loss: 0.21735098782794277
Epoch: 16 , Loss: 0.21634111281027907
Epoch: 17 , Loss: 0.21185265581598395
Epoch: 18 , Loss: 0.20022966982170998
Epoch: 19 , Loss: 0.2057601083806263
Epoch: 20 , Loss: 0.187561327451656
Epoch: 21 , Loss: 0.1750423485199689
Epoch: 22 , Loss: 0.17362628411869757
Epoch: 23 , Loss: 0.18738636455458144
Epoch: 24 , Loss: 0.17329097118066705
Epoch: 25 , Loss: 0.17522425875583067
Epoch: 26 , Loss: 0.18236909122776843
Epoch: 27 , Loss: 0.155504229309206

In [26]:
model.eval()

total = 0
correct = 0

with torch.no_grad():

  for batch_features, batch_labels in test_loader_1:

    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    outputs = model(batch_features)

    predicted = (outputs > 0.5).float() # Threshold at 0.5 for binary classification

    total = total + batch_labels.shape[0]

    correct = correct + (predicted == batch_labels).sum().item()

print('Accuracy:')
print(correct/total)

Accuracy:
0.8889575862495362
