# DATA EXTRACTION AND LOADING

This notebook implements a Convolutional Neural Network (CNN) for sentence classification using the IMDb dataset. The model uses Word2Vec embeddings and PyTorch for training.


## Data Loading and Preprocessing

In this section, we load the IMDb dataset and perform text preprocessing, including tokenization, stopword removal, and padding.

In [40]:
# Data manipulation and processing
import numpy as np
import pandas as pd

# PyTorch for building the CNN
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.functional as F

# NLP libraries for text processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Embeddings (optional, for loading pre-trained ones like GloVe)
import gensim

# Plotting and visualization
import matplotlib.pyplot as plt

# Kaggle API for dataset
import kaggle

# Splitting the data
from sklearn.model_selection import train_test_split


# Jupyter notebook-specific libraries
%matplotlib inline

In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [42]:
# I will be using the IMDB movies review dataset from kaggle
df = pd.read_csv('imdb-data/IMDB Dataset.csv')
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [43]:
# Here I am going to download NLTK resources 

# The punkt will vectorize words, encapsulating meaning within the sentence.
nltk.download('punkt')
nltk.download('punkt_tab')
# The stopwords will be used to vectorize things such as 'the' or 'and' which don't really contribute to the meaning directly.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kalindadhikari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/kalindadhikari/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kalindadhikari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
# Assigning the stopwords in english
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Makes all the text lower, so that captial letters don't have a different meaning from lowercase.
    text = text.lower()
    # Tokenizes the text into a vector.
    tokens = word_tokenize(text)
    # Removes the stop words from the tokens, as we said earlier, it does not effect the sentence
    # Ex: 'I love this movie', is the same as 'love movie' for our classification task. 
    tokens = [word for word in tokens if word.isalpha() not in stop_words]
    return tokens

In [45]:
df['tokens'] = df['review'].apply(preprocess_text)

In [46]:
from collections import defaultdict

# The CNN cannot directly work with textual data, so we need to assign an index to each word.

# Here we have a vocab dictionary that will automatically do this for us.
vocab = defaultdict(lambda: len(vocab))
vocab['<PAD>'] = 0     # padding used to pad shorters sentences to match longer ones. ('loved movie <pad>') ('loved movie greatly').
vocab['<UNK>'] = 1     # unknown used to map words that may be unknown to our vocabulary either during testing or training.

# Loading the tokens (each word) into the vocab
for tokens in df['tokens']:
    for token in tokens:
        vocab[token]

In [47]:
def tokens_to_indicies(tokens, vocab):
    new_list = []
    for token in tokens:
        new_list.append(vocab.get(token, vocab['<UNK>']))

    return new_list

df['indices'] = df['tokens'].apply(lambda x: tokens_to_indicies(x, vocab))
df.head(5)

Unnamed: 0,review,sentiment,tokens,indices
0,One of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
1,A wonderful little production. <br /><br />The...,positive,"[a, wonderful, little, production, ., <, br, /...","[57, 204, 205, 206, 20, 33, 34, 35, 36, 33, 34..."
2,I thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[128, 289, 26, 42, 57, 204, 290, 68, 291, 292,..."
3,Basically there's a family where a little boy ...,negative,"[basically, there, 's, a, family, where, a, li...","[366, 367, 254, 57, 368, 93, 57, 205, 369, 166..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, 's, ``, love, in, the, time, ...","[417, 418, 254, 227, 419, 51, 4, 292, 3, 420, ..."


In [48]:
# Now we are going to PAD the input statements (or truncate if too small)
MAX_LEN = 100    # According to what was used in the paper

def pad_sequence(seq, max_len):
    if len(seq) < max_len:
        return seq + [vocab['<PAD>']] * (max_len - len(seq))
    else:
        return seq[:max_len]

df['padded_indices'] = df['indices'].apply(lambda x: pad_sequence(x, MAX_LEN))

In [49]:
df.head()

Unnamed: 0,review,sentiment,tokens,indices,padded_indices
0,One of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
1,A wonderful little production. <br /><br />The...,positive,"[a, wonderful, little, production, ., <, br, /...","[57, 204, 205, 206, 20, 33, 34, 35, 36, 33, 34...","[57, 204, 205, 206, 20, 33, 34, 35, 36, 33, 34..."
2,I thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[128, 289, 26, 42, 57, 204, 290, 68, 291, 292,...","[128, 289, 26, 42, 57, 204, 290, 68, 291, 292,..."
3,Basically there's a family where a little boy ...,negative,"[basically, there, 's, a, family, where, a, li...","[366, 367, 254, 57, 368, 93, 57, 205, 369, 166...","[366, 367, 254, 57, 368, 93, 57, 205, 369, 166..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, 's, ``, love, in, the, time, ...","[417, 418, 254, 227, 419, 51, 4, 292, 3, 420, ...","[417, 418, 254, 227, 419, 51, 4, 292, 3, 420, ..."


In [50]:
df_train, df_test = train_test_split(df, test_size = .2, random_state = 42)

In [52]:
class TextDataset(Dataset):
    def __init__(self, dataset):
       self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # loading the sentences into a tensor
        sentence = torch.tensor(self.data.iloc[idx]['padded_indices'], dtype=torch.long)
        # loading the labels into a tensor
        label = torch.tensor(1 if self.dataset['sentiment'] == 1 else 0, dtype=torch.long)
        
        return sentence, label  

In [None]:
train_dataset = TextDataset(train_df)
train_loader = DataLoader(train_dataset,
                          batch_size=32,
                          shuffle=True,
                          num_workers=2)