**Import Statements**

In [None]:
# Attach Google Drive to the Colab runtime so the data files can be located
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Import necessary packages

# Used for data analysis
import pandas as pd

# Used for numerical and scientific computing
import numpy as np

# Used for natural language processing tasks
import nltk

# Used for finding files
import glob

# Large corpus of stopwords
from nltk.corpus import stopwords

# Used to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Used to convert categorical labels to numerical form
from sklearn.preprocessing import LabelEncoder

# Used to compute and print per-class evaluation metrics (precision, recall, F1)
from sklearn.metrics import classification_report

# Used tensorflow and keras to build the CNN model and train it
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer  # Converts text to sequences of integers
from tensorflow.keras.preprocessing.sequence import pad_sequences # Pads sequences to equal length
from tensorflow.keras.models import Sequential # Linear stack of layers for the model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout # Layers for CNN-based text classification

In [None]:
# Read in training data

# Sort the matches: glob returns paths in an OS-dependent order, and the row
# order of the combined frame should be the same on every run and machine.
train_files = sorted(glob.glob('/content/drive/MyDrive/Train_data/*.csv'))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder is missing or empty.
if not train_files:
    raise FileNotFoundError(
        "No training CSV files found in /content/drive/MyDrive/Train_data/ - "
        "check that Google Drive is mounted and the folder exists."
    )

# One DataFrame per CSV file, then stack them into a single frame with a fresh index
data_train_list = [pd.read_csv(file) for file in train_files]
data_train = pd.concat(data_train_list, ignore_index=True)

In [None]:
# Build the list of stop words used by the text-cleaning step

# Fetch the NLTK stop-word corpus
nltk.download('stopwords')

# NLTK's English stop words, followed by extra tokens to drop:
# punctuation marks, the pronoun "I", and contraction fragments
stops = [
    *stopwords.words('english'),
    ",", ".", "!", "?", "'", '"',
    "I", "i",
    "n't", "'ve", "'d", "'s",
]

# Clean up the data by making all words lowercase and removing stopwords
def clean_text(text, stop_words=None):
    """Lowercase ``text`` and drop every whitespace-separated token that is a stop word.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string inputs are accepted.
        stop_words: Optional iterable of tokens to remove. When omitted, the
            notebook-level ``stops`` list is used.

    Returns:
        The remaining lowercase tokens joined by single spaces.

    Note:
        Tokens are compared after lowercasing, so stop-word entries that
        contain uppercase letters never match.
    """
    if stop_words is None:
        stop_words = stops
    # Use a set so each membership test is a hash lookup rather than a
    # scan over the whole stop-word list for every word of the text.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)
    return ' '.join(word for word in str(text).lower().split() if word not in stop_words)

# Run every review through clean_text and keep the result in its own column
data_train['cleaned_text'] = (
    data_train['review/text']
    .astype(str)
    .map(clean_text)
)

# Turn the category names into integer class ids; the fitted encoder is kept
# so the same mapping can be reused later
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(data_train['categories'])
data_train['label_encoded'] = encoded_labels


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Convert the cleaned reviews into fixed-length integer sequences

# Vocabulary is capped at 10000 entries; words outside it map to the "<OOV>" token
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
# Learn the word index from the training text
tokenizer.fit_on_texts(data_train['cleaned_text'])

# Targets: the integer-encoded labels
y_train = data_train['label_encoded'].values

# Inputs: each review as a sequence of word indices, padded at the end
# to a uniform length of 300
train_sequences = tokenizer.texts_to_sequences(data_train['cleaned_text'])
X_train = pad_sequences(train_sequences, maxlen=300, padding='post')

In [None]:
# Build model and train

model = Sequential([
    # Embedding layer to learn a 64-dimensional vector per vocabulary index.
    # `input_length` is intentionally not passed: Keras 3 deprecates the
    # argument, and the sequence length is taken from the data passed to fit().
    Embedding(input_dim=10000, output_dim=64),
    Conv1D(64, 5, activation='relu'),  # 1D convolution to detect local features in the text
    GlobalMaxPooling1D(),  # Keep the strongest response of each filter across the sequence
    Dropout(0.1),  # Dropout layer of 0.1 to avoid overfitting
    Dense(64, activation='relu'),  # Another layer for feature extraction
    # Output layer: one softmax unit per class known to the label encoder
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model; the sparse loss matches the integer-encoded labels in y_train
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model on the training data. The returned History object is kept so the
# per-epoch metrics stay available and the cell does not end by printing its repr.
history = model.fit(X_train, y_train, epochs=5, batch_size=64)

Epoch 1/5




[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 72ms/step - accuracy: 0.1248 - loss: 2.7165
Epoch 2/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 73ms/step - accuracy: 0.5878 - loss: 1.5316
Epoch 3/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 71ms/step - accuracy: 0.8126 - loss: 0.7339
Epoch 4/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 72ms/step - accuracy: 0.9099 - loss: 0.3719
Epoch 5/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 80ms/step - accuracy: 0.9598 - loss: 0.1897


<keras.src.callbacks.history.History at 0x7a951d27df90>

In [None]:
# Read testing data and run it through the same cleaning, tokenizing and
# label-encoding steps that were applied to the training data

# Sort the matches so the row order does not depend on the file system
test_files = sorted(glob.glob('/content/drive/MyDrive/project_test_data/*.csv'))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder is missing or empty.
if not test_files:
    raise FileNotFoundError(
        "No test CSV files found in /content/drive/MyDrive/project_test_data/ - "
        "check that Google Drive is mounted and the folder exists."
    )

# One DataFrame per CSV file, then stack them into a single frame with a fresh index
data_test_list = [pd.read_csv(file) for file in test_files]
data_test = pd.concat(data_test_list, ignore_index=True)

data_test['cleaned_text'] = data_test['review/text'].astype(str).apply(clean_text)

# Reuse the tokenizer fitted on the training text; it is not refitted here
test_sequences = tokenizer.texts_to_sequences(data_test['cleaned_text'])
# maxlen must equal the length used for X_train (300) so test reviews are
# padded and truncated exactly like the reviews the model was trained on
X_test = pad_sequences(test_sequences, maxlen=300, padding='post')
# Reuse the fitted label encoder so class ids match the training labels
y_test = label_encoder.transform(data_test['categories'])

In [None]:
# Evaluate accuracy

loss, accuracy = model.evaluate(X_test, y_test) # Compute loss and accuracy on test data
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Generate predictions: take the most probable class for every review
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Pass the full range of class ids explicitly. Without `labels`, the report
# raises a ValueError whenever the test labels and predictions together do
# not cover every class listed in `target_names`.
all_labels = np.arange(len(label_encoder.classes_))
print(classification_report(y_test, y_pred, labels=all_labels, target_names=label_encoder.classes_))

# Store the predicted class names next to the true categories
data_test['predicted_genre'] = label_encoder.inverse_transform(y_pred)

# Display the first 100 rows of actual vs. predicted genres
data_test[['review/text', 'categories', 'predicted_genre']].head(100)

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5670 - loss: 1.7607
Test Loss: 1.7729
Test Accuracy: 0.5656
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
                           precision    recall  f1-score   support

Biography & Autobiography       0.39      0.24      0.30       100
     Business & Economics       0.63      0.62      0.63       100
                Computers       0.73      0.64      0.68       100
                  Cooking       0.74      0.96      0.83       100
                Education       0.91      1.00      0.95       100
   Family & Relationships       0.58      0.70      0.63       100
                  Fiction       0.25      0.12      0.16       100
                  History       0.66      0.86      0.75       100
         Juvenile Fiction       0.62      0.79      0.69       100
      Juvenile Nonfiction       0.35      0.25      0.29       100
       Literary Criticism       0.42      

Unnamed: 0,review/text,categories,predicted_genre
0,"Contrary to Mr. Long's review, I felt the book...",Biography & Autobiography,Biography & Autobiography
1,The Campbell and Reece Biology book is one of ...,Science,Science
2,I like that I can read this easily and quickly...,Computers,Computers
3,The Boston Globe 7/3/2003 ran a picture of Lau...,Biography & Autobiography,Biography & Autobiography
4,This story has to be one of the most astonishi...,History,History
...,...,...,...
95,For those who are serious about getting fit! T...,Sports & Recreation,Business & Economics
96,How this got printed I dont know. Wish I could...,Computers,Computers
97,I love this book. I just started it yesterday ...,Biography & Autobiography,Juvenile Nonfiction
98,"I rented this book from my local library, rene...",Cooking,Cooking
