In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score


In [2]:
# Load the data
df = pd.read_csv("website_classification.csv", index_col='webpage_id')
df.head()

Unnamed: 0_level_0,website_url,cleaned_website_text,Category
webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


In [3]:
# Preprocess text data

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Encode labels
label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['Category'])

# Apply preprocessing to the 'content' column
df['cleaned_content'] = df['cleaned_website_text'].apply(preprocess_text)

# Display the first few rows of the preprocessed DataFrame
print(df.head())

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/harshasaijagu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/harshasaijagu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                  website_url  \
webpage_id                                                      
0              https://www.booking.com/index.html?aid=1743217   
1                            https://travelsites.com/expedia/   
2                        https://travelsites.com/tripadvisor/   
3                       https://www.momondo.in/?ispredir=true   
4           https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...   

                                         cleaned_website_text  Category  \
webpage_id                                                                
0           official site good hotel accommodation big sav...        15   
1           expedia hotel book sites like use vacation wor...        15   
2           tripadvisor hotel book sites like previously d...        15   
3           cheap flights search compare flights momondo f...        15   
4           bot create free account create free account si...        15   

                  

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_content'], df['Category'], test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (1126,) (1126,)
Testing set shape: (282,) (282,)


### TF - IDF Vectorizer

In [5]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=500)  # You can adjust max_features based on your dataset size

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Display the shape of the TF-IDF vectors
print("Shape of TF-IDF vectors (training):", X_train_tfidf.shape)
print("Shape of TF-IDF vectors (testing):", X_test_tfidf.shape)

Shape of TF-IDF vectors (training): (1126, 500)
Shape of TF-IDF vectors (testing): (282, 500)


### Word Embedding

In [6]:
def average_word_embeddings(tokens, model, vector_size):
    valid_tokens = [token for token in tokens if token in model.wv.key_to_index]
    if valid_tokens:
        avg_embedding = np.mean([model.wv[token] for token in valid_tokens], axis=0)
    else:
        avg_embedding = np.zeros(vector_size)
    return avg_embedding

In [7]:
# Tokenize the text data for training and testing sets
tokenized_text_train = [word_tokenize(text) for text in X_train]
tokenized_text_test = [word_tokenize(text) for text in X_test]

# Train Word2Vec model on the training set
word2vec_model = Word2Vec(sentences=tokenized_text_train, vector_size=100, window=5, min_count=1, workers=4)

# Generate word embeddings for each document in the training set
word_embeddings_train = np.vstack([average_word_embeddings(tokens, word2vec_model, 100) for tokens in tokenized_text_train])

# Generate word embeddings for each document in the testing set
word_embeddings_test = np.vstack([average_word_embeddings(tokens, word2vec_model, 100) for tokens in tokenized_text_test])

# Display the shape of the word embeddings matrices for training and testing sets
print("Shape of word embeddings matrix (training):", word_embeddings_train.shape)
print("Shape of word embeddings matrix (testing):", word_embeddings_test.shape)

Shape of word embeddings matrix (training): (1126, 100)
Shape of word embeddings matrix (testing): (282, 100)


In [8]:
# Concatenate word embeddings and TF-IDF vectors for training and testing sets
X_train_features = np.concatenate([word_embeddings_train, X_train_tfidf.toarray()], axis=1)
X_test_features = np.concatenate([word_embeddings_test, X_test_tfidf.toarray()], axis=1)

In [9]:
# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the classifier on the combined features
svm_classifier.fit(X_train_features, y_train)

# Predict the categories for the testing data
y_pred = svm_classifier.predict(X_test_features)

# Display the classification report and accuracy score
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.54      0.76      0.63        17
           2       0.65      0.68      0.67        19
           3       0.88      1.00      0.94        22
           4       0.85      0.81      0.83        27
           5       0.94      0.88      0.91        17
           6       0.00      0.00      0.00         2
           7       0.95      0.82      0.88        22
           8       0.92      0.92      0.92        13
           9       0.95      0.91      0.93        23
          10       0.84      0.89      0.86        18
          11       0.94      0.85      0.89        20
          12       0.87      0.72      0.79        18
          13       0.92      0.96      0.94        23
          14       1.00      1.00      1.00        18
          15       1.00      0.95      0.97        19

    accuracy                           0.86       282
   

In [10]:
xgboost = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train the classifier on the combined features
xgboost.fit(X_train_features, y_train)

# Predict the categories for the testing data
y_pred = xgboost.predict(X_test_features)

# Display the classification report and accuracy score
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.65      0.76      0.70        17
           2       0.73      0.84      0.78        19
           3       0.95      0.86      0.90        22
           4       0.85      0.85      0.85        27
           5       0.89      0.94      0.91        17
           6       0.00      0.00      0.00         2
           7       0.95      0.86      0.90        22
           8       0.92      0.85      0.88        13
           9       0.80      0.87      0.83        23
          10       0.75      0.83      0.79        18
          11       0.94      0.85      0.89        20
          12       0.94      0.89      0.91        18
          13       0.95      0.87      0.91        23
          14       0.90      1.00      0.95        18
          15       0.95      0.95      0.95        19

    accuracy                           0.86       282
   

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Convert the labels to one-hot encoding for neural network
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

# Define the neural network model
def create_neural_network(input_dim, num_classes):
    model = Sequential()
    model.add(Dense(512, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Create the model
input_dim = X_train_features.shape[1]
num_classes = len(label_encoder.classes_)
neural_network = create_neural_network(input_dim, num_classes)

# Train the neural network model
neural_network.fit(X_train_features, y_train_categorical, epochs=20, batch_size=32, validation_split=0.2, verbose=2)

# Predict the categories for the testing data
y_pred_categorical = neural_network.predict(X_test_features)
y_pred = np.argmax(y_pred_categorical, axis=1)

# Display the classification report and accuracy score
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


29/29 - 0s - 15ms/step - accuracy: 0.1133 - loss: 2.7275 - val_accuracy: 0.1991 - val_loss: 2.6186
Epoch 2/20
29/29 - 0s - 2ms/step - accuracy: 0.1889 - loss: 2.5051 - val_accuracy: 0.5088 - val_loss: 2.1979
Epoch 3/20
29/29 - 0s - 2ms/step - accuracy: 0.3311 - loss: 2.0884 - val_accuracy: 0.5885 - val_loss: 1.6570
Epoch 4/20
29/29 - 0s - 2ms/step - accuracy: 0.4922 - loss: 1.6572 - val_accuracy: 0.7080 - val_loss: 1.2653
Epoch 5/20
29/29 - 0s - 2ms/step - accuracy: 0.5878 - loss: 1.3004 - val_accuracy: 0.8186 - val_loss: 0.9196
Epoch 6/20
29/29 - 0s - 2ms/step - accuracy: 0.6444 - loss: 1.1540 - val_accuracy: 0.8186 - val_loss: 0.8206
Epoch 7/20
29/29 - 0s - 2ms/step - accuracy: 0.7167 - loss: 0.9316 - val_accuracy: 0.8230 - val_loss: 0.7229
Epoch 8/20
29/29 - 0s - 2ms/step - accuracy: 0.7500 - loss: 0.8776 - val_accuracy: 0.8451 - val_loss: 0.6690
Epoch 9/20
29/29 - 0s - 2ms/step - accuracy: 0.7900 - loss: 0.7029 - val_accuracy: 0.8496 - val_loss: 0.6273
Epoch 10/20
29/29 - 0s - 2ms/

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Topic Modelling:

In [12]:
from sklearn.decomposition import LatentDirichletAllocation

# Define the number of topics
num_topics = 16

# Perform LDA on the TF-IDF vectors of the training set
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_train = lda_model.fit_transform(X_train_tfidf)
lda_test = lda_model.transform(X_test_tfidf)

# Display the shape of the LDA topic distributions
print("Shape of LDA topic distributions (training):", lda_train.shape)
print("Shape of LDA topic distributions (testing):", lda_test.shape)

# Combine word embeddings, TF-IDF vectors, and LDA topic distributions for training and testing sets
X_train_features = np.concatenate([word_embeddings_train, X_train_tfidf.toarray(), lda_train], axis=1)
X_test_features = np.concatenate([word_embeddings_test, X_test_tfidf.toarray(), lda_test], axis=1)

# Display the shape of the final feature matrices
print("Shape of combined features (training):", X_train_features.shape)
print("Shape of combined features (testing):", X_test_features.shape)

Shape of LDA topic distributions (training): (1126, 16)
Shape of LDA topic distributions (testing): (282, 16)
Shape of combined features (training): (1126, 616)
Shape of combined features (testing): (282, 616)


In [13]:
# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear', random_state=42, probability=True)

# Train the classifier on the combined features
svm_classifier.fit(X_train_features, y_train)

# Predict the categories for the testing data
y_pred = svm_classifier.predict(X_test_features)

# Display the classification report and accuracy score
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.57      0.76      0.65        17
           2       0.73      0.84      0.78        19
           3       0.88      1.00      0.94        22
           4       0.88      0.81      0.85        27
           5       0.94      0.88      0.91        17
           6       0.00      0.00      0.00         2
           7       0.90      0.82      0.86        22
           8       0.92      0.92      0.92        13
           9       0.95      0.91      0.93        23
          10       0.89      0.89      0.89        18
          11       0.95      0.90      0.92        20
          12       0.87      0.72      0.79        18
          13       0.92      0.96      0.94        23
          14       1.00      1.00      1.00        18
          15       1.00      0.95      0.97        19

    accuracy                           0.88       282
   

In [14]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train_features, y_train)
y_pred_rf = random_forest.predict(X_test_features)

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest Accuracy Score:", accuracy_score(y_test, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.67      0.82      0.74        17
           2       0.65      0.79      0.71        19
           3       0.91      0.95      0.93        22
           4       0.80      0.74      0.77        27
           5       0.88      0.88      0.88        17
           6       0.00      0.00      0.00         2
           7       0.95      0.86      0.90        22
           8       0.91      0.77      0.83        13
           9       0.88      0.91      0.89        23
          10       0.81      0.94      0.87        18
          11       1.00      0.85      0.92        20
          12       0.92      0.67      0.77        18
          13       0.91      0.91      0.91        23
          14       0.86      1.00      0.92        18
          15       0.90      0.95      0.92        19

    accuracy                           0.85

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
xgboost = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train the classifier on the combined features
xgboost.fit(X_train_features, y_train)

# Predict the categories for the testing data
y_pred = xgboost.predict(X_test_features)

# Display the classification report and accuracy score
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.71      0.71      0.71        17
           2       0.76      0.84      0.80        19
           3       0.95      0.82      0.88        22
           4       0.83      0.89      0.86        27
           5       0.89      0.94      0.91        17
           6       0.00      0.00      0.00         2
           7       0.95      0.86      0.90        22
           8       0.85      0.85      0.85        13
           9       0.91      0.87      0.89        23
          10       0.67      0.89      0.76        18
          11       1.00      0.85      0.92        20
          12       0.88      0.83      0.86        18
          13       0.91      0.87      0.89        23
          14       0.89      0.94      0.92        18
          15       0.86      0.95      0.90        19

    accuracy                           0.85       282
   

In [16]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Convert the labels to one-hot encoding for neural network
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

# Define the neural network model
def create_neural_network(input_dim, num_classes):
    model = Sequential()
    model.add(Dense(512, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Create the model
input_dim = X_train_features.shape[1]
num_classes = len(label_encoder.classes_)
neural_network = create_neural_network(input_dim, num_classes)

# Train the neural network model
neural_network.fit(X_train_features, y_train_categorical, epochs=20, batch_size=32, validation_split=0.2, verbose=2)

# Predict the categories for the testing data
y_pred_categorical = neural_network.predict(X_test_features)
y_pred = np.argmax(y_pred_categorical, axis=1)

# Display the classification report and accuracy score
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


29/29 - 0s - 16ms/step - accuracy: 0.0967 - loss: 2.7254 - val_accuracy: 0.2566 - val_loss: 2.5412
Epoch 2/20
29/29 - 0s - 2ms/step - accuracy: 0.2289 - loss: 2.4045 - val_accuracy: 0.5575 - val_loss: 2.0786
Epoch 3/20
29/29 - 0s - 2ms/step - accuracy: 0.3311 - loss: 2.0259 - val_accuracy: 0.6770 - val_loss: 1.6078
Epoch 4/20
29/29 - 0s - 2ms/step - accuracy: 0.4722 - loss: 1.6382 - val_accuracy: 0.8097 - val_loss: 1.1556
Epoch 5/20
29/29 - 0s - 2ms/step - accuracy: 0.5944 - loss: 1.3072 - val_accuracy: 0.8142 - val_loss: 0.9173
Epoch 6/20
29/29 - 0s - 2ms/step - accuracy: 0.6722 - loss: 1.0678 - val_accuracy: 0.8274 - val_loss: 0.7807
Epoch 7/20
29/29 - 0s - 2ms/step - accuracy: 0.7311 - loss: 0.9448 - val_accuracy: 0.8363 - val_loss: 0.7329
Epoch 8/20
29/29 - 0s - 3ms/step - accuracy: 0.7578 - loss: 0.8183 - val_accuracy: 0.8451 - val_loss: 0.6533
Epoch 9/20
29/29 - 0s - 2ms/step - accuracy: 0.7933 - loss: 0.6586 - val_accuracy: 0.8363 - val_loss: 0.6141
Epoch 10/20
29/29 - 0s - 2ms/

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[
    ('svm', svm_classifier),
    ('rf', random_forest),
    ('xgb', xgboost)
], voting='soft')

voting_clf.fit(X_train_features, y_train)
y_pred_voting = voting_clf.predict(X_test_features)

print("Voting Classifier Classification Report:")
print(classification_report(y_test, y_pred_voting))
print("Voting Classifier Accuracy Score:", accuracy_score(y_test, y_pred_voting))

Voting Classifier Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.64      0.82      0.72        17
           2       0.74      0.89      0.81        19
           3       0.91      0.95      0.93        22
           4       0.92      0.85      0.88        27
           5       0.89      0.94      0.91        17
           6       0.00      0.00      0.00         2
           7       0.95      0.82      0.88        22
           8       0.92      0.92      0.92        13
           9       0.95      0.87      0.91        23
          10       0.73      0.89      0.80        18
          11       1.00      0.85      0.92        20
          12       0.93      0.78      0.85        18
          13       0.91      0.91      0.91        23
          14       1.00      1.00      1.00        18
          15       0.95      0.95      0.95        19

    accuracy                           