## Word2Vec model creation
This script scrapes anime reviews from the MyAnimeList reviews pages, preprocesses the text data (tokenization, lowercasing, removal of non-alphabetic tokens and stop words), and trains a Word2Vec model on the cleaned text.

In [None]:
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import requests
from bs4 import BeautifulSoup

# Fetch the NLTK resources used below: the 'punkt' tokenizer data for
# word_tokenize and the 'stopwords' corpus for stop-word filtering
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Seconds to wait for MyAnimeList to answer before giving up on a request;
# without a timeout a stalled connection would block the scrape forever
_REQUEST_TIMEOUT_SECONDS = 30


def _parse_review_page(html):
    """Return the "<title>. <review text>" strings found in one reviews page.

    Titles and review bodies are paired by position. If the page contains a
    different number of each, the unmatched extras are dropped by ``zip``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Extract review descriptions
    descriptions = [element.text.strip() for element in soup.find_all('div', {'class': 'text'})]
    # Extract anime titles
    anime_titles = [element.text.strip() for element in soup.find_all('a', {'class': 'title ga-click'})]
    # Combine titles and descriptions
    return [f"{title}. {description}" for title, description in zip(anime_titles, descriptions)]


# Function returns: data (list of str) - a list of concatenated anime titles and descriptions from the reviews
def load_data_from_myanimelist(base_url, max_pages):
    """Scrape review pages 1..max_pages and return their reviews as strings.

    Args:
        base_url: URL of the reviews listing. It is requested as-is for the
            first page, and with ``&p=<n>`` appended for pages 2 and up.
        max_pages: Number of the last page to request (inclusive).

    Returns:
        A list of "<anime title>. <review text>" strings. The list is empty
        when the initial request does not return HTTP 200.

    Raises:
        requests.RequestException: If the initial request itself fails
            (connection error, timeout, ...). Failures on later pages are
            reported and skipped so that already collected data is kept.
    """
    response = requests.get(base_url, timeout=_REQUEST_TIMEOUT_SECONDS)
    # Check if the request was successful
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    print("Getting data from myAnimeList...")
    # The response for the bare base URL is the first page of results, so its
    # reviews are collected too instead of being discarded after the check
    data = _parse_review_page(response.content)

    for i in range(2, max_pages + 1):
        url = f"{base_url}&p={i}"
        try:
            page = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException as error:
            # One bad page must not throw away everything scraped so far
            print(f"Skipping page {i}: {error}")
            continue
        if page.status_code != 200:
            print(f"Skipping page {i}: HTTP {page.status_code}")
            continue
        data += _parse_review_page(page.content)
    return data


# Listing of anime reviews on MyAnimeList; page numbers are appended by the loader
myanimelist_url = 'https://myanimelist.net/reviews.php?t=anime&filter_check=&filter_hide=&preliminary=on&spoiler=on'

# Scrape up to page 3200 of the listing
scraped_reviews = load_data_from_myanimelist(myanimelist_url, max_pages=3200)

# Defensive fallback: always continue with a list, even if the loader gave back None
reviews_data = [] if scraped_reviews is None else scraped_reviews

# English stop words, loaded on first use and then reused by every call to
# clean_text (rebuilding the set for each review repeats identical work)
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


# Function returns: tokens (list of str) - the cleaned and tokenized words from the input text
def clean_text(text):
    """Tokenize *text* and keep only lowercase, alphabetic, non-stop-word tokens.

    Args:
        text: The raw text to clean. Anything that is not a ``str`` is
            treated as "no text", matching the later definition of this
            function in this file.

    Returns:
        A list of cleaned tokens; an empty list for non-string input.
    """
    # Non-string input yields no tokens instead of raising inside the tokenizer
    if not isinstance(text, str):
        return []

    stop_words = _english_stop_words()
    # Convert to lowercase and remove non-alphabetic tokens
    lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
    # Remove stopwords (compared after lowercasing, as the stop-word list is lowercase)
    return [token for token in lowered if token not in stop_words]

# Turn every scraped review into its list of cleaned tokens
tokenized_reviews = list(map(clean_text, reviews_data))

# Fit a Word2Vec model on the tokenized reviews
model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=100,  # dimensionality of the word vectors
    window=5,         # maximum distance between a word and its context words
    min_count=1,      # keep every word, even those seen only once
    workers=4,        # number of training threads
)
print("Model built successfully.")

# Persist the trained model for the follow-up scripts
model.save('./models/anime_word2vec_model')

## Updating Word2Vec model with anime titles
This script updates the previously trained Word2Vec model with anime titles from a MyAnimeList CSV file. This ensures that the words of every anime title in the list are part of the model's vocabulary, which prevents errors during vectorization in other scripts and enriches the vocabulary.

In [3]:
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Fetch the tokenizer data and the stop-word corpus needed by clean_text
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# English stop words, loaded on first use and then reused by every call to
# clean_text (rebuilding the set for each title repeats identical work)
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


# Function to clean and tokenize text
def clean_text(text):
    """Tokenize *text* and keep only lowercase, alphabetic, non-stop-word tokens.

    Args:
        text: The raw text to clean. Values that are not a ``str`` (for
            example missing cells read from a CSV file) produce no tokens.

    Returns:
        A list of cleaned tokens; an empty list for non-string input.
    """
    # Return an empty list if the input is not a string
    if not isinstance(text, str):
        return []

    stop_words = _english_stop_words()
    # Lowercase the alphabetic tokens, then drop the stop words
    lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
    return [token for token in lowered if token not in stop_words]

# Restore the Word2Vec model saved by the training step
model = Word2Vec.load('./models/anime_word2vec_model')

# Read the anime list; the file has no header row, so column names are supplied here
df = pd.read_csv(
    './data/anime_list.csv',
    header=None,
    names=['user_score', 'title', 'genres', 'popularity'],
)

# Tokenize each title and drop the ones that end up with no tokens
titles = [tokens for tokens in df['title'].apply(clean_text).tolist() if tokens]

# Add the title words to the existing vocabulary, then continue training on the titles
model.build_vocab(titles, update=True)
model.train(titles, total_examples=model.corpus_count, epochs=model.epochs)

# Store the result under a new name so the original model file stays untouched
model.save('./models/anime_word2vec_model_updated')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dobre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dobre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
