In [None]:
!pip install scikit-learn
# !pip install embeddings
!pip install sent2vec

In [None]:
# Generate embeddings of text features
!python -m spacy download en_core_web_md

In [None]:
import gzip
import json

import numpy as np
import pandas as pd
import spacy
import torch
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Step 1: Build the training data

### Subsample full interactions for development

In [None]:
full_df = pd.read_csv('goodreads_interactions.csv')

In [None]:
len(full_df)

In [None]:
full_df.head()

In [None]:
len(full_df['book_id'].unique())

In [None]:
len(full_df['user_id'].unique())

In [None]:
# Draw the development subsample of users reproducibly. `replace=False` is needed:
# the default samples with replacement, so the same user could be drawn more than
# once and fewer distinct users than requested would be kept.
unique_user_ids = full_df['user_id'].unique()
subsample_rng = np.random.default_rng(42)
user_id_subset = subsample_rng.choice(
    unique_user_ids,
    size=min(10000, len(unique_user_ids)),  # cannot draw more users than exist
    replace=False,
)

In [None]:
sub_df = full_df.loc[full_df['user_id'].isin(user_id_subset)]

In [None]:
len(sub_df) / len(full_df)

In [None]:
sub_df.to_csv('goodreads_interactions_subsample.csv')

In [None]:
sub_df = pd.read_csv('goodreads_interactions_subsample.csv')

### Join with other data

#### Read Book DF

In [None]:
def load_data(file_name, head = 500):
    """Read a gzip-compressed JSON-lines file into a list of parsed records.

    Args:
        file_name: Path to a gzip file holding one JSON document per line.
        head: Maximum number of records to read from the start of the file.
            Pass ``None`` to read the whole file.

    Returns:
        A list with one decoded object per line read, in file order.
    """
    data = []
    with gzip.open(file_name) as fin:
        for line_number, line in enumerate(fin):
            # Stop before parsing once `head` records have been collected, so
            # exactly `head` records are returned (not `head + 1`).
            if head is not None and line_number >= head:
                break
            data.append(json.loads(line))
    return data

In [None]:
full_data = load_data('goodreads_books.json.gz', head=None)

In [None]:
def create_book_df(data):
    """Build the per-book feature table from raw book records.

    Records with an empty description are dropped, the columns used later are
    selected, and missing values are imputed.

    Args:
        data: Records accepted by ``pd.DataFrame.from_records``; each must
            provide the columns selected below.

    Returns:
        A DataFrame with the selected columns in which ``format``,
        ``publication_month`` and ``publication_year`` have empty strings
        replaced by the column's most frequent value, and ``num_pages`` has
        empty entries replaced by the column mean.
    """
    selected_columns = [
        'text_reviews_count',
        'country_code',
        'is_ebook',
        'average_rating',
        'description',
        'format',
        'num_pages',
        'publication_month',
        'publication_year',
        'ratings_count',
        'title',
        'book_id'
    ]
    df = pd.DataFrame.from_records(data)
    # Take an explicit copy: the assignments below would otherwise write into a
    # filtered slice of the original frame (SettingWithCopyWarning).
    df = df.loc[~df['description'].eq(''), selected_columns].copy()

    # Empty page counts become NaN so the mean imputer treats them as missing.
    df.loc[df['num_pages'] == '', 'num_pages'] = np.nan
    categorical_imp = SimpleImputer(missing_values='', strategy='most_frequent')
    numerical_imp = SimpleImputer(strategy='mean')
    categorical_columns = ['format', 'publication_month', 'publication_year']
    df[categorical_columns] = categorical_imp.fit_transform(df[categorical_columns])
    numerical_columns = ['num_pages']
    df[numerical_columns] = numerical_imp.fit_transform(df[numerical_columns])

    return df

In [None]:
book_df = create_book_df(full_data)

In [None]:
book_df.to_csv('book_df.csv')

In [None]:
# Reload the table written by the previous cell ('book_df.csv'). That `to_csv`
# call stored the DataFrame index as the first, unnamed column, so it is restored
# as the index here instead of leaking into the data as an 'Unnamed: 0' column.
book_df = pd.read_csv('book_df.csv', index_col=0)

In [None]:
len(book_df)

In [None]:
book_df.eq('').any()

In [None]:
book_df.isna().any()

#### Read Book ID DF

In [None]:
len(sub_df)

In [None]:
# Drop the index column added by the CSV round-trip. `errors='ignore'` keeps this
# cell re-runnable: a second execution no longer raises KeyError once the column
# is already gone.
sub_df = sub_df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
sub_df.head()

In [None]:
book_id_map = pd.read_csv('book_id_map.csv')

In [None]:
len(book_id_map)

In [None]:
book_id_map.head()

In [None]:
max(book_id_map['book_id'])

In [None]:
# Match each interaction's `book_id` against `book_id_csv` in the id map. The
# map's own `book_id` column collides with the left one and is therefore
# suffixed to `book_id_join`; the left column keeps its name.
merge_df = sub_df.merge(book_id_map, left_on='book_id', right_on='book_id_csv', suffixes=[None, '_join'])

In [None]:
merge_df.head()

In [None]:
len(merge_df)

In [None]:
merge_df.isna().any()

In [None]:
merge_df['book_id_join'].value_counts()

In [None]:
# Join the book table onto the interactions via the mapped id (`book_id_join`).
# `book_df`'s `book_id` collides with the left frame's and is suffixed to
# `book_id_x`. No `how` is given, so pandas' default join type applies.
merge_df_2 = merge_df.merge(book_df, left_on='book_id_join', right_on='book_id', suffixes=[None, '_x'])

In [None]:
len(merge_df_2)

In [None]:
merge_df_2.isna().any()

In [None]:
len(merge_df_2['book_id'].unique())

In [None]:
merge_df_2 = merge_df_2.drop(['book_id', 'book_id_csv', 'book_id_x'], axis=1)

In [None]:
merge_df_2 = merge_df_2.rename(columns={'book_id_join': 'book_id'})

In [None]:
merge_df_2 = merge_df_2.reset_index(drop=True)

In [None]:
len(merge_df_2)

In [None]:
merge_df_2.to_csv('train_data.csv', index=False)

# Step 2: Build book feature vectors

In [None]:
train_df = pd.read_csv('train_data.csv')

In [None]:
train_df.head()

In [None]:
# NOTE(review): `GloveEmbedding` is not imported in any cell visible here (the
# install of the `embeddings` package is commented out in the first cell), and
# `emb_model` is not used by any later cell shown — confirm whether this cell
# is still needed.
emb_model = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True)

In [None]:
# NOTE(review): `Vectorizer` is not imported in any cell visible here —
# presumably it comes from the `sent2vec` package installed in the first cell;
# confirm and add the import.
vectorizer = Vectorizer()

In [None]:
vectorizer.run(list(train_df['description']))

In [None]:
vectors = vectorizer.vectors

In [None]:
type(vectors)

In [None]:
len(vectors)

In [None]:
train_df.loc[train_df['user_id'] == 25]

In [None]:
len(train_df['user_id'])

In [None]:
num_users = len(train_df['user_id'].unique())

In [None]:
num_books = len(train_df['book_id'].unique())

In [None]:
# Give every distinct book id a dense 0-based index, in order of first appearance.
book_id_to_embedding_id = {
    book_id: embedding_id
    for embedding_id, book_id in enumerate(train_df['book_id'].unique())
}

In [None]:
book_id_to_embedding_id

In [None]:
train_df.iloc[0]

In [None]:
# Should top k featurize this
train_df['format'].value_counts().head(10)

In [None]:
book_embedding_df = train_df.copy()

In [None]:
book_embedding_df = book_embedding_df.drop_duplicates(subset='book_id').reset_index(drop=True)

In [None]:
book_embedding_df = book_embedding_df.drop(['user_id', 'country_code', 'format'], axis=1)

In [None]:
# Convert the ebook flag to an integer, element by element.
book_embedding_df['is_ebook'] = book_embedding_df['is_ebook'].map(int)

In [None]:
book_embedding_df.columns

In [None]:
book_embedding_df.iloc[0]

In [None]:
# Bucket publication years into decades. With `right=False` every bin is
# [lower, upper), so each edge must be the first year of the *next* bucket for the
# labels to be accurate (1989 -> 'Before 1990', 1990 -> '1990-1999', ...).
decade_bins = [0, 1990, 2000, 2010, 2020, float('inf')]
decade_labels = ['Before 1990', '1990-1999', '2000-2009', '2010-2019', '2020 and onwards']
book_embedding_df['publication_decade'] = pd.cut(
    book_embedding_df['publication_year'],
    bins=decade_bins,
    labels=decade_labels,
    right=False,
)

In [None]:
book_embedding_df = book_embedding_df.drop(['publication_year'], axis=1)

In [None]:
book_embedding_df = pd.get_dummies(book_embedding_df, columns=['publication_month', 'publication_decade'])

In [None]:
# Min-max scale the listed numeric columns in place. The scaler is fitted on the
# de-duplicated book table itself, so the scaling depends on which books are present.
scaler = MinMaxScaler()
columns_to_normalize = ['rating', 'text_reviews_count', 'average_rating', 'num_pages', 'ratings_count']
book_embedding_df[columns_to_normalize] = scaler.fit_transform(book_embedding_df[columns_to_normalize])

In [None]:
book_vectors = {}

In [None]:
# Load the spaCy pipeline downloaded at the top of the notebook. `nlp` is not
# defined in any other cell visible here, so without this line the loop below
# fails with a NameError on a fresh kernel.
nlp = spacy.load('en_core_web_md')

# Build one flat vector per book: the tabular features, followed by the first 64
# dimensions of the description vector and the first 32 of the title vector.
for _, row in book_embedding_df.iterrows():
    book_id = row['book_id']
    description_embed = nlp(row['description']).vector[:64]
    title_embed = nlp(row['title']).vector[:32]
    feature_vec = row.drop(['description', 'title', 'book_id']).values
    book_vec = np.concatenate([feature_vec, description_embed, title_embed], axis=0)
    book_vectors[book_id] = book_vec

In [None]:
book_vec_df = pd.DataFrame(book_vectors)
book_vec_df.to_csv('book_vec_dict.csv', index=False)

In [None]:
new_book_vectors = {book_id_to_embedding_id[k]:v for k, v in book_vectors.items()}

In [None]:
# Stack the per-book vectors into a (num_books, feature_dim) float32 matrix. Rows
# are taken in embedding-id order so that row i always belongs to the book whose
# embedding id is i, independent of dict insertion order.
book_features = torch.from_numpy(
    np.stack(
        [new_book_vectors[embedding_id] for embedding_id in range(num_books)]
    ).astype(np.float32)
)

In [None]:
book_features.shape

# Step 3: Create Embedding Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BookEmbeddingNet(nn.Module):
    """Two-layer feed-forward network mapping a book feature vector to one value per book.

    Args:
        num_books: Size of the output layer.
        book_feature_dim: Length of the input feature vector.
        embedding_dim: Width of the hidden layer.
    """

    def __init__(self, num_books, book_feature_dim, embedding_dim):
        super().__init__()
        # Input features -> hidden representation -> one output per book.
        self.fc1 = nn.Linear(book_feature_dim, embedding_dim)
        self.fc2 = nn.Linear(embedding_dim, num_books)

    def forward(self, book_features):
        """Apply fc1, a ReLU, then fc2, and return the fc2 output."""
        hidden = torch.relu(self.fc1(book_features))
        return self.fc2(hidden)

In [None]:
# Collapse the interaction table to one row per user holding that user's books.
def get_label_id(book_ids):
    """Translate raw book ids into their embedding indices, preserving order."""
    return list(map(book_id_to_embedding_id.__getitem__, book_ids))


grouped_interactions = (
    train_df[['user_id', 'book_id']]
    .groupby('user_id')['book_id']
    .apply(get_label_id)
    .reset_index()
)

In [None]:
user_to_index = {user_id: i for i, user_id in enumerate(train_df['user_id'].unique())}

In [None]:
# NOTE: despite its name this is a dense num_users x num_books float tensor —
# `torch.zeros` allocates every entry, so memory grows with users * books.
sparse_tensor = torch.zeros(num_users, num_books, dtype=torch.float)

In [None]:
# Mark every (user, book) pair present in the grouped interactions with 1.0.
for user_id, book_indices in zip(grouped_interactions['user_id'], grouped_interactions['book_id']):
    sparse_tensor[user_to_index[user_id], book_indices] = 1.0

In [None]:
sparse_tensor.shape

In [None]:
import torch.optim as optim

In [None]:
# Derive the input width from the feature vectors instead of hard-coding it, so
# the model keeps matching the data when the feature set changes.
book_feature_dim = len(next(iter(new_book_vectors.values())))
model = BookEmbeddingNet(num_books, book_feature_dim, 128)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [None]:
num_epochs = 1
# Development cap: only the first `max_users` users are trained on (smoke test).
max_users = 1
for epoch in range(num_epochs):
    for i, row in grouped_interactions.iterrows():
        if i >= max_users:
            break

        user_index = user_to_index[row['user_id']]
        # Target: this user's full row of the interaction matrix.
        labels = sparse_tensor[user_index]

        running_loss = 0.0
        for book_index in row['book_id']:
            # A separate name keeps the full `book_features` matrix intact.
            book_feature_vec = torch.tensor(list(new_book_vectors[book_index]), dtype=torch.float32)
            # Clear gradients from the previous step; backward() accumulates
            # into .grad, so skipping this corrupts every later update.
            optimizer.zero_grad()
            outputs = model(book_feature_vec)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Report the mean loss over this user's books rather than a fixed divisor.
        num_steps = max(len(row['book_id']), 1)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(grouped_interactions)}], Loss: {running_loss / num_steps:.4f}')

print('Finished Training')

In [None]:
torch.save(model, 'model_test.pt')

In [None]:
# Load the module object saved by the previous cell (the whole model was pickled,
# not just a state dict).
# NOTE(review): recent PyTorch releases reportedly default `torch.load` to
# `weights_only=True`, which would reject a fully pickled module — TODO confirm
# against the installed version.
embedding_net = torch.load('model_test.pt')

In [None]:
embedding_net.eval()

In [None]:
embedding_net.__dir__()

In [None]:
# The first linear layer is registered as `fc1` in BookEmbeddingNet.__init__
# (there is no `fc_1` attribute, which raised AttributeError).
embedding_net.fc1

In [None]:
# Project one book's feature vector through the first layer (`fc1`, not `fc_1`).
embedding_net.fc1(torch.tensor(list(book_vectors[9712492])))

In [None]:
hasattr(embedding_net, 'fc1')

In [None]:
torch.tensor(list(book_vectors[9712492])).shape

In [None]:
# Print every (name, parameter) pair of the loaded model. A real loop variable is
# used because binding `_` shadows IPython's "last output" shortcut.
for named_parameter in embedding_net.named_parameters():
    print(named_parameter)