In [None]:
!pip install scikit-learn
# !pip install embeddings
!pip install sent2vec

In [None]:
# Generate embeddings of text features
!python -m spacy download en_core_web_md

In [1]:
import gzip
import json

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Step 1: Prepare Training Data

### Subsample full interactions for development

In [None]:
full_df = pd.read_csv('goodreads_interactions.csv')

In [None]:
len(full_df)

In [None]:
full_df.head()

In [None]:
len(full_df['book_id'].unique())

In [None]:
len(full_df['user_id'].unique())

In [None]:
# Draw 10,000 *distinct* users. np.random.choice samples with replacement by
# default, which silently yields fewer unique users than requested; a seeded
# generator also keeps the subsample reproducible across kernel restarts.
unique_user_ids = full_df['user_id'].unique()
rng = np.random.default_rng(seed=42)
user_id_subset = rng.choice(
    unique_user_ids,
    size=min(10000, len(unique_user_ids)),  # never ask for more users than exist
    replace=False,
)

In [None]:
sub_df = full_df.loc[full_df['user_id'].isin(user_id_subset)]

In [None]:
len(sub_df) / len(full_df)

In [None]:
sub_df.to_csv('goodreads_interactions_subsample.csv')

In [None]:
sub_df = pd.read_csv('goodreads_interactions_subsample.csv')

### Join with other data

#### Read Book DF

In [None]:
def load_data(file_name, head = 500):
    """Load records from a gzip-compressed JSON-lines file.

    Args:
        file_name: Path to a gzip file holding one JSON document per line.
        head: Maximum number of records to read from the start of the file.
            Pass ``None`` to read the whole file.

    Returns:
        A list with one decoded JSON object per line read (at most ``head``).
    """
    data = []
    with gzip.open(file_name) as fin:
        for line in fin:
            # Check the limit *before* parsing so exactly `head` records are
            # returned (the previous `count > head` test let one extra through)
            # and nothing is read when head == 0.
            if head is not None and len(data) >= head:
                break
            data.append(json.loads(line))
    return data

In [None]:
full_data = load_data('goodreads_books.json.gz', head=None)

In [None]:
def create_book_df(data):
    """Build a cleaned book-metadata DataFrame from raw book records.

    Rows with an empty description are dropped, a fixed set of columns is
    kept, and values that are missing (encoded as '' in the raw records) are
    imputed: the most frequent value for format / publication month /
    publication year, and the column mean for num_pages.

    Args:
        data: Records accepted by ``pd.DataFrame.from_records``; every record
            must provide the columns listed in ``keep_columns``.

    Returns:
        A new DataFrame containing the selected columns with imputed values.
    """
    keep_columns = [
        'text_reviews_count',
        'country_code',
        'is_ebook',
        'average_rating',
        'description',
        'format',
        'num_pages',
        'publication_month',
        'publication_year',
        'ratings_count',
        'title',
        'book_id'
    ]

    df = pd.DataFrame.from_records(data)
    # .copy() detaches the filtered frame from the raw one so the in-place
    # assignments below operate on an independent object and do not trigger
    # SettingWithCopyWarning.
    df = df.loc[~df['description'].eq(''), keep_columns].copy()

    # Empty page counts become NaN so the mean imputer treats them as missing.
    df.loc[df['num_pages'] == '', 'num_pages'] = np.nan

    categorical_imp = SimpleImputer(missing_values='', strategy='most_frequent')
    numerical_imp = SimpleImputer(strategy='mean')

    categorical_columns = ['format', 'publication_month', 'publication_year']
    df[categorical_columns] = categorical_imp.fit_transform(df[categorical_columns])

    numerical_columns = ['num_pages']
    df[numerical_columns] = numerical_imp.fit_transform(df[numerical_columns])

    return df

In [None]:
book_df = create_book_df(full_data)

In [None]:
book_df.to_csv('book_df.csv')

In [4]:
# NOTE(review): this reads '../data/full_book_df.csv', while the cell that saves
# book_df writes 'book_df.csv' in the working directory — confirm which file is
# the intended source.
book_df = pd.read_csv('../data/full_book_df.csv')

In [5]:
len(book_df)

1948401

In [6]:
book_df.head()

Unnamed: 0,text_reviews_count,country_code,is_ebook,average_rating,description,format,num_pages,publication_month,publication_year,ratings_count,title,book_id
0,6,US,False,3.23,"Anita Diamant's international bestseller ""The ...",Audio CD,266.579581,10,2001,10,Good Harbor,1333909
1,7,US,False,4.03,Omnibus book club edition containing the Ladie...,Hardcover,600.0,10,1987,140,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",7327624
2,3282,US,False,3.49,Addie Downs and Valerie Adler were eight when ...,Hardcover,368.0,7,2009,51184,Best Friends Forever,6066819
3,7,US,False,4.13,"Relates in vigorous prose the tale of Aeneas, ...",Paperback,162.0,9,2006,46,The Aeneid for Boys and Girls,287141
4,6,US,False,4.22,"To Kara's astonishment, she discovers that a p...",Paperback,216.0,3,2009,98,All's Fairy in Love and War (Avalon: Web of Ma...,6066812


In [7]:
len(book_df['book_id'].unique())

1948401

In [None]:
book_df.eq('').any()

In [None]:
book_df.isna().any()

#### Read Book ID DF

In [None]:
len(sub_df)

In [None]:
# Drop the index column that the CSV round-trip added. errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# column has already been removed.
sub_df = sub_df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
sub_df.head()

In [9]:
book_id_map = pd.read_csv('../data/book_id_map.csv')

In [10]:
len(book_id_map)

2360650

In [13]:
book_id_map.head()

Unnamed: 0,book_id_csv,book_id
0,0,34684622
1,1,34536488
2,2,34017076
3,3,71730
4,4,30422361


In [14]:
max(book_id_map['book_id'])

36530431

In [None]:
# Match each interaction's `book_id` against `book_id_csv` in the id map.
# The map's own `book_id` column collides with the left one and therefore
# comes through as `book_id_join`.
merge_df = pd.merge(
    sub_df,
    book_id_map,
    how='inner',
    left_on='book_id',
    right_on='book_id_csv',
    suffixes=(None, '_join'),
)

In [None]:
merge_df.head()

In [None]:
len(merge_df)

In [None]:
merge_df.isna().any()

In [None]:
merge_df['book_id_join'].value_counts()

In [None]:
# Join book metadata onto the interactions via the mapped id (`book_id_join`).
# No `how` is given, so this is pandas' default inner join: interactions whose
# book is absent from book_df are dropped. The colliding `book_id` column from
# book_df comes through as `book_id_x`.
merge_df_2 = merge_df.merge(book_df, left_on='book_id_join', right_on='book_id', suffixes=[None, '_x'])

In [None]:
len(merge_df_2)

In [None]:
merge_df_2.isna().any()

In [None]:
len(merge_df_2['book_id'].unique())

In [None]:
merge_df_2 = merge_df_2.drop(['book_id', 'book_id_csv', 'book_id_x'], axis=1)

In [None]:
merge_df_2 = merge_df_2.rename(columns={'book_id_join': 'book_id'})

In [None]:
merge_df_2 = merge_df_2.reset_index(drop=True)

In [None]:
len(merge_df_2)

In [None]:
merge_df_2.to_csv('train_data.csv', index=False)

# Step 2: Build Book Feature Vectors

In [15]:
# NOTE(review): this reads '../data/train_data.csv', while the cell that saves
# merge_df_2 writes 'train_data.csv' in the working directory — confirm the
# file was moved there and is the same data.
train_df = pd.read_csv('../data/train_data.csv')

In [16]:
train_df.head()

Unnamed: 0,user_id,is_read,rating,is_reviewed,book_id,text_reviews_count,country_code,is_ebook,average_rating,description,format,num_pages,publication_month,publication_year,ratings_count,title
0,25,0,0,0,9712492,1597,US,False,4.0,"Struggling with her father's death, Nina meets...",Paperback,360.0,11,2010,23694,"Providence (Providence, #1)"
1,11959,1,4,0,9712492,1597,US,False,4.0,"Struggling with her father's death, Nina meets...",Paperback,360.0,11,2010,23694,"Providence (Providence, #1)"
2,12681,0,0,0,9712492,1597,US,False,4.0,"Struggling with her father's death, Nina meets...",Paperback,360.0,11,2010,23694,"Providence (Providence, #1)"
3,13228,0,0,0,9712492,1597,US,False,4.0,"Struggling with her father's death, Nina meets...",Paperback,360.0,11,2010,23694,"Providence (Providence, #1)"
4,13502,0,0,0,9712492,1597,US,False,4.0,"Struggling with her father's death, Nina meets...",Paperback,360.0,11,2010,23694,"Providence (Providence, #1)"


In [None]:
# NOTE(review): GloveEmbedding is not imported anywhere in the visible notebook
# (the `embeddings` pip install is commented out) and emb_model is not used by
# any later visible cell — confirm whether this cell is still needed.
emb_model = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True)

In [None]:
# NOTE(review): Vectorizer is not imported in the visible notebook; presumably
# it comes from the sent2vec package installed in the first cell — confirm and
# add the import to the import cell.
vectorizer = Vectorizer()

In [None]:
vectorizer.run(list(train_df['description']))

In [None]:
vectors = vectorizer.vectors

In [None]:
type(vectors)

In [None]:
len(vectors)

In [17]:
train_df.loc[train_df['user_id'] == 25]

Unnamed: 0,user_id,is_read,rating,is_reviewed,book_id,text_reviews_count,country_code,is_ebook,average_rating,description,format,num_pages,publication_month,publication_year,ratings_count,title
0,25,0,0,0,9712492,1597,US,False,4.00,"Struggling with her father's death, Nina meets...",Paperback,360.000000,11,2010,23694,"Providence (Providence, #1)"
95,25,0,0,0,11505797,21970,US,True,4.14,The new Abby Abernathy is a good girl. She doe...,Kindle Edition,319.000000,5,2011,425898,"Beautiful Disaster (Beautiful, #1)"
736,25,0,0,0,11532160,675,US,False,4.15,Dreaming of the dead might mean a restless nig...,Paperback,266.579581,10,2013,16987,"Requiem (Providence, #2)"
790,25,0,0,0,13417946,772,US,False,4.17,She had seen the unspeakable. She would learn ...,Paperback,276.000000,4,2012,15482,"Eden (Providence, #3)"
835,25,1,5,0,10140661,6213,US,False,4.02,"Enigmatic and sexy, Professor Gabriel Emerson ...",Paperback,506.000000,4,2011,120765,"Gabriel's Inferno (Gabriel's Inferno, #1)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14937,25,0,0,0,10230881,750,US,False,3.75,"Three years after her husband Max's death, She...",Hardcover,297.000000,8,2011,4112,Before Ever After
14961,25,1,5,0,17156082,794,US,True,4.33,Sometimes discovering the truth can leave you ...,Kindle Edition,488.000000,12,2012,10891,"Hopeless (Hopeless, #1)"
14981,25,0,0,0,11561469,4678,US,True,4.23,"""No one tried to get involved with me, and I k...",Kindle Edition,377.000000,5,2011,69366,"Reason to Breathe (Breathing, #1)"
15139,25,1,5,1,9266776,2067,US,False,3.73,"At Coral Tree Prep in Los Angeles, who your pa...",Paperback,295.000000,8,2011,29827,Epic Fail


In [18]:
len(train_df['user_id'])

2587802

In [19]:
# Number of distinct users (NaN, if present, counts as a value, as with unique()).
num_users = train_df['user_id'].nunique(dropna=False)

In [20]:
# Number of distinct books (NaN, if present, counts as a value, as with unique()).
num_books = train_df['book_id'].nunique(dropna=False)

In [31]:
# Assign each distinct book_id a contiguous index 0..N-1, in order of first
# appearance in train_df.
book_id_to_embedding_id = {
    book_id: embedding_id
    for embedding_id, book_id in enumerate(train_df['book_id'].unique())
}

In [None]:
book_id_to_embedding_id

In [None]:
train_df.iloc[0]

In [None]:
# Should top k featurize this
train_df['format'].value_counts().head(10)

In [22]:
interaction_df = train_df.copy()

In [23]:
interaction_df = interaction_df.drop_duplicates(subset='book_id').reset_index(drop=True)

In [24]:
# Book-level feature table: start from the one-row-per-book frame built in the
# previous cells (interaction_df) and drop the columns that are not used as
# book features. The cell previously read book_embedding_df before it was ever
# assigned, which raises NameError on a fresh kernel.
book_embedding_df = interaction_df.drop(['user_id', 'country_code', 'format'], axis=1)

In [25]:
# Encode the is_ebook flag as an integer so it can be used as a numeric feature.
book_embedding_df['is_ebook'] = book_embedding_df['is_ebook'].map(int)

In [None]:
book_embedding_df.columns

In [None]:
book_embedding_df.iloc[0]

In [26]:
# Bucket publication_year into decades. With right=False every bin is
# [left, right), so the edges must be the first year of each decade: the old
# edges (1989, 1999, ...) put 1989 into '1990-1999', 1999 into '2000-2009', etc.
decade_bins = [0, 1990, 2000, 2010, 2020, float('inf')]
decade_labels = ['Before 1990', '1990-1999', '2000-2009', '2010-2019', '2020 and onwards']
book_embedding_df['publication_decade'] = pd.cut(
    book_embedding_df['publication_year'],
    bins=decade_bins,
    labels=decade_labels,
    right=False,
)

In [27]:
book_embedding_df = book_embedding_df.drop(['publication_year'], axis=1)

In [28]:
book_embedding_df = pd.get_dummies(book_embedding_df, columns=['publication_month', 'publication_decade'])

In [29]:
# Rescale the listed numeric columns with MinMaxScaler (fit and applied on the
# same frame), overwriting them in place.
# NOTE(review): 'rating' appears in train_df as a per-user interaction value
# rather than book metadata — confirm it is meant to be a book feature here.
scaler = MinMaxScaler()
columns_to_normalize = ['rating', 'text_reviews_count', 'average_rating', 'num_pages', 'ratings_count']
book_embedding_df[columns_to_normalize] = scaler.fit_transform(book_embedding_df[columns_to_normalize])

In [34]:
book_vectors = {}

In [35]:
nlp = spacy.load("en_core_web_md")

In [None]:
# Build one feature vector per book: the numeric / one-hot columns, followed by
# the first 64 components of the spaCy description vector and the first 32
# components of the spaCy title vector.
for _, row in book_embedding_df.iterrows():
    book_id = row['book_id']
    description_embed = nlp(row['description']).vector[:64]
    title_embed = nlp(row['title']).vector[:32]
    # iterrows() yields object-dtype rows; cast to float32 so the concatenated
    # vector is a homogeneous numeric array that converts cleanly to a tensor.
    feature_vec = row.drop(['description', 'title', 'book_id']).to_numpy(dtype=np.float32)
    book_vec = np.concatenate([feature_vec, description_embed, title_embed], axis=0)
    # Key by the raw book_id. Later cells translate these keys through
    # book_id_to_embedding_id and also look vectors up by raw id
    # (book_vectors[9712492]); storing the embedding index here made that
    # second translation fail.
    book_vectors[book_id] = book_vec

In [None]:
book_vec_df = pd.DataFrame(book_vectors)
book_vec_df.to_csv('book_vec_dict.csv', index=False)

In [None]:
# Re-key the per-book vectors by embedding index.
# NOTE(review): this assumes book_vectors is keyed by raw book_id; if its keys
# are already embedding indices this lookup raises KeyError or re-maps
# incorrectly — verify against the cell that fills book_vectors.
new_book_vectors = {book_id_to_embedding_id[k]:v for k, v in book_vectors.items()}

In [None]:
book_features = torch.tensor([list(v) for v in book_vectors.values()])

In [None]:
book_features.shape

## Step 3: Create Embedding Model

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BookEmbeddingNet(nn.Module):
    """Two-layer feed-forward network over book feature vectors.

    ``fc1`` projects the input features to ``embedding_dim`` units (followed by
    a ReLU); ``fc2`` maps that hidden representation to ``num_books`` outputs.
    """

    def __init__(self, num_books, book_feature_dim, embedding_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_features=book_feature_dim, out_features=embedding_dim)
        self.fc2 = nn.Linear(in_features=embedding_dim, out_features=num_books)

    def forward(self, book_features):
        """Return the raw ``fc2`` outputs (one value per book) for the input."""
        hidden = torch.relu(self.fc1(book_features))
        return self.fc2(hidden)

In [None]:
# Transform Goodreads shelves into interaction data
def get_label_id(book_ids):
    """Translate an iterable of book ids into a list of embedding indices.

    Looks every id up in the module-level ``book_id_to_embedding_id`` mapping;
    an id missing from the mapping raises ``KeyError``.
    """
    return list(map(book_id_to_embedding_id.__getitem__, book_ids))

grouped_interactions = train_df[['user_id', 'book_id']].groupby('user_id')['book_id'].apply(get_label_id).reset_index()

In [None]:
user_to_index = {user_id: i for i, user_id in enumerate(train_df['user_id'].unique())}

In [None]:
# users x books interaction matrix, initialised to all zeros.
# Despite the name, torch.zeros allocates a *dense* tensor holding
# num_users * num_books float values, so memory grows with the product of both.
sparse_tensor = torch.zeros(num_users, num_books, dtype=torch.float)

In [None]:
# Mark every (user, book) pair that occurs in the interactions with 1.0,
# setting all of a user's book columns in a single indexed assignment.
for user_id, book_indices in zip(grouped_interactions['user_id'], grouped_interactions['book_id']):
    sparse_tensor[user_to_index[user_id], book_indices] = 1.0

In [None]:
sparse_tensor.shape

In [None]:
import torch.optim as optim

In [None]:
# Positional arguments are (num_books, book_feature_dim=121, embedding_dim=128).
# NOTE(review): 121 must equal the length of the per-book feature vectors fed
# to the model; it is hard-coded here — confirm it still matches after any
# change to the feature columns or embedding slices.
model = BookEmbeddingNet(num_books, 121, 128)
# Mean-squared error between the model outputs and the target tensor.
criterion = nn.MSELoss()
# Plain stochastic gradient descent over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [None]:
num_epochs = 1
max_users = 1  # development guard: only the first `max_users` users are trained on
for epoch in range(num_epochs):
    for i, row in grouped_interactions.iterrows():
        if i >= max_users:
            break

        user_index = user_to_index[row['user_id']]
        # Target: this user's 0/1 row of the interaction matrix.
        labels = sparse_tensor[user_index]

        running_loss = 0.0
        num_steps = 0
        for book_index in row['book_id']:
            # Explicit float32 so the input dtype matches the Linear weights
            # regardless of how the stored vector was built.
            book_features = torch.tensor(list(new_book_vectors[book_index]), dtype=torch.float32)
            # Clear gradients from the previous step; without this, backward()
            # keeps accumulating into .grad and every update uses stale sums.
            optimizer.zero_grad()
            outputs = model(book_features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_steps += 1

        # Report the mean loss over the steps actually taken for this user
        # (previously divided by a hard-coded 100).
        print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(grouped_interactions)}], Loss: {running_loss / max(num_steps, 1):.4f}')

print('Finished Training')

In [None]:
torch.save(model, 'model_test.pt')

In [None]:
# 'model_test.pt' holds the whole pickled module (it was written with
# torch.save(model, ...)), not a state_dict, so it has to be loaded with
# weights_only=False; recent PyTorch releases default to weights_only=True and
# refuse to unpickle full modules.
# Unpickling can execute arbitrary code — only load checkpoints you created.
embedding_net = torch.load('model_test.pt', weights_only=False)

In [None]:
embedding_net.eval()

In [None]:
embedding_net.__dir__()

In [None]:
# The first linear layer is registered as `fc1` on BookEmbeddingNet; `fc_1`
# does not exist and raised AttributeError.
embedding_net.fc1

In [None]:
# Project one book's feature vector through the first linear layer (`fc1`, not
# `fc_1`). float32 is requested explicitly so the input matches the layer's
# weight dtype.
# NOTE(review): assumes book_vectors is keyed by raw book_id — confirm.
embedding_net.fc1(torch.tensor(list(book_vectors[9712492]), dtype=torch.float32))

In [None]:
hasattr(embedding_net, 'fc1')

In [None]:
torch.tensor(list(book_vectors[9712492])).shape

In [None]:
# Print every (name, parameter) pair registered on the loaded network.
for name_and_param in embedding_net.named_parameters():
    print(name_and_param)