In [1]:
import torch
import altair as alt
import polars as pl
from vega_datasets import data
pl.Config.set_tbl_cols(20)

polars.config.Config

In [2]:
# Import a dataset from polars
df = pl.scan_parquet(
    "/Users/khangnghiem/Filen/data/datasets/jniimi/tripadvisor-review-rating/data/train-00000-of-00001.parquet",
)
cols = [
    "hotel_id",
    "user_id",
    "title",
    "text",
    "overall",
    "cleanliness",
    "value",
    "location",
    "rooms",
    "sleep_quality",
    "stay_year",
    "post_date",
    "review",
    "char",
]
df = df.select(cols).collect()
df.schema

Schema([('hotel_id', Int64),
        ('user_id', String),
        ('title', String),
        ('text', String),
        ('overall', Float64),
        ('cleanliness', Float64),
        ('value', Float64),
        ('location', Float64),
        ('rooms', Float64),
        ('sleep_quality', Float64),
        ('stay_year', Int64),
        ('post_date', Datetime(time_unit='ns', time_zone=None)),
        ('review', String),
        ('char', Int64)])

In [3]:
df.sort("post_date", descending=True)


hotel_id,user_id,title,text,overall,cleanliness,value,location,rooms,sleep_quality,stay_year,post_date,review,char
i64,str,str,str,f64,f64,f64,f64,f64,f64,i64,datetime[ns],str,i64
147779618,"""C5AAB7A57D8D4987C953D8E8500669…","""A Kimpton Classic""","""The Monaco in Denver epitomize…",5.0,5.0,5.0,5.0,5.0,5.0,2012,2012-12-20 00:00:00,"""A Kimpton Classic The Monaco i…",270
147765131,"""3AAEB9513C4857D62E6F84452050A5…","""Big Rooms, central location""","""stayed for 3 nights for work a…",4.0,3.0,4.0,4.0,3.0,4.0,2012,2012-12-20 00:00:00,"""Big Rooms, central location st…",276
147780061,"""6A2444D955085DC92F107897FB2198…","""Great Hotel; Great Location""","""We stayed at The River Inn in …",5.0,5.0,5.0,5.0,5.0,5.0,2012,2012-12-20 00:00:00,"""Great Hotel; Great Location We…",430
147767791,"""1DE75FA343DB9DDE99B70089937968…","""Michigan Avenue/Grant Park""","""Great location, right on Michi…",3.0,3.0,3.0,4.0,3.0,4.0,2012,2012-12-20 00:00:00,"""Michigan Avenue/Grant Park Gre…",1650
147768078,"""4D4F5E731C6F36026CE4CE0E987167…","""Satisfactory Accomodations wit…","""This hotel is conveniently loc…",4.0,5.0,5.0,5.0,5.0,5.0,2012,2012-12-20 00:00:00,"""Satisfactory Accomodations wit…",486
…,…,…,…,…,…,…,…,…,…,…,…,…,…
122370775,"""F7D55C04DF8108FDC4083EAF68C527…","""Kimpton hospitality, great loc…","""This was our second experience…",5.0,4.0,5.0,4.0,5.0,5.0,2012,2012-01-01 00:00:00,"""Kimpton hospitality, great loc…",1385
122370165,"""0451ABA2D90A7915D78DC4CA4238DA…","""Outstanding Columbus Hotel""","""We were thrilled during a one …",5.0,5.0,5.0,5.0,5.0,5.0,2012,2012-01-01 00:00:00,"""Outstanding Columbus Hotel We …",2049
122374890,"""AC5F1278AB138E324DDD45F8CB2D88…","""Very Nice Stay""","""We stayed here on the last day…",4.0,5.0,5.0,4.0,5.0,5.0,2012,2012-01-01 00:00:00,"""Very Nice Stay We stayed here …",739
122374096,"""E727FDAA92FE4EC3465AF9EE9B0D0A…","""an affordable, hip hotel""","""This hotel is very hip but it …",4.0,3.0,4.0,3.0,4.0,3.0,2012,2012-01-01 00:00:00,"""an affordable, hip hotel This …",792


In [4]:
df = df.with_columns(
    pl.concat_str(
        [pl.col("title"), pl.col("text"), pl.col("review")], separator=". "
    ).alias("full_text")
)
df

hotel_id,user_id,title,text,overall,cleanliness,value,location,rooms,sleep_quality,stay_year,post_date,review,char,full_text
i64,str,str,str,f64,f64,f64,f64,f64,f64,i64,datetime[ns],str,i64,str
127781101,"""2262DCBFC351F42A9DD30AC8BAD246…","""Really excellent Hilton""","""Stayed here on business trips …",5.0,4.0,5.0,4.0,5.0,4.0,2012,2012-04-13 00:00:00,"""Really excellent Hilton Stayed…",204,"""Really excellent Hilton. Staye…"
137380592,"""8477E11DABF4D6743885E401BB4C8C…","""Exceptional service and comfor…","""Spent two nights here for a we…",5.0,5.0,4.0,5.0,4.0,5.0,2012,2012-08-16 00:00:00,"""Exceptional service and comfor…",621,"""Exceptional service and comfor…"
129673371,"""483A193B7113ADFFD5CE30849564F6…","""Nice room and five star servic…","""Great place for a 3-night stay…",5.0,5.0,5.0,3.0,5.0,4.0,2012,2012-05-09 00:00:00,"""Nice room and five star servic…",1259,"""Nice room and five star servic…"
129006626,"""E5A63DD7239A7057746D4644A5C986…","""BRILLIANT hotel, my #1 Chicago…","""This is my favorite hotel in C…",5.0,5.0,5.0,5.0,5.0,5.0,2012,2012-04-28 00:00:00,"""BRILLIANT hotel, my #1 Chicago…",2242,"""BRILLIANT hotel, my #1 Chicago…"
139168159,"""CBFE281C9386225267BC52518836A6…","""Convenient and comfortable""","""BEST. BREAKFAST. EVER. Couldn'…",5.0,5.0,4.0,5.0,4.0,5.0,2012,2012-09-02 00:00:00,"""Convenient and comfortable BES…",511,"""Convenient and comfortable. BE…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
135811676,"""F343B7C4978A928494222A63EA3325…","""Great find in cool neighborhoo…","""Although a native NY'er, I hav…",4.0,4.0,4.0,5.0,4.0,4.0,2012,2012-07-30 00:00:00,"""Great find in cool neighborhoo…",1545,"""Great find in cool neighborhoo…"
129553333,"""AF4C4D7F38A7CB053D226AE7954294…","""Exceptional Service and great …","""I have stayed here a couple of…",5.0,5.0,5.0,5.0,5.0,5.0,2012,2012-05-07 00:00:00,"""Exceptional Service and great …",230,"""Exceptional Service and great …"
143681514,"""C88ACB857117608D118C91D56F7953…","""Beware of the Rip Off!""","""I received a call for a two da…",2.0,2.0,1.0,4.0,2.0,1.0,2012,2012-10-25 00:00:00,"""Beware of the Rip Off! I recei…",572,"""Beware of the Rip Off!. I rece…"
139622044,"""39EE66822F41168E3867C9DBED8108…","""It Deserves the Rating""","""I really like Kimptons. Every …",5.0,5.0,4.0,5.0,5.0,5.0,2012,2012-09-07 00:00:00,"""It Deserves the Rating I reall…",746,"""It Deserves the Rating. I real…"


In [17]:
df[:10000,:]
df[5000:10000]

hotel_id,user_id,title,text,overall,cleanliness,value,location,rooms,sleep_quality,stay_year,post_date,review,char,full_text
i64,str,str,str,f64,f64,f64,f64,f64,f64,i64,datetime[ns],str,i64,str
142913865,"""77FF109932C95C1D93FEE2395AFA57…","""Best service I have ever had""","""I wanted to make sure that som…",5.0,5.0,5.0,1.0,5.0,5.0,2012,2012-10-15 00:00:00,"""Best service I have ever had I…",954,"""Best service I have ever had. …"
126384064,"""9A02AA8684EFEDE42B73CE22F16EAC…","""Excellent Stay - Highly recomm…","""I stayed here for 2 nights in …",5.0,5.0,5.0,5.0,5.0,5.0,2012,2012-03-20 00:00:00,"""Excellent Stay - Highly recomm…",347,"""Excellent Stay - Highly recomm…"
135274051,"""DB737A29579A493058C0B234DD3A0B…","""Great Hotel, great location!""","""We arrived very early on the m…",5.0,5.0,4.0,5.0,5.0,5.0,2012,2012-07-24 00:00:00,"""Great Hotel, great location! W…",525,"""Great Hotel, great location!. …"
128009145,"""EB529259C0CE702F90FEC00EB3A852…","""Excellent hotel in great locat…","""Stayed here 31st March for fou…",5.0,5.0,4.0,5.0,5.0,5.0,2012,2012-04-16 00:00:00,"""Excellent hotel in great locat…",1102,"""Excellent hotel in great locat…"
126675234,"""B660303E13948BBEE736D03A5E329F…","""All round good hotel""","""Hi, The hotel staff are friend…",5.0,4.0,4.0,4.0,4.0,4.0,2012,2012-03-26 00:00:00,"""All round good hotel Hi, The h…",122,"""All round good hotel. Hi, The …"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
127355094,"""39A868831A88A991DD47311FB4AFB6…","""Classy yet comfortable""","""The Fairmont is a classy hotel…",5.0,5.0,4.0,5.0,5.0,5.0,2012,2012-04-06 00:00:00,"""Classy yet comfortable The Fai…",756,"""Classy yet comfortable. The Fa…"
128462402,"""CAF25F8A5F001532323A0B614F994C…","""Blackwell at OSU""","""Stayed for two nights to visit…",5.0,5.0,5.0,5.0,4.0,5.0,2012,2012-04-23 00:00:00,"""Blackwell at OSU Stayed for tw…",418,"""Blackwell at OSU. Stayed for t…"
137366233,"""43E4C479BDD8E2C6AF94351B402FA2…","""Such a wonderful hotel and gre…","""I conducted a meeting at this …",5.0,5.0,4.0,5.0,5.0,5.0,2012,2012-08-16 00:00:00,"""Such a wonderful hotel and gre…",339,"""Such a wonderful hotel and gre…"
135927676,"""394E560D484070ADDD53FC2DE50B69…","""Clean & friendly staff!""","""We are from Montana so we expe…",4.0,5.0,4.0,3.0,5.0,5.0,2012,2012-07-31 00:00:00,"""Clean & friendly staff! We are…",383,"""Clean & friendly staff!. We ar…"


In [6]:
def add_one(x):
    return x + 1
df.with_columns(
    pl.col("value").map_batches(lambda x: add_one(x), return_dtype=pl.Float64)
)

hotel_id,user_id,title,text,overall,cleanliness,value,location,rooms,sleep_quality,stay_year,post_date,review,char,full_text
i64,str,str,str,f64,f64,f64,f64,f64,f64,i64,datetime[ns],str,i64,str
127781101,"""2262DCBFC351F42A9DD30AC8BAD246…","""Really excellent Hilton""","""Stayed here on business trips …",5.0,4.0,6.0,4.0,5.0,4.0,2012,2012-04-13 00:00:00,"""Really excellent Hilton Stayed…",204,"""Really excellent Hilton. Staye…"
137380592,"""8477E11DABF4D6743885E401BB4C8C…","""Exceptional service and comfor…","""Spent two nights here for a we…",5.0,5.0,5.0,5.0,4.0,5.0,2012,2012-08-16 00:00:00,"""Exceptional service and comfor…",621,"""Exceptional service and comfor…"
129673371,"""483A193B7113ADFFD5CE30849564F6…","""Nice room and five star servic…","""Great place for a 3-night stay…",5.0,5.0,6.0,3.0,5.0,4.0,2012,2012-05-09 00:00:00,"""Nice room and five star servic…",1259,"""Nice room and five star servic…"
129006626,"""E5A63DD7239A7057746D4644A5C986…","""BRILLIANT hotel, my #1 Chicago…","""This is my favorite hotel in C…",5.0,5.0,6.0,5.0,5.0,5.0,2012,2012-04-28 00:00:00,"""BRILLIANT hotel, my #1 Chicago…",2242,"""BRILLIANT hotel, my #1 Chicago…"
139168159,"""CBFE281C9386225267BC52518836A6…","""Convenient and comfortable""","""BEST. BREAKFAST. EVER. Couldn'…",5.0,5.0,5.0,5.0,4.0,5.0,2012,2012-09-02 00:00:00,"""Convenient and comfortable BES…",511,"""Convenient and comfortable. BE…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
135811676,"""F343B7C4978A928494222A63EA3325…","""Great find in cool neighborhoo…","""Although a native NY'er, I hav…",4.0,4.0,5.0,5.0,4.0,4.0,2012,2012-07-30 00:00:00,"""Great find in cool neighborhoo…",1545,"""Great find in cool neighborhoo…"
129553333,"""AF4C4D7F38A7CB053D226AE7954294…","""Exceptional Service and great …","""I have stayed here a couple of…",5.0,5.0,6.0,5.0,5.0,5.0,2012,2012-05-07 00:00:00,"""Exceptional Service and great …",230,"""Exceptional Service and great …"
143681514,"""C88ACB857117608D118C91D56F7953…","""Beware of the Rip Off!""","""I received a call for a two da…",2.0,2.0,2.0,4.0,2.0,1.0,2012,2012-10-25 00:00:00,"""Beware of the Rip Off! I recei…",572,"""Beware of the Rip Off!. I rece…"
139622044,"""39EE66822F41168E3867C9DBED8108…","""It Deserves the Rating""","""I really like Kimptons. Every …",5.0,5.0,5.0,5.0,5.0,5.0,2012,2012-09-07 00:00:00,"""It Deserves the Rating I reall…",746,"""It Deserves the Rating. I real…"


In [18]:
from transformers import AutoTokenizer, AutoModel
import torch
import asyncio
import os

# Initialize model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embeddings(text):
    # Tokenize
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=512,  # BERT's max token limit
        return_tensors="pt",
    )

    # Get embeddings
    with torch.no_grad():
        outputs = model(**inputs)

    # Use [CLS] token embedding as sentence representation
    embedding = outputs.last_hidden_state[:, 0, :].tolist()[0]

    return embedding


# Create temp directory if it doesn't exist
os.makedirs("./temp", exist_ok=True)

# Split df into batches of size 10000
batch_size = 10000
num_batches = (len(df) + batch_size - 1) // batch_size
print(num_batches)
for i in range(num_batches):
    batch = df[i * batch_size : (i + 1) * batch_size]
    
    # Perform get_embeddings for each batch
    batch = batch.with_columns(
        text_embeddings=pl.col("full_text").map_elements(
            get_embeddings, return_dtype=pl.List(pl.Float64)
        )
    )
    
    # Save each batch to the temp folder
    batch.write_parquet(f"./temp/batch_{i}.parquet")

21


In [15]:
len(text_embeddings[0])

768

In [None]:
df = df.with_columns(
    pl.Series("text_embeddings", text_embeddings).cast(pl.List(pl.Float64))
)

In [22]:
import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# Extract numerical features (example columns)
numerical_cols = ["cleanliness", "value", "location", "rooms", "sleep_quality"]
numerical_data = df.select(numerical_cols).to_numpy()
numerical_tensor = torch.tensor(numerical_data, dtype=torch.float32)


class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, input_dim),
        )

    def forward(self, x):
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return reconstructed

# Hyperparameters
input_dim = 775  # BERT (768) + numerical (7)
latent_dim = 64
epochs = 10

# Initialize
model = Autoencoder(input_dim, latent_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

dataloader = torch.utils.data.DataLoader(
    df, batch_size=64, shuffle=True
)
# Training loop
for epoch in range(epochs):
    for batch in dataloader:
        inputs = batch[0]
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'polars.dataframe.frame.DataFrame'>