In [None]:
import polars as pl

# Relative path to the newline-delimited JSON file loaded below
# (resolved against the kernel's current working directory).
file_path = '../../data/letterboxd_filtered_review.jsonl'


In [None]:
# Read the newline-delimited JSON file into an eager Polars DataFrame.
try:
    df = pl.read_ndjson(file_path)
except Exception as e:
    # Print a short, readable message, then re-raise instead of calling
    # exit(): the cell fails with a full traceback, "Run All" halts here,
    # and the kernel stays available for debugging.
    print(f"Error loading file: {e}\n")
    raise
else:
    print("Successfully loaded!\n")

In [None]:
# Dataset overview: row count, estimated in-memory size, and NULLs per column.
n_rows = len(df)
print(f"Number of rows: {n_rows:,}")
print(f"Memory size: {df.estimated_size('mb'):.2f} MB")


# null_count() yields a one-row frame with the NULL tally of every column.
null_counts = df.null_count()
print("\nNumber of NULL values per column:")
for col in df.columns:
    null_count = null_counts[col][0]
    # Guard the ratio so an empty frame reports 0% instead of raising
    # ZeroDivisionError.
    percentage = (null_count / n_rows) * 100 if n_rows else 0.0
    print(f"  {col:20s}: {null_count:6d} ({percentage:5.2f}%)")

In [None]:
# Distinct titles among the rows whose synopsis is NULL.
# maintain_order=True keeps the titles in their original row order, so the
# preview printed below is the same on every run (unique() does not preserve
# order by default).
df_null_values = (
    df
    .filter(pl.col('synopsis').is_null())
    .select('title')
    .unique(maintain_order=True)
)

print(f"Number of different movies with NULL synopsis: {len(df_null_values)}")
print(df_null_values.head(10))

In [None]:
# Word-count summary (min / max / mean / total) for each free-text column.
interesting_cols = ['synopsis', 'review_text']
for col in interesting_cols:
    # Count maximal runs of non-whitespace characters. Splitting on a single
    # space would count empty tokens between repeated spaces, treat an empty
    # string as one word, and merge words separated by newlines or tabs.
    words = df.filter(pl.col(col).is_not_null()).select(
        pl.col(col).str.count_matches(r'\S+').alias('word_count')
    )

    stats = words.select(
        pl.col('word_count').min().alias('min'),
        pl.col('word_count').max().alias('max'),
        pl.col('word_count').mean().alias('mean'),
        pl.col('word_count').sum().alias('total'),
    )

    print(f"\n{col}:")
    print(stats)

In [None]:
# Movies whose synopsis has at most MAX_SYNOPSIS_WORDS words, one row per title.
MAX_SYNOPSIS_WORDS = 10

short_synopsis = df.filter(
    pl.col('synopsis').is_not_null() &
    # Words are runs of non-whitespace characters, so repeated spaces,
    # newlines and tabs do not distort the count.
    (pl.col('synopsis').str.count_matches(r'\S+') <= MAX_SYNOPSIS_WORDS)
).select(['title', 'synopsis', 'review_text']).unique(
    # Keep the first row of each title, in original order, so the sample
    # printed below is reproducible between runs.
    subset=['title'], keep='first', maintain_order=True
)

print(f"Movies with short synopses: {len(short_synopsis)}")
for row in short_synopsis.head().iter_rows(named=True):
    print(f"\nTitle: {row['title']}")
    print(f"Synopsis: {row['synopsis']}")
    print("-" * 80)

In [None]:
# Reviews with at most MAX_REVIEW_WORDS words (one row per review; titles may repeat).
MAX_REVIEW_WORDS = 10

short_review = df.filter(
    pl.col('review_text').is_not_null() &
    # Words are runs of non-whitespace characters, so repeated spaces,
    # newlines and tabs do not distort the count.
    (pl.col('review_text').str.count_matches(r'\S+') <= MAX_REVIEW_WORDS)
).select(['title', 'synopsis', 'review_text'])

# The frame is not de-duplicated by title, so this is a count of reviews,
# not of movies.
print(f"Number of short reviews: {len(short_review)}")
for row in short_review.head().iter_rows(named=True):
    print(f"\nTitle: {row['title']}")
    print(f"Review: {row['review_text']}")
    print("-" * 80)
