In [87]:
import polars as pl

# Newline-delimited JSON file analysed in this notebook. The path is relative,
# so it resolves against the kernel's current working directory.
file_path = '../../data/letterboxd_filtered_review.jsonl'


In [88]:
# Load the newline-delimited JSON file into a Polars DataFrame.
# On failure, print a short message and re-raise so the cell fails with the
# full traceback. Calling exit() here would end the kernel session instead of
# surfacing the error, discarding all notebook state.
try:
    df = pl.read_ndjson(file_path)
except Exception as e:
    print(f"Error loading file: {e}\n")
    raise
else:
    # Only reached when the read succeeded.
    print("Successfully loaded!\n")

Successfully loaded!



In [90]:
# Overview of the loaded frame: row count, estimated in-memory size, and the
# number and percentage of nulls in each column.
n_rows = len(df)
print(f"Number of rows: {n_rows:,}")
print(f"Memory size: {df.estimated_size('mb'):.2f} MB")

# null_count() yields a one-row frame holding the null count of every column.
null_counts = df.null_count()
print("\nNumber of NULL values per column:")
for col in df.columns:
    null_count = null_counts[col][0]
    # Guard the division so an empty frame reports 0% instead of raising
    # ZeroDivisionError.
    percentage = (null_count / n_rows) * 100 if n_rows else 0.0
    print(f"  {col:20s}: {null_count:6d} ({percentage:5.2f}%)")

Number of rows: 2,027,869
Memory size: 971.44 MB

Number of NULL values per column:
  title               :      0 ( 0.00%)
  year                :      0 ( 0.00%)
  synopsis            :  48256 ( 2.38%)
  review_text         :      0 ( 0.00%)


In [92]:
# Distinct titles among the rows whose synopsis is missing.
# maintain_order=True keeps first-occurrence order so the preview printed
# below is the same on every run; without it unique() does not promise any
# particular row order.
# NOTE(review): de-duplication is by title only, so different films sharing a
# title (e.g. released in different years) are counted once - confirm that
# this is intended.
df_null_values = (
    df.filter(pl.col('synopsis').is_null())
    .select('title')
    .unique(maintain_order=True)
)

print(f"Number of different movies with NULL synopsis: {len(df_null_values)}")
print(df_null_values.head(10))

Number of different movies with NULL synopsis: 15424
shape: (10, 1)
┌─────────────────────────────────┐
│ title                           │
│ ---                             │
│ str                             │
╞═════════════════════════════════╡
│ Red Hot Chili Peppers - Austin… │
│ Apocalypse                      │
│ 8e étage                        │
│ NADA                            │
│ Máma má raka                    │
│ Yasmina                         │
│ Galinha ao Molho Pardo          │
│ Lost Explorer                   │
│ Quand punir ne suffit pas       │
│ Max & Ruby's Christmas          │
└─────────────────────────────────┘


In [93]:
# Word-count summary (min / max / mean / total) for each free-text column.
# A "word" here is a piece produced by splitting the text on a single space.
interesting_cols = ['synopsis', 'review_text']

# The aggregations are the same for every column, so build them once.
summary_exprs = [
    pl.col('word_count').min().alias('min'),
    pl.col('word_count').max().alias('max'),
    pl.col('word_count').mean().alias('mean'),
    pl.col('word_count').sum().alias('total'),
]

for col in interesting_cols:
    # Drop nulls first so only rows that actually have text are measured.
    non_null = df.filter(pl.col(col).is_not_null())
    words = non_null.select(
        pl.col(col).str.split(' ').list.len().alias('word_count')
    )
    stats = words.select(summary_exprs)

    print(f"\n{col}:")
    print(stats)


synopsis:
shape: (1, 4)
┌─────┬─────┬───────────┬──────────┐
│ min ┆ max ┆ mean      ┆ total    │
│ --- ┆ --- ┆ ---       ┆ ---      │
│ u32 ┆ u32 ┆ f64       ┆ u32      │
╞═════╪═════╪═══════════╪══════════╡
│ 1   ┆ 203 ┆ 46.926372 ┆ 92896057 │
└─────┴─────┴───────────┴──────────┘

review_text:
shape: (1, 4)
┌─────┬─────┬───────────┬──────────┐
│ min ┆ max ┆ mean      ┆ total    │
│ --- ┆ --- ┆ ---       ┆ ---      │
│ u32 ┆ u32 ┆ f64       ┆ u32      │
╞═════╪═════╪═══════════╪══════════╡
│ 1   ┆ 166 ┆ 34.523416 ┆ 70008965 │
└─────┴─────┴───────────┴──────────┘


In [94]:
# Films whose synopsis has at most 10 space-separated words, one row per title.
# keep='first' together with maintain_order=True makes both the retained row
# and the preview order deterministic; otherwise unique() need not preserve
# row order and the duplicate it keeps may be unspecified.
max_synopsis_words = 10
short_synopsis = (
    df.filter(
        pl.col('synopsis').is_not_null()
        & (pl.col('synopsis').str.split(' ').list.len() <= max_synopsis_words)
    )
    .select(['title', 'synopsis', 'review_text'])
    .unique(subset=['title'], keep='first', maintain_order=True)
)

print(f"Movies with short synopses: {len(short_synopsis)}")
for row in short_synopsis.head().iter_rows(named=True):
    print(f"\nTitle: {row['title']}")
    print(f"Synopsis: {row['synopsis']}")
    print("-" * 80)

Movies with short synopses: 16683

Title: Process Red
Synopsis: An experimental short film by Hollis Frampton of contrasting colours.
--------------------------------------------------------------------------------

Title: In Ur Eye
Synopsis: A short documentary on the gentrification of Hackney.
--------------------------------------------------------------------------------

Title: Ko-Ko the Kid
Synopsis: Koko the Clown seeks the Fountain of Youth.
--------------------------------------------------------------------------------

Title: Just Having Fun at a Grueling Job
Synopsis: Many stickmans working and playing with a rubbish paperball.
--------------------------------------------------------------------------------

Title: Ration Blues
Synopsis: A Soundie with Louis Jordan and his Orchestra.
--------------------------------------------------------------------------------


In [95]:
# Reviews made of at most 10 space-separated words. No de-duplication is
# applied, so every row is one review and a title may appear many times.
short_review = df.filter(
    pl.col('review_text').is_not_null() &
    (pl.col('review_text').str.split(' ').list.len() <= 10)
).select(['title', 'synopsis', 'review_text'])

# len() counts review rows, not distinct movies, so the label says "reviews".
print(f"Number of short reviews: {len(short_review)}")
for row in short_review.head().iter_rows(named=True):
    print(f"\nTitle: {row['title']}")
    print(f"Review: {row['review_text']}")
    print("-" * 80)


Movies with short review: 589765

Title: Come and See
Review: What a horrible nightmare!
--------------------------------------------------------------------------------

Title: Come and See
Review: (guy who's still buzzing from Spider-Man: Across the Spider-Verse)
--------------------------------------------------------------------------------

Title: Come and See
Review: Playing this at grandma’s bingo party next week.
--------------------------------------------------------------------------------

Title: Seven Samurai
Review: too many sweaty ass cheeks, 5 stars
--------------------------------------------------------------------------------

Title: Seven Samurai
Review: this is what the avengers think they look like
--------------------------------------------------------------------------------
