In [1]:
# Read the pre-combined Hacker News dataset from parquet and print a quick profile of it.
import pandas as pd

final_df = pd.read_parquet('workspace/data/hackernews_full_data.parquet')

# Size of the frame: row count and column names.
print(f"Loaded dataset: {final_df.shape[0]:,} rows")
print(f"Columns: {final_df.columns.tolist()}")

# deep=True asks pandas to inspect object-dtype columns instead of counting
# only the references they hold; the byte total is reported in MB.
total_bytes = final_df.memory_usage(deep=True).sum()
print(f"Memory usage: {total_bytes / 1024**2:.1f} MB")

# Value ranges of the two columns used for later filtering/analysis.
time_col = final_df['time']
print(f"Date range: {time_col.min()} to {time_col.max()}")
score_col = final_df['score']
print(f"Score range: {score_col.min()} to {score_col.max()}")

# Last expression of the cell: rich display of the first rows.
final_df.head()

Loaded dataset: 4,902,536 rows
Columns: ['id', 'by', 'time', 'url', 'score', 'title', 'descendants']
Memory usage: 1568.9 MB
Date range: 2006-10-09 19:21:51 to 2024-10-13 23:53:00
Score range: -1 to 6015


Unnamed: 0,id,by,time,url,score,title,descendants
0,1,pg,2006-10-09 19:21:51,http://ycombinator.com,57,Y Combinator,15.0
1,2,phyllis,2006-10-09 19:30:28,http://www.paulgraham.com/mit.html,16,A Student's Guide to Startups,0.0
2,3,phyllis,2006-10-09 19:40:33,http://www.foundersatwork.com/stevewozniak.html,7,Woz Interview: the early days of Apple,0.0
3,4,onebeerdave,2006-10-09 19:47:42,http://avc.blogs.com/a_vc/2006/10/the_nyc_deve...,5,NYC Developer Dilemma,0.0
4,5,perler,2006-10-09 19:51:04,http://www.techcrunch.com/2006/10/09/google-yo...,7,"Google, YouTube acquisition announcement could...",0.0


In [3]:

# Pandas translation of the original SQL query: one row filter plus two derived title columns.
# The type='story' and by IS NOT NULL conditions are not repeated here because,
# per the note carried over from the original cell, they were applied at load time.

# Row filter: score >= 0
has_valid_score = final_df['score'].ge(0)

# .assign returns a new frame, so final_df itself is never modified.
filtered_df = final_df.loc[has_valid_score].assign(
    # number of characters in the title
    title_length=lambda frame: frame['title'].str.len(),
    # strip, split on whitespace, then count the resulting tokens
    title_word_count=lambda frame: frame['title'].str.strip().str.split().str.len(),
)

# Keep only the reporting columns. No sort is applied here, so rows stay in
# the same order as final_df.
result_df = filtered_df.loc[:, ['id', 'score', 'title', 'title_length', 'title_word_count']]

print(f"Filtered dataset: {len(result_df):,} rows")
print("\nSample results:")
result_df.head(10)

Filtered dataset: 4,902,535 rows

Sample results:


Unnamed: 0,id,score,title,title_length,title_word_count
0,1,57,Y Combinator,12,2
1,2,16,A Student's Guide to Startups,29,5
2,3,7,Woz Interview: the early days of Apple,38,7
3,4,5,NYC Developer Dilemma,21,3
4,5,7,"Google, YouTube acquisition announcement could...",59,7
5,6,4,Business Intelligence the Inkling Way: cool pr...,71,9
6,7,5,Sevin Rosen Unfunds - why?,26,5
7,8,10,LikeBetter featured by BBC,26,4
8,9,4,weekendr: social network for the weekend,40,6
9,10,3,PhotoShow: Broadcast Photos to Cable TV,39,6
