In [None]:
# Build the cleaned article set from the raw combined CSV as one pipeline.
# Step order is significant: de-duplication keeps the first occurrence, so the
# filters placed before and after the body de-duplication must stay where they are.
# NOTE(review): the length regex keeps only the first run of digits, so a value
# written as "1,234 words" would parse as 1 and be removed by the >= 100 filter.
combined_df_nodup = (
    pd.read_csv('/Users/jingguo/Desktop/OPT/NLP/combined_articles.csv')
    # 1) Exact duplicates on title + body
    .drop_duplicates(subset=['title', 'body'])
    # 2) Label missing publishers; 3) numeric helper parsed from 'length'
    #    (handles "2121 words" as well as plain numbers)
    .assign(
        publisher=lambda d: d['publisher'].fillna('Other publishers not NYT'),
        length_num=lambda d: (
            d['length'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
        ),
    )
    # 4) Keep articles of at least 100 words (rows with no parsable length fail this test)
    .loc[lambda d: d['length_num'] >= 100]
    # 5) Drop placeholder headlines
    .loc[lambda d: d['title'] != 'No Headline In Original']
    # 6) Drop podcast sections (case-insensitive)
    .loc[lambda d: ~d['section'].str.contains('podcast', case=False, na=False)]
    # 7) Duplicate bodies: keep the first surviving row
    .drop_duplicates(subset=['body'])
    # 8) Drop 'The Ezra Klein Show' titles (case-sensitive)
    .loc[lambda d: ~d['title'].str.contains('The Ezra Klein Show', case=True, na=False)]
    # 9) Drop titles starting with 'Transcript', 'Full Transcript' or 'The Listings: '
    .loc[lambda d: ~d['title'].str.startswith(
        ('Transcript', 'Full Transcript', 'The Listings: '), na=False)]
    # 10) Drop titles that are exactly 'The Listings'
    .loc[lambda d: d['title'] != 'The Listings']
    # 11) Drop book round-ups ('Notable Books' anywhere in the title, or a title
    #     starting with 'BOOKS FOR VACATION READING') that are longer than 10000 words
    .loc[lambda d: ~(
        (
            d['title'].str.contains('Notable Books', case=False, na=False)
            | d['title'].str.startswith('BOOKS FOR VACATION READING', na=False)
        )
        & (d['length_num'] > 10000)
    )]
    # 12) The helper column is not part of the saved dataset
    .drop(columns=['length_num'])
    .reset_index(drop=True)
)

# Persist the cleaned dataset (without the index) and report its size
combined_df_nodup.to_csv('/Users/jingguo/Desktop/OPT/NLP/combined_articles_cleaned.csv', index=False)

print(f"\nFinal cleaned shape: {combined_df_nodup.shape}")



Final cleaned shape: (41056, 12)


In [None]:
# Read the cleaned articles back from disk so the following cells work from the saved file
combined_df_cleaned = pd.read_csv(
    '/Users/jingguo/Desktop/OPT/NLP/combined_articles_cleaned.csv'
)

# The shape should match what the cleaning step reported when it saved the file
print(f"Loaded cleaned dataset shape: {combined_df_cleaned.shape}")

# Preview the first five rows
combined_df_cleaned.head(5)


Loaded cleaned dataset shape: (41056, 12)


Unnamed: 0,title,date,section,length,body,source_file,publisher,publish_date,edition,section_info,author,url
0,The New York Times,September 01st 2024,Section A; Column 0; National Desk; Pg. 15,6500,Democrats hoped to lose by less in blue-collar...,1.DOCX,The New York Times,"November 19, 2024",Late Edition - Final,Section A; Column 0; National Desk; Pg. 15,Katie Glueck,https://www.nytimes.com/2024/11/18/us/politics...
1,A Real Working-Class Hero,"December 13, 2024",Section A; Column 0; Editorial Desk; Pg. 25,628,One of the more moving stories in The Times th...,1.DOCX,The New York Times,"December 13, 2024",Late Edition - Final,Section A; Column 0; Editorial Desk; Pg. 25,Bret Stephens,https://www.nytimes.com/live/2024/12/04/opinio...
2,Working Class Proves Elusive For Democrats,"November 2, 2024",Section A; Column 0; National Desk; Pg. 1,1737,Kamala Harris's plans offer a bigger boost for...,1.DOCX,The New York Times,"November 2, 2024",Late Edition - Final,Section A; Column 0; National Desk; Pg. 1,Jeanna Smialek,https://www.nytimes.com/2024/11/01/business/ec...
3,Strong Showing Spurs Midwest Mechanic to Empow...,"November 21, 2024",Section A; Column 0; National Desk; Pg. 14,958,"Mr. Osborn, the industrial mechanic who turned...",1.DOCX,The New York Times,"November 21, 2024",Late Edition - Final,Section A; Column 0; National Desk; Pg. 14,Jonathan Weisman,https://www.nytimes.com/2024/11/19/us/politics...
4,Is This the End of the White Working-Class Dem...,"November 18, 2024",US; politics,1640,Democrats hoped to lose by less in blue-collar...,1.DOCX,The New York Times,"November 18, 2024",,US; politics,,


In [None]:
# Post-cleaning sanity checks on the reloaded dataset. Apart from the publisher
# breakdown, every count printed here is expected to be 0 if the cleaning
# filters were applied before the CSV was saved.

# 1) Publisher breakdown (missing publishers were relabelled during cleaning)
print("\nPublisher breakdown:")
print(combined_df_cleaned['publisher'].value_counts())

# 2) Check length >= 100
# The parsed length is kept in a standalone Series instead of being written
# into combined_df_cleaned: that frame is exported again later in the notebook,
# and a helper column attached here would end up in the exported files.
# NOTE(review): the regex keeps only the first run of digits, so "1,234" parses as 1.
cleaned_length_num = (
    combined_df_cleaned['length'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
)
print("\nNumber of rows with length < 100:", (cleaned_length_num < 100).sum())

# 3) Check 'No Headline In Original' removed
print("\n'No Headline In Original' rows:", (combined_df_cleaned['title'] == 'No Headline In Original').sum())

# 4) Check 'podcast' in section removed (case-insensitive)
print("\nRows with 'podcast' in section:", combined_df_cleaned['section'].str.contains('podcast', case=False, na=False).sum())

# 5) Check 'The Ezra Klein Show' removed (case-sensitive)
print("\nRows with 'The Ezra Klein Show' in title:", combined_df_cleaned['title'].str.contains('The Ezra Klein Show', case=True, na=False).sum())

# 6) Check Transcript/Full Transcript titles removed
print("\nRows with title starting with 'Transcript' or 'Full Transcript':",
      combined_df_cleaned['title'].str.startswith(('Transcript', 'Full Transcript'), na=False).sum())

# 7) Check 'The Listings: ' prefix and exact 'The Listings' removed
print("\nRows with title starting with 'The Listings:':",
      combined_df_cleaned['title'].str.startswith('The Listings: ', na=False).sum())

print("Rows with title == 'The Listings':",
      (combined_df_cleaned['title'] == 'The Listings').sum())

# 8) Check Notable Books / BOOKS FOR VACATION READING with length > 10000 removed
print("\nRows with 'Notable Books' and length > 10000:",
      ((combined_df_cleaned['title'].str.contains('Notable Books', case=False, na=False)) &
       (cleaned_length_num > 10000)).sum())

print("Rows with 'BOOKS FOR VACATION READING' and length > 10000:",
      ((combined_df_cleaned['title'].str.startswith('BOOKS FOR VACATION READING', na=False)) &
       (cleaned_length_num > 10000)).sum())



Publisher breakdown:
publisher
The New York Times                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    26668
Other publishers not NYT                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [None]:
import csv  # standard library; supplies the named quoting constants

# Export a UTF-8 copy of the cleaned articles with every field quoted.
# drop(..., errors='ignore') removes the 'length_num' helper column if an
# earlier check attached it to the frame, and is a no-op otherwise, so the
# export carries the same columns as the cleaned CSV.
(
    combined_df_cleaned
    .drop(columns=['length_num'], errors='ignore')
    .to_csv('/Users/jingguo/Desktop/OPT/NLP/combined_articles_cleaned_utf8.csv',
            index=False,
            encoding='utf-8',
            quoting=csv.QUOTE_ALL)
)


In [None]:
# Export the cleaned articles to an Excel workbook (index omitted).
# drop(..., errors='ignore') removes the 'length_num' helper column if an
# earlier check attached it to the frame, and is a no-op otherwise.
# NOTE(review): Excel limits the text a single cell can hold, so very long
# 'body' values may not round-trip intact — confirm before relying on this file.
combined_df_cleaned.drop(columns=['length_num'], errors='ignore').to_excel(
    '/Users/jingguo/Desktop/OPT/NLP/combined_articles_cleaned.xlsx', index=False
)
