In [9]:
import sqlite3
import pandas as pd

In [2]:
# File path to the .db file
db_path = "all-the-news.db"

In [3]:
# Connect to the database
conn = sqlite3.connect(db_path)

In [4]:
# Query to retrieve all table names
query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = conn.execute(query).fetchall()

In [5]:
# Print the list of tables
print("Tables in the database:")
for table in tables:
    print(table[0])

Tables in the database:
longform


In [6]:
# Query to inspect the schema of the table
table_name = "longform"
query = f"PRAGMA table_info({table_name});"
columns = conn.execute(query).fetchall()

In [7]:
# Print the column names and types
print(f"Columns in table '{table_name}':")
for column in columns:
    print(f"Column: {column[1]}, Type: {column[2]}")

Columns in table 'longform':
Column: id, Type: 
Column: title, Type: TEXT
Column: author, Type: TEXT
Column: date, Type: TEXT
Column: content, Type: TEXT
Column: year, Type: INTEGER
Column: month, Type: INTEGER
Column: publication, Type: TEXT
Column: category, Type: TEXT
Column: digital, Type: INTEGER
Column: section, Type: TEXT
Column: url, Type: TEXT


In [10]:
# Query the entire table
query = "SELECT * FROM longform;"
data = pd.read_sql_query(query, conn)

In [11]:
csv_path = "all-the-news.csv"

In [12]:
# Export to CSV
data.to_csv(csv_path, index=False)

In [13]:
# Close the connection
conn.close()

In [15]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


Data Cleaning and Preprocessing

In [14]:
df = pd.read_csv("all-the-news.csv")

  df = pd.read_csv("all-the-news.csv")


In [16]:
df.head()

Unnamed: 0,id,title,author,date,content,year,month,publication,category,digital,section,url
0,1,Agent Cooper in Twin Peaks is the audience: on...,\nTasha Robinson\n,2017-05-31,And never more so than in Showtime’s new...,2017.0,5.0,Verge,Longform,1.0,,
1,2,"AI, the humanity!",\nSam Byford\n,2017-05-30,AlphaGo’s victory isn’t a defeat for hum...,2017.0,5.0,Verge,Longform,1.0,,
2,3,The Viral Machine,\nKaitlyn Tiffany\n,2017-05-25,Super Deluxe built a weird internet empi...,2017.0,5.0,Verge,Longform,1.0,,
3,4,How Anker is beating Apple and Samsung at thei...,\nNick Statt\n,2017-05-22,Steven Yang quit his job at Google in th...,2017.0,5.0,Verge,Longform,1.0,,
4,5,Tour Black Panther’s reimagined homeland with ...,\nKwame Opam\n,2017-05-15,Ahead of Black Panther’s 2018 theatrical...,2017.0,5.0,Verge,Longform,1.0,,


In [20]:
df = df[['date', 'title', 'content', 'publication']]

In [21]:
df = df.drop_duplicates()

In [22]:
df = df.dropna(subset=['title', 'content'])

In [24]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')  # Convert 'date' with error handling
df['title'] = df['title'].astype('string')
df['content'] = df['content'].astype('string')
df['publication'] = df['publication'].astype('string')

In [25]:
df = df.dropna(subset=['date'])

In [30]:
import re

def clean_text(text):
    # Remove leading/trailing whitespace
    text = text.strip()
    # Remove extra spaces between words
    text = ' '.join(text.split())
    # Remove line breaks
    text = text.replace("\n", " ").replace("\r", "")
    # Remove punctuation using regex
    text = re.sub(r"[^\w\s.!?]", "", text)  # Retains ., !, ?
    return text

In [31]:
df['title'] = df['title'].apply(clean_text)
df['content'] = df['content'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title'] = df['title'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['content'] = df['content'].apply(clean_text)


In [29]:
print(df.info())
print(df.head(2))

<class 'pandas.core.frame.DataFrame'>
Index: 179309 entries, 0 to 204132
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   date         179309 non-null  datetime64[ns]
 1   title        179309 non-null  object        
 2   content      179309 non-null  object        
 3   publication  179309 non-null  string        
dtypes: datetime64[ns](1), object(2), string(1)
memory usage: 6.8+ MB
None
        date                                              title  \
0 2017-05-31  Agent Cooper in Twin Peaks is the audience onc...   
1 2017-05-30                                    AI the humanity   

                                             content publication  
0  And never more so than in Showtimes new series...       Verge  
1  AlphaGos victory isnt a defeat for humans  its...       Verge  


In [12]:
df.to_csv('cleaned_news_data.csv', index=False, encoding='utf-8')

In [13]:
df_clean = pd.read_csv("cleaned_news_data.csv")

In [15]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1584295 entries, 0 to 1584294
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   date         1584295 non-null  object
 1   title        1584295 non-null  object
 2   article      1584123 non-null  object
 3   publication  1584295 non-null  object
dtypes: object(4)
memory usage: 48.3+ MB


In [16]:
df_clean.head()

Unnamed: 0,date,title,article,publication
0,2016-12-09 18:31:00,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",Vox
1,2016-10-07 21:26:46,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the hi...,Business Insider
2,2018-01-26 00:00:00,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",Reuters
3,2019-06-27 00:00:00,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,Reuters
4,2016-01-27 00:00:00,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,TMZ
