In [49]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
import contractions
from nltk.corpus import stopwords
import pickle

# English stopword list from NLTK, held as a set for fast membership tests
STOP_WORDS = {*stopwords.words('english')}

In [44]:
# Read the three article files; the first CSV column becomes the index
df1, df2, df3 = (
    pd.read_csv(path, index_col=0)
    for path in ('articles1.csv', 'articles2.csv', 'articles3.csv')
)

# Stack the three frames into a single one
df = pd.concat([df1, df2, df3])

In [45]:
# Discard the columns listed below, remove rows that lack a publication
# or an author, and renumber the remaining rows.
# reset_index() keeps the previous index as an 'index' column.
df = (
    df
    .drop(columns=['title', 'date', 'year', 'month', 'url'])
    .dropna(subset=['publication', 'author'], how='any')
    .reset_index()
)

In [46]:
def clean_content(row):
    """
    Clean the text of one article and return it.

    The text is passed through contractions.fix(), lowercased, split on
    whitespace, stripped of the words in STOP_WORDS, and every remaining
    word is lemmatized; the surviving words are re-joined with single spaces.
    The result is also stored on ``row`` under ``'cleaned_content'``.

    :param row: pd.Series with a ``'content'`` entry holding the article text;
        ``row.name`` is printed as a progress indicator
    :return: the cleaned text as a str; an empty string when the content
        is not a string (e.g. a missing value)
    """
    print(f'{row.name}\r', end='')
    content = row['content']

    # A non-string value (such as a missing cell) cannot be cleaned; treat it
    # as empty text instead of failing inside contractions.fix().
    if not isinstance(content, str):
        content = ''

    # Expand contractions, then lowercase the text
    content = contractions.fix(content).lower()

    # split() breaks on every run of whitespace, so newlines, carriage
    # returns and tabs need no separate replacement step.
    lemmatizer = WordNetLemmatizer()
    content = ' '.join(
        lemmatizer.lemmatize(word)
        for word in content.split()
        if word not in STOP_WORDS
    )

    # Storing the text on `row` does not by itself update the DataFrame the
    # row came from, so the text is returned as well for the caller to collect.
    row['cleaned_content'] = content
    return content

In [47]:
# Apply the cleaning function to the data
def _cleaned_text(row):
    """
    Run clean_content on a copy of ``row`` and return the cleaned text.

    clean_content stores its result on the row it receives, so the value
    is read back from that copy.

    :param row: pd.Series for one article
    :return: the value clean_content stored under 'cleaned_content'
    """
    row = row.copy()
    clean_content(row)
    return row['cleaned_content']


# DataFrame.apply does not write row-level changes back into df, so the
# per-row results are assigned to a column explicitly before saving.
df['cleaned_content'] = df.apply(_cleaned_text, axis=1)
df.to_csv('cleaned_articles.csv')

126693