In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the full merged article dataset into a pandas DataFrame.
# The path is relative, so the CSV is read from the notebook's working directory.
df = pd.read_csv('all_articles_merged.csv')


In [None]:
# Preview the first five rows (five is the default for head()).
df.head()


In [None]:
# Pull out the 'tag' column and print it for a quick look at the raw tag values.
tag_column = df['tag']
print(tag_column)

In [None]:
# Collect the distinct values that occur in the 'tag' column
# (unique() also reports NaN as a value if any tag is missing).
unique_tags = df['tag'].unique()

# Print the unique values
print("Unique tags:", unique_tags)

In [None]:
# Row count of the raw frame, read from the first entry of its shape
num_rows = df.shape[0]

print("Number of rows:", num_rows)

In [None]:
# Count missing values in 'text_content', the column that holds the article texts.
# The printed label uses the real column name so the output matches the DataFrame.
null_count = df['text_content'].isnull().sum()
print("Number of null values in 'text_content' column:", null_count)

In [None]:
# Show the rows whose 'text_content' is missing, to inspect them before dropping.
null_rows = df[df['text_content'].isnull()]
print("Rows with null values in 'text_content' column:")
print(null_rows)

In [None]:
# Drop rows with a missing 'text_content'.
# Despite its name, df_null is the frame that remains AFTER the null rows are removed.
df_null = df.dropna(subset=['text_content'])

# Print the resulting DataFrame
print("DataFrame after dropping rows with null values in 'text_content' column:")
print(df_null)

In [None]:
# Check for duplicates: duplicated() marks every row that is identical
# (across all columns) to an earlier row; summing the boolean mask counts them.

duplicates = df_null.duplicated()
num_duplicates = duplicates.sum()

print(f"Number of duplicate rows: {num_duplicates}")

In [None]:
# Drop fully duplicated rows and report how many rows remain.
# Note: num_rows is reused here and now refers to df_unique, not the raw df.
df_unique = df_null.drop_duplicates()
num_rows = len(df_unique)
print("Number of rows:", num_rows)

In [None]:
# Final cleaned frame. Null 'text_content' rows were already removed when df_null
# was built, so this dropna is a safety re-check on the de-duplicated data.
df_cleaned = df_unique.dropna(subset=['text_content'])
# Independent copy taken before later cells add a 'word_count' column to df_cleaned
# and overwrite its 'text_content' with preprocessed text.
df_original = df_cleaned.copy()


In [None]:
# Save the cleaned DataFrame to a CSV file in the working directory.
# index=False keeps the pandas row index out of the file.

df_cleaned.to_csv('cleaned_wiredfull.csv', index=False)

In [None]:
# Row count of the cleaned frame (num_rows is overwritten again here).
num_rows = len(df_cleaned)
print("Number of rows:", num_rows)

In [None]:
# Plot a histogram of the 'year' column to show the yearly distribution of articles.
# The data, the bin edges and the tick positions all come from df_cleaned so the
# chart describes the cleaned dataset (null-text rows and duplicates removed).
first_year = df_cleaned['year'].min()
last_year = df_cleaned['year'].max()

plt.figure(figsize=(10, 6))
# Bin edges run to last_year + 1 (range end is exclusive) so each year gets its own bin.
plt.hist(df_cleaned['year'], bins=range(first_year, last_year + 2), edgecolor='black', alpha=0.7)
plt.xlabel('Year')
plt.ylabel('Number of Articles')
plt.title('Yearly Distribution of Articles')
plt.xticks(range(first_year, last_year + 1))
plt.grid(True)
# Save before show(): the figure must still be open when it is written to disk.
plt.savefig('histogram_articles_per_year.png')
plt.show()

In [None]:
# Tabulate the number of cleaned articles per year, ordered chronologically
# (sort_index sorts by the year values rather than by count).
articles_per_year = df_cleaned['year'].value_counts().sort_index()
print("Number of articles per year:")
print(articles_per_year)

In [None]:
# The yearly histogram is written to 'histogram_articles_per_year.png' by the
# plotting cell itself, before its plt.show(). Calling plt.savefig() again in a
# separate cell would act on a new, empty figure and overwrite that file with a
# blank image, so nothing is saved or shown here.

In [None]:
def count_words(text):
    """Return the number of whitespace-separated words in ``text``.

    ``text`` must be a string; runs of whitespace count as a single separator.
    """
    return len(text.split())

# Word count per article, computed from the 'text_content' column
article_word_counts = df_cleaned['text_content'].apply(count_words)
df_cleaned['word_count'] = article_word_counts

# Average article length in words
mean_article_length = df_cleaned['word_count'].mean()
print("\nMean article length:", mean_article_length)

In [None]:
# Count articles in the cleaned frame that have no tag (null_count is reused here).
null_count = df_cleaned['tag'].isnull().sum()
print("Number of null values in 'tag' column:", null_count)

In [None]:
# Work on a copy so df_cleaned keeps its original (possibly missing) tags,
# then label every missing tag as 'Unlabelled' so it shows up in the plots.
df_pie = df_cleaned.copy()
df_pie['tag'] = df_pie['tag'].fillna('Unlabelled')
print(df_pie)

In [None]:
# Number of articles per tag (including the 'Unlabelled' placeholder), most frequent first.
category_counts = df_pie['tag'].value_counts()


In [None]:
# Count plot of articles per year, with one bar per tag within each year
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df_pie, x='year', hue='tag', ax=ax)

# Title, axis labels, tick rotation and legend
ax.set_title('Distribution of Tags Across Years')
ax.set_xlabel('Years')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
ax.legend(title='Tag')

# Tighten the layout so rotated labels are not clipped
fig.tight_layout()

# Write the figure to disk before displaying it
fig.savefig('tags_distribution_across_years.png', dpi=300)
plt.show()

In [None]:
# Count articles per (year, tag) pair and order them by year, then by
# descending count within each year.
# Note: no top-N filter is applied in this cell, so every tag present in a
# year is plotted.

top_tags_per_year = df_pie.groupby(['year', 'tag']).size().reset_index(name='count')
top_tags_per_year = top_tags_per_year.sort_values(['year', 'count'], ascending=[True, False])

plt.figure(figsize=(14, 8))
# dodge=False places all tags of a year at the same x position instead of side by side.
sns.barplot(data=top_tags_per_year, x='year', y='count', hue='tag', dodge=False)
plt.title('Top Tags by Year')
plt.xlabel('Year')
plt.ylabel('Count')
plt.legend(title='Tag')
plt.xticks(rotation=45)
plt.savefig('top_tags_across_years.png', dpi=300)
plt.show()

In [None]:
# Number of most frequent tags to keep for each year
top_n = 5

# Count articles per (year, tag), rank tags within each year by descending
# count, and keep only the first `top_n` rows of every year.
top_tags_per_year = (
    df_pie.groupby(['year', 'tag'])
    .size()
    .reset_index(name='count')
    .sort_values(['year', 'count'], ascending=[True, False])
    .groupby('year')
    .head(top_n)
    .reset_index(drop=True)
)

# Draw the bars on a wide figure
plt.figure(figsize=(14, 8))
sns.barplot(data=top_tags_per_year, x='year', y='count', hue='tag', dodge=False)

# Title, labels, ticks and legend with enlarged fonts
plt.title('Top Tags by Year', fontsize=20)
plt.xlabel('Year', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.xticks(rotation=45, fontsize=14)
plt.yticks(fontsize=14)
plt.legend(title='Tag', fontsize=14, title_fontsize=16)

plt.tight_layout()
plt.savefig('top_tags_by_year2.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Import all necessary packaged for pre-processing 

import nltk
import numpy as np
from nltk import word_tokenize #Import word_tokenize function from NLTK
from nltk.corpus import stopwords #Import the stop words lists from NLTK
import string  # Import the string module
from string import punctuation 
import seaborn as sns


In [None]:
# Fetch the NLTK stop-word corpus so stopwords.words('english') can be loaded.
nltk.download('stopwords')


In [None]:
# English stop-word list from NLTK, kept as a plain list at notebook scope.
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
from collections import defaultdict
from collections import Counter

# Split every title on whitespace and flatten the per-title word lists
# into one list holding every word of every title.
new = df_cleaned['title'].str.split().values.tolist()
corpus = [word for title_words in new for word in title_words]

# Tally how often each stop word occurs in the titles (exact, case-sensitive match).
dic = defaultdict(int)
for word in corpus:
    if word not in stop_words:
        continue
    dic[word] += 1

In [None]:
def plot_top_stopwords_barchart(text):
    """Draw a bar chart of the ten most frequent English stop words in ``text``.

    Parameters
    ----------
    text : pandas.Series
        Series of strings (for example article titles). Missing values are
        skipped.

    Returns
    -------
    None
        The bars are drawn with ``plt.bar`` on the current matplotlib axes.
        Nothing is drawn when no stop word occurs in ``text``.
    """
    from collections import Counter

    # A set gives constant-time membership tests for the per-word lookup below.
    stop_words = set(stopwords.words('english'))

    # Lower-case before comparing: the stop-word list is lower-case, so a
    # capitalised word such as a leading "The" would otherwise never be counted.
    counts = Counter(
        word
        for words in text.dropna().str.lower().str.split()
        for word in words
        if word in stop_words
    )

    top = counts.most_common(10)
    if not top:
        # An empty result cannot be unpacked into x and y, so there is nothing to plot.
        return
    x, y = zip(*top)
    plt.bar(x, y)

In [None]:
# Show the notebook-level stop-word list again for reference.
print(stop_words)

In [None]:
# Chart the most frequent stop words in the article titles.
plot_top_stopwords_barchart(df_cleaned['title'])

In [None]:
# The stop-word list lives in `stop_words` at notebook scope; a name `stop`
# exists only as a local variable inside plot_top_non_stopwords_barchart.
print(stop_words)

In [None]:
# Frequency of every title word, most common first
counter = Counter(corpus)
most = counter.most_common()

# Of the 40 most frequent words, keep only those that are not stop words
# (so fewer than 40 bars may remain).
x, y = [], []
for word, count in most[:40]:
    if word in stop_words:
        continue
    x.append(word)
    y.append(count)

# Horizontal bars: counts along the x axis, words along the y axis
sns.barplot(x=y, y=x)

In [1]:
def plot_top_non_stopwords_barchart(text, top_n=20, filename='top_words_barplot.png'):
    """Plot and return the most frequent non-stop-words in ``text``.

    ASCII punctuation is removed and words are lower-cased before counting, so
    "AI," and "ai" are treated as the same word.

    Parameters
    ----------
    text : pandas.Series
        Series of strings (for example article titles). Missing values are
        skipped.
    top_n : int, optional
        Number of words to plot and return. Defaults to 20.
    filename : str, optional
        Path the chart is saved to. Defaults to 'top_words_barplot.png'.

    Returns
    -------
    list of str
        The ``top_n`` most frequent words, most frequent first. Empty when
        ``text`` contains no countable word, in which case nothing is plotted.
    """
    stop = set(stopwords.words('english'))

    translator = str.maketrans('', '', string.punctuation)
    cleaned = text.dropna().apply(lambda title: title.translate(translator).lower())

    # Count every word that is not a stop word
    counter = Counter(
        word
        for title in cleaned
        for word in title.split()
        if word not in stop
    )
    most = counter.most_common(top_n)
    top_words = [word for word, _ in most]
    frequencies = [count for _, count in most]

    if not top_words:
        return top_words

    # Size the figure before drawing; configuring the size after the bars are
    # drawn would not affect this chart.
    plt.figure(figsize=(10, 6))
    sns.barplot(x=frequencies, y=top_words, palette='viridis')
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.title(f'Top {top_n} words in Wired Headlines')

    # Save before show(): the figure must still be open when it is written.
    plt.savefig(filename, dpi=300, bbox_inches='tight')

    plt.show()

    return top_words

In [None]:
# Compute the top headline words in this cell so that `top_words` is defined
# before it is printed when the notebook is run top to bottom.
top_words = plot_top_non_stopwords_barchart(df_cleaned['title'])
print(top_words)

In [None]:
def print_titles_with_top_words(df, top_words):
    """Print every title in ``df['title']`` that contains one of ``top_words``.

    Matching is case-insensitive and ignores ASCII punctuation, so the title
    "AI, explained" matches the top word "ai".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'title' column of strings. Missing titles are skipped.
    top_words : iterable of str
        Words to look for.

    Returns
    -------
    None
        Matching titles are written to standard output, one per line, between
        a header line and a trailing blank block.
    """
    print("Titles containing top words:")
    top_words_set = set(word.lower() for word in top_words)
    translator = str.maketrans('', '', string.punctuation)

    # Read titles from the frame that was passed in, not from a notebook global.
    for title in df['title'].dropna():
        title_words = set(title.lower().translate(translator).split())
        if top_words_set & title_words:
            print(title)
    print("\n")


In [None]:
# Plot the most frequent headline words and keep the returned list for later cells.
top_words = plot_top_non_stopwords_barchart(df_cleaned['title'])

In [None]:
# Print every title that contains at least one of the top words.
print_titles_with_top_words(df_cleaned, top_words)

In [None]:
# Re-draws the top-words chart (and re-saves its PNG); as the last expression
# of the cell, the returned word list is also displayed.
plot_top_non_stopwords_barchart(df_cleaned['title'])


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
from nltk.stem import WordNetLemmatizer


In [None]:
# Download the NLTK resources the preprocessing cells rely on:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize
#     (recent NLTK releases look up 'punkt_tab', older ones 'punkt')
#   - 'stopwords': the English stop-word list
#   - 'wordnet': the lexical database behind WordNetLemmatizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
import contractions

In [None]:
# Reload the English stop-word list (this replaces the earlier notebook-level list).
stop_words = stopwords.words('english')

print(stop_words)

In [None]:
# 'said' and 'say' occur very frequently in the articles, so treat them as stop words too.
# Re-running this cell appends them again; the duplicates do not change membership tests.

stop_words.extend(['said', 'say'])

In [None]:
# Function for preprocessing text data for LDA pipeline 

def preprocess_text(text):
    """Normalise one raw article string for the LDA pipeline.

    Steps, in order: lower-case; strip HTML tags, [bracketed] spans and stray
    tag names; drop ellipses and standalone numbers; join hyphenated
    compounds; collapse whitespace; tokenize; keep purely alphabetic tokens;
    remove English stop words plus 'say' and 'said'; lemmatize.

    Parameters
    ----------
    text : str
        Raw article text, possibly containing HTML markup.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    text = text.lower()

    # Remove HTML tags (anchors included; their link text is kept), [bracketed]
    # spans and leftover tag names. The word boundaries make sure 'div', 'br'
    # and 'span' are only removed as standalone words, not from inside real
    # words such as "individual", "library" or "lifespan".
    text = re.sub(r"<[^>]+>|\[.*?\]|\b(?:div|br|span)\b", ' ', text)
    text = re.sub(r"\.\.\.", " ", text)  # ellipses
    text = re.sub(r'\b\d+\b', " ", text)  # standalone numbers
    # Delete every hyphen that sits between two word characters, so compounds
    # with several hyphens ("state-of-the-art") are joined completely and not
    # discarded later by the alphabetic-token filter.
    text = re.sub(r'(?<=\w)-(?=\w)', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize; the text is already lower-case at this point
    tokens = word_tokenize(text)

    # Keep alphabetic tokens only (drops punctuation and anything with digits)
    tokens = [token for token in tokens if token.isalpha()]

    # Stop words as a set for constant-time lookups
    stop_words = set(stopwords.words('english'))
    stop_words.update(('say', 'said'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatisation
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a single string
    return ' '.join(tokens)

In [None]:
# Replace the raw article text in df_cleaned with its preprocessed form.
# This overwrites the column in place; df_original is not modified here and
# therefore still holds the unprocessed text.
df_cleaned['text_content'] = df_cleaned['text_content'].apply(preprocess_text)
#df_original['text_content'] = df_original['text_content'].apply(preprocess_text)




In [None]:
# Final sanity check: the preprocessed text column should contain no nulls

null_values = df_cleaned['text_content'].isnull().sum()

null_report = (
    f"There are {null_values} null values in the cleaned text column."
    if null_values > 0
    else "No null values found in the cleaned text column."
)
print(null_report)

In [None]:
# Concatenate every preprocessed article into one space-separated string
# for corpus-level statistics.
all_processed_text = ' '.join(df_cleaned['text_content'])
#all_processed_text = ' '.join(df_original['text_content'])

In [None]:
# Save only the preprocessed 'text_content' column to a CSV file (no row index).
df_cleaned["text_content"].to_csv('tryclean.csv', index = False)

In [None]:
# Tokenize the whole preprocessed corpus with NLTK.
# NOTE(review): `tokens` is not read by any cell visible below — the size and
# vocabulary cells use str.split() instead; confirm whether this is still needed.
tokens = word_tokenize(all_processed_text)


In [None]:
# Corpus size: total number of whitespace-separated tokens after preprocessing.
corpus_size = len(all_processed_text.split())
print(corpus_size)

In [None]:
# Vocabulary size: number of distinct tokens in the preprocessed corpus.
vocabulary_size = len(set(all_processed_text.split()))
print(f"Vocabulary size (number of unique tokens): {vocabulary_size}")


In [None]:
# Compare article length (whitespace token count) before and after preprocessing.
# df_original is the copy of the cleaned frame taken before 'text_content' was
# overwritten with preprocessed text, so it still holds the original articles.

original_lengths = [len(text.split()) for text in df_original['text_content']]
processed_lengths = [len(text.split()) for text in df_cleaned['text_content']]

# Overlay both distributions as semi-transparent histograms
plt.figure(figsize=(10, 6))

plt.hist(original_lengths, bins=30, alpha=0.5, label='Original Text')
plt.hist(processed_lengths, bins=30, alpha=0.5, label='Processed Text')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.title('Text Length Comparison')
plt.legend()
# Save before show(): the figure must still be open when it is written.
plt.savefig('processingcomparison.png', bbox_inches='tight')


plt.show()