In [15]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# Import local functions
from general_functions import *

In [16]:
# Import data
# onedrive_download is not defined in this notebook; presumably it is provided
# by the star-import of general_functions. Its return value for the share link
# is handed straight to pd.read_csv.
# NOTE(review): the recorded output of this cell is a ParserError
# ("Expected 1 fields in line 5, saw 5"), so whatever the link resolved to was
# not parsed as comma-separated data — confirm the download target/delimiter.
reghub_data_link = onedrive_download("https://1drv.ms/u/s!AoiE7xOoBAsngsgsIpu8x82sG1hvtw?e=mDKP33")
df = pd.read_csv(reghub_data_link)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 5, saw 5


In [8]:
# Show the frame's columns, dtypes and non-null counts.
# NOTE(review): the recorded output is a NameError for `df`, presumably because
# the load cell failed in that session — re-run top to bottom once the load works.
df.info()

NameError: name 'df' is not defined

In [None]:
# Descriptive statistics for the frame (pandas' default column selection).
df.describe()

## Data Cleaning

In [None]:
# Count the number of missing (NaN) values in 'news_content'
empty_count = df['news_content'].isna().sum()

print(f"Number of empty values in 'news_content': {empty_count}")

In [None]:
# Drop rows where 'news_content' is missing; `df` is rebound to the filtered
# frame, so the unfiltered data is no longer reachable under this name.
df = df.dropna(subset=['news_content'])


In [None]:
# Exclude every row whose 'concepts' text mentions the unwanted keyword.
# Matching ignores case; rows with missing concepts count as "no match" and stay.
keyword_to_remove = "market microstructure"
df = df.loc[
    ~df['concepts'].str.contains(keyword_to_remove, case=False, na=False)
]

## Preliminary EDA

In [None]:
# Tally how many rows fall under each news type
category_counts = df['news_type'].value_counts()

# Draw the tallies as a bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
category_counts.plot(kind='bar', ax=ax)
ax.set_title('Frequency of Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Tally how many rows each source contributed
source_counts = df['source_name'].value_counts()

# Bar chart over all sources
fig, ax = plt.subplots(figsize=(8, 6))
source_counts.plot(kind='bar', ax=ax)
ax.set_title('Frequency of Sources')
ax.set_xlabel('Source')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Tally rows per source (value_counts returns them most-frequent first)
source_counts = df['source_name'].value_counts()

# Keep only the ten most frequent sources
top_sources = source_counts.iloc[:10]

# Bar chart restricted to those ten sources
fig, ax = plt.subplots(figsize=(8, 6))
top_sources.plot(kind='bar', ax=ax)
ax.set_title('Top 10 Sources')
ax.set_xlabel('Source')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Text listing of the 20 most frequent sources with their counts
print(source_counts.head(20))

In [None]:
# Line plot of the sentiment score against the frame's index (row order)
df["sentiment_score"].plot()

In [None]:
# Report how many rows carry a sentiment score of exactly zero, next to the
# total row count for comparison
print(f"Sentiment score equals to zero {(df['sentiment_score'] == 0).sum()} times.")
print(f"General dataset is {len(df)} entries long.")

In [None]:
# Tally how many rows are written in each language
language_counts = df['language'].value_counts()

# Bar chart of rows per language
fig, ax = plt.subplots(figsize=(8, 6))
language_counts.plot(kind='bar', ax=ax)
ax.set_title('Frequency of Languages')
ax.set_xlabel('Language')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

In [None]:
def extract_description(row):
    """Return the 'description' of the first element of a JSON-encoded list.

    Parameters
    ----------
    row : Any
        One cell value. Only ``str`` values are parsed; anything else
        (e.g. NaN or None) yields ``None``.

    Returns
    -------
    The value stored under ``'description'`` in the first list element, or
    ``None`` when the value is not a string, is not valid JSON, or does not
    have the expected ``[{"description": ...}, ...]`` shape.
    """
    if not isinstance(row, str):
        return None
    try:
        return json.loads(row)[0]['description']
    except (json.JSONDecodeError, IndexError, KeyError, TypeError):
        # JSONDecodeError: not JSON; IndexError: empty list;
        # KeyError: top-level dict, or first element lacks 'description';
        # TypeError: top-level scalar/null, or first element is not a mapping.
        return None

# Pull the first topic description out of the JSON stored in 'news_topic'
df['news_topic_formatted'] = df['news_topic'].apply(extract_description)

# Calculate the frequency of each topic (rows without a description are not counted)
topic_counts = df['news_topic_formatted'].value_counts()

# Select the most frequent topics; the same number drives the chart title so
# the two cannot drift apart (the title previously said 10 while 20 were drawn)
top_n = 20
top_topics = topic_counts.head(top_n)

# Create a bar chart for the selected topics
plt.figure(figsize=(8, 6))
top_topics.plot(kind='bar')
plt.title(f'Top {top_n} Topics')
plt.xlabel('Topic')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Text listing of the 20 most frequent topic descriptions with their counts
print(topic_counts.head(20))

In [None]:
# Parse 'news_date' into pandas datetimes
df['news_date'] = pd.to_datetime(df['news_date'])

# Articles per calendar day, ordered chronologically
daily_news_count = df['news_date'].dt.date.value_counts().sort_index()

# One bar per day
fig, ax = plt.subplots(figsize=(12, 6))
daily_news_count.plot(kind='bar', width=0.8, ax=ax)
ax.set_title('Daily News Count')
ax.set_xlabel('Date')
ax.set_ylabel('Number of News Articles')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates


# Count the number of news articles per calendar day, in chronological order
daily_news_count = df['news_date'].dt.date.value_counts().sort_index()

# Draw the bars with matplotlib on real datetime x-values. A pandas bar plot
# places bars at integer positions 0..n-1, so a date locator/formatter would
# interpret those positions as days since the epoch and label them wrongly.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(pd.to_datetime(daily_news_count.index), daily_news_count.values, width=0.8)

# Set the major locator on Mondays
mondays = mdates.WeekdayLocator(mdates.MONDAY)
ax.xaxis.set_major_locator(mondays)

# Formatting for date on x-axis
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

ax.set_title('Daily News Count')
ax.set_xlabel('Date')
ax.set_ylabel('Number of News Articles')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

## Text Statistics

In [None]:
# Histogram of the number of characters in each headline
df['news_title'].str.len().hist()

In [None]:
def plot_word_number_histogram(text):
    """Plot a histogram of the number of words in each text entry.

    Parameters
    ----------
    text : pandas.Series
        Series of strings. Missing values are dropped before counting.

    Each entry is split on whitespace and the resulting token count is
    binned into 20 bins. The figure is shown immediately; nothing is returned.
    """
    # Missing entries have no words to count
    text = text.dropna()

    # Whitespace-split each entry and count the tokens. This is a count of
    # words per entry, not the length of individual words, so the labels
    # below say "word count" rather than "word length".
    words_per_entry = text.str.split().map(len)
    plt.hist(words_per_entry, bins=20, edgecolor='black')
    plt.title('Word Count Histogram')
    plt.xlabel('Number of Words')
    plt.ylabel('Frequency')
    plt.show()

# Distribution of the number of words per headline
plot_word_number_histogram(df['news_title'])

In [None]:
import nltk
from nltk.corpus import stopwords
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Fetch the NLTK stopword corpus
nltk.download('stopwords')

# Stopword sets for the two languages handled below
stop_en = set(stopwords.words('english'))
stop_de = set(stopwords.words('german'))

# Split the frame by its 'language' code
df_en = df.loc[df['language'] == 'en']
df_de = df.loc[df['language'] == 'de']

In [None]:
# Function to process the corpus, remove stopwords, and create a bar plot
def process_and_plot(df, stop, language):
    """Plot the 40 most frequent non-stopword tokens of ``df['news_content']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'news_content' column of strings.
    stop : collection of str
        Stopwords to exclude. A token is dropped when either the token itself
        or its lowercase form is in this collection.
    language : str
        Label used in the plot title (and in the "nothing to plot" message).

    Tokens are produced by whitespace splitting and counted in their original
    casing. The figure is shown immediately; nothing is returned.
    """
    # Flatten every whitespace-separated token. Rows whose content is missing
    # (or not a string) come out of .str.split() as NaN and are skipped here
    # instead of raising when iterated.
    token_lists = df['news_content'].str.split().dropna()
    corpus = [word for tokens in token_lists for word in tokens]

    # Remove stopwords. The sets this notebook passes in come from NLTK's
    # stopword lists, which are lowercase, so an exact match alone would let
    # capitalised forms such as a sentence-initial "The" through.
    corpus = [
        word for word in corpus
        if word not in stop and word.lower() not in stop
    ]

    # Get the most common words
    most_common = Counter(corpus).most_common(40)
    if not most_common:
        # zip(*[]) cannot be unpacked into x, y — report and stop instead
        print(f'No words left to plot for {language} news content.')
        return

    # Separate words (x) and counts (y) for plotting
    x, y = zip(*most_common)

    # Horizontal bars: counts on the x-axis, words on the y-axis
    sns.barplot(x=y, y=x)

    # Add labels and title
    plt.xlabel('Word Count')
    plt.ylabel('Words')
    plt.title(f'Most Common Words in {language} News Content')

    # Show the plot
    plt.show()

In [None]:
# Most-common-word chart for the English subset first, then the German one
for frame, stop_words, label in (
    (df_en, stop_en, 'English'),
    (df_de, stop_de, 'German'),
):
    process_and_plot(frame, stop_words, label)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import gensim
import pyLDAvis
import pyLDAvis.gensim
from collections import defaultdict

In [None]:
# Fetch the NLTK resources requested for the topic-modelling cells
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Stopword sets for English and German
stop_en = set(stopwords.words('english'))
stop_de = set(stopwords.words('german'))

In [None]:
def get_lda_objects(text, language='english', num_topics=6, passes=10,
                    workers=2, random_state=None):
    """Fit a gensim LDA topic model on a collection of raw texts.

    Parameters
    ----------
    text : iterable of str
        Documents to model. Entries for which ``pd.notna`` is false are skipped.
    language : str, default 'english'
        'english' selects the English stopwords/tokenizer and enables WordNet
        lemmatisation. Any other value is treated as German (unchanged
        fallback behaviour).
    num_topics, passes, workers : int
        Forwarded to ``gensim.models.LdaMulticore``; the defaults are the
        values that were previously hard-coded.
    random_state : optional
        Forwarded to ``LdaMulticore``. Pass an int to make runs repeatable;
        the default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(lda_model, bow_corpus, dic)``: the fitted model, the bag-of-words
        corpus it was trained on, and the gensim ``Dictionary``.
    """
    is_english = language == 'english'
    stop = stop_en if is_english else stop_de
    # Use the Punkt model matching the text's language rather than
    # word_tokenize's English default.
    tokenizer_language = 'english' if is_english else 'german'

    def _preprocess_text(text):
        """Tokenise each document, drop stopwords and tokens of <= 2 chars."""
        corpus = []
        lem = WordNetLemmatizer()
        for news in text:
            # Check for NaN values
            if pd.notna(news):
                # Match stopwords on the lowercase form as well, so that
                # capitalised stopwords (e.g. sentence-initial) are removed.
                words = [
                    w for w in word_tokenize(news, language=tokenizer_language)
                    if w not in stop and w.lower() not in stop
                ]
                words = [w for w in words if len(w) > 2]
                if is_english:
                    # WordNet is an English lexicon; lemmatising German
                    # tokens with it is skipped.
                    words = [lem.lemmatize(w) for w in words]
                corpus.append(words)
        return corpus

    corpus = _preprocess_text(text)

    # Map tokens to integer ids, then encode each document as (id, count) pairs
    dic = gensim.corpora.Dictionary(corpus)
    bow_corpus = [dic.doc2bow(doc) for doc in corpus]

    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=num_topics,
                                           id2word=dic,
                                           passes=passes,
                                           workers=workers,
                                           random_state=random_state)

    return lda_model, bow_corpus, dic

In [None]:
def plot_lda_vis(lda_model, bow_corpus, dic):
    """Enable pyLDAvis notebook output and return the prepared visualisation.

    The three arguments are passed unchanged to ``pyLDAvis.gensim.prepare``.
    """
    pyLDAvis.enable_notebook()
    return pyLDAvis.gensim.prepare(lda_model, bow_corpus, dic)

In [None]:
# Headlines of the English and German subsets as plain Python lists
title_en = df_en['news_title'].tolist()
title_de = df_de['news_title'].tolist()

In [None]:
# Fit one LDA model per language on the headlines and prepare its pyLDAvis view
lda_model_en, bow_corpus_en, dic_en = get_lda_objects(title_en, language='english')
vis_en_t = plot_lda_vis(lda_model_en, bow_corpus_en, dic_en)

lda_model_de, bow_corpus_de, dic_de = get_lda_objects(title_de, language='german')
vis_de_t = plot_lda_vis(lda_model_de, bow_corpus_de, dic_de)

In [None]:
# Display the prepared visualisation for the English headline model
vis_en_t

In [None]:
# Display the prepared visualisation for the German headline model
vis_de_t

In [None]:
# Article bodies of the English and German subsets as plain Python lists
content_en = df_en['news_content'].tolist()
content_de = df_de['news_content'].tolist()

In [None]:
# Fit one LDA model per language on the article bodies.
# Note: this rebinds lda_model_*/bow_corpus_*/dic_* that previously held the
# headline models; only the vis_*_t objects keep the headline results.
lda_model_en, bow_corpus_en, dic_en = get_lda_objects(content_en, language='english')
vis_en_c = plot_lda_vis(lda_model_en, bow_corpus_en, dic_en)

lda_model_de, bow_corpus_de, dic_de = get_lda_objects(content_de, language='german')
vis_de_c = plot_lda_vis(lda_model_de, bow_corpus_de, dic_de)

In [None]:
# Display the prepared visualisation for the English content model
vis_en_c

In [None]:
# Display the prepared visualisation for the German content model
vis_de_c