## Necessary dependencies to install

In [None]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from collections import Counter
import re
from joblib import Parallel, delayed

## Descriptive Statistics


In [None]:
# basic statistics for textual lengths
import pandas as pd
import numpy as np
from textblob import TextBlob

# Load the analyst-ratings dataset; adjust file_path if the CSV lives elsewhere.
file_path = "../assets/data/raw_analyst_ratings.csv"
df = pd.read_csv(file_path)

# Fail early with a clear message if the expected column is missing.
if 'headline' not in df.columns:
    raise ValueError("The dataset must contain a 'headline' column.")

# Headline lengths in characters and in whitespace-separated words.
# The vectorised .str accessors propagate a missing headline as NaN, whereas
# .apply(len) raises TypeError as soon as it meets a NaN (float) entry.
df['length_characters'] = df['headline'].str.len()
df['length_words'] = df['headline'].str.split().str.len()

# Basic statistics (mean/max/min skip NaN lengths by default)
stats = {
    'Total Headlines': len(df),
    'Average Length (Characters)': df['length_characters'].mean(),
    # 'Median Length (Characters)': df['length_characters'].median(),
    # 'Standard Deviation (Characters)': df['length_characters'].std(),
    'Average Length (Words)': df['length_words'].mean(),
    # 'Median Length (Words)': df['length_words'].median(),
    # 'Standard Deviation (Words)': df['length_words'].std(),
    'Longest Headline (Characters)': df['length_characters'].max(),
    'Shortest Headline (Characters)': df['length_characters'].min(),
    'Longest Headline (Words)': df['length_words'].max(),
    'Shortest Headline (Words)': df['length_words'].min(),
}

# Show a preview of the data followed by one line per statistic.
print("Headline Data:")
print(df.head())  # Display the first few rows of the dataset
print("\nBasic Statistics:")
for stat, value in stats.items():
    print(f"{stat}: {value}")


In [None]:

# Reload the dataset so this cell can run on its own.
file_path = "../assets/data/raw_analyst_ratings.csv"
df = pd.read_csv(file_path)

# Validate the column up front, in the same way the other cells validate
# theirs, so a wrong file fails with a clear message instead of a KeyError.
if 'publisher' not in df.columns:
    raise ValueError("The dataset must contain a 'publisher' column.")

# Count articles per publisher (value_counts lists the largest count first)
publisher_counts = df['publisher'].value_counts()

# Print articles per publisher
print("Articles per Publisher:")
print(publisher_counts)


In [None]:
# Publication-date trends: article counts per weekday and per calendar month.
if 'date' not in df.columns:
    raise ValueError("The dataset must contain a 'date' column.")

# Strict parse: values that do not match the format become NaT instead of raising.
df['date'] = pd.to_datetime(
    df['date'],
    format="%Y-%m-%d %H:%M:%S",
    errors='coerce',
)

# Tell the reader when at least one row failed to parse.
has_unparsed_dates = df['date'].isnull().any()
if has_unparsed_dates:
    print("Warning: Some publication dates could not be parsed and were set to NaT.")

# Derive the weekday name and the year-month period of every article.
date_parts = df['date'].dt
df['day_of_week'] = date_parts.day_name()
df['year_month'] = date_parts.to_period('M')

# Tally per weekday (most frequent first) and per month (chronological order).
articles_by_day = df['day_of_week'].value_counts()
articles_by_month = df['year_month'].value_counts().sort_index()

# Report both tallies.
print("Articles by Day of the Week:", articles_by_day, sep="\n")
print("\nArticles by Year-Month:", articles_by_month, sep="\n")


## Text Analysis (Sentiment Analysis & Topic Modeling):

In [None]:
# Read the raw analyst-ratings CSV into a DataFrame.
file_path = "../assets/data/raw_analyst_ratings.csv"
df = pd.read_csv(file_path)

# The sentiment step reads the 'headline' column; stop early without it.
has_headline = 'headline' in df.columns
if not has_headline:
    raise ValueError("The dataset must contain a 'headline' column.")

# Perform sentiment analysis
def analyze_sentiment(headline):
    """Label a headline "Positive", "Negative" or "Neutral".

    The label follows the sign of the polarity score TextBlob assigns to the
    text: above zero is positive, below zero is negative, anything else is
    neutral.
    """
    score = TextBlob(headline).polarity
    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"

# Label every headline with its sentiment class.
df['sentiment'] = df['headline'].apply(analyze_sentiment)

# Tally how many headlines fall under each label.
sentiment_counts = df['sentiment'].value_counts()

# Report the tally under a heading.
print("Sentiment Analysis Results:", sentiment_counts, sep="\n")


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from collections import Counter
import re
from joblib import Parallel, delayed

# Load the spaCy "en_core_web_sm" pipeline once; later cells run headlines
# through it to read named entities (doc.ents) and noun phrases (doc.noun_chunks).
nlp = spacy.load("en_core_web_sm")


In [None]:
# Read the raw analyst-ratings CSV.
file_path = "../assets/data/raw_analyst_ratings.csv"
df = pd.read_csv(file_path)

# Everything that follows works on the 'headline' column, so require it.
headline_present = 'headline' in df.columns
if not headline_present:
    raise ValueError("The dataset must contain a 'headline' column.")


In [None]:
# Function to preprocess text (lowercasing, removing punctuation, etc.)
def preprocess_text(text):
    """Return *text* lower-cased with punctuation stripped.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is removed; whitespace is left untouched.
    """
    return re.sub(r'[^\w\s]', '', text.lower())

# Clean every headline; joblib spreads the calls over all available CPUs (n_jobs=-1).
cleaning_tasks = (delayed(preprocess_text)(headline) for headline in df['headline'])
df['cleaned_headline'] = Parallel(n_jobs=-1)(cleaning_tasks)


In [None]:
# Fit a TF-IDF model on the cleaned headlines, dropping English stop words and
# capping the vocabulary at 20 terms.
# NOTE(review): scikit-learn documents max_features as keeping the terms with
# the highest corpus-wide term frequency, so the list printed below is the
# retained vocabulary rather than a ranking by TF-IDF score - confirm this is
# what the analysis intends.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=20,
    lowercase=True,
)
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_headline'])

# The retained vocabulary terms are reported as the keywords.
keywords = tfidf_vectorizer.get_feature_names_out()
print("Top Keywords from TF-IDF:", keywords, sep="\n")


In [None]:
# Per-headline results, filled in the same order as the rows of df.
entity_results, noun_phrase_results = [], []

# Only these spaCy entity labels are kept.
wanted_labels = ('ORG', 'MONEY', 'DATE', 'PRODUCT')

# Stream the headlines through spaCy in batches of 50 (tune for performance).
for parsed in nlp.pipe(df['headline'], batch_size=50):
    # Text of each entity whose label is one of the wanted ones.
    entity_results.append(
        [span.text for span in parsed.ents if span.label_ in wanted_labels]
    )
    # Text of every noun chunk found in the headline.
    noun_phrase_results.append([chunk.text for chunk in parsed.noun_chunks])

# Attach both result lists to the dataframe as new columns.
df['entities'] = entity_results
df['noun_phrases'] = noun_phrase_results


In [None]:
# Flatten the per-headline entity lists into one list, then count each entity.
all_entities = []
for headline_entities in df['entities']:
    all_entities.extend(headline_entities)
entity_counts = Counter(all_entities)

# Report the ten most frequent entities, one per line.
print("\nMost Common Entities:")
for entity, count in entity_counts.most_common(10):
    print(f"{entity}: {count}")


In [None]:
# Flatten the per-headline noun-phrase lists into one list, then count each phrase.
all_phrases = []
for headline_phrases in df['noun_phrases']:
    all_phrases.extend(headline_phrases)
phrase_counts = Counter(all_phrases)

# Report the ten most frequent noun phrases, one per line.
print("\nMost Common Noun Phrases:")
for phrase, count in phrase_counts.most_common(10):
    print(f"{phrase}: {count}")


## Time Series Analysis:

In [None]:
# How does the publication frequency vary over time? 

import pandas as pd
import matplotlib.pyplot as plt

# Read the raw ratings file; this analysis needs its 'date' column.
file_path = "../assets/data/raw_analyst_ratings.csv"
df = pd.read_csv(file_path)
if 'date' not in df.columns:
    raise ValueError("The dataset must contain a 'date' column.")

# Parse the 'date' column (unparseable values become NaT) and discard the rows
# that end up without a usable timestamp.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])

# Calendar components used for time-based grouping.
timestamps = df['date'].dt
df['year'] = timestamps.year
df['month'] = timestamps.month
df['day'] = timestamps.day

# Step 1: number of articles in each (year, month) pair.
monthly_publications = df.groupby(['year', 'month']).size()

# Line chart of the monthly counts.
fig, ax = plt.subplots(figsize=(10, 6))
monthly_publications.plot(kind='line', marker='o', color='b', ax=ax)
ax.set_title('Monthly Publication Frequency')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Articles Published')
ax.grid(True)
plt.show()

# Step 2: flag spikes - months whose absolute change from the previous month
# lies above the 95th percentile of all month-to-month changes.
monthly_diff = monthly_publications.diff().abs()
spike_threshold = monthly_diff.quantile(0.95)
spikes = monthly_diff[monthly_diff > spike_threshold]

print("\nPotential Spikes in Publication Frequency:", spikes, sep="\n")

# If you want to analyze spikes during specific market events, you could cross-reference the spikes with event dates.


In [None]:
# Analysis of publishing times might reveal whether there is a specific time of day when most news is released.

import pandas as pd
import matplotlib.pyplot as plt

# Read the raw ratings file; this analysis needs its 'date' column.
file_path = "../assets/data/raw_analyst_ratings.csv"
df = pd.read_csv(file_path)
if 'date' not in df.columns:
    raise ValueError("The dataset must contain a 'date' column.")

# Parse the 'date' column (unparseable values become NaT) and keep only the
# rows that have a usable timestamp.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])

# Step 1: hour-of-day component of each timestamp.
df['hour'] = df['date'].dt.hour

# Step 2: number of articles published in each hour.
hourly_publications = df.groupby('hour').size()

# Step 3: bar chart of the hourly counts.
fig, ax = plt.subplots(figsize=(10, 6))
hourly_publications.plot(kind='bar', color='b', ax=ax)
ax.set_title('Publication Frequency by Hour of Day')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Number of Articles Published')
plt.xticks(rotation=0)
ax.grid(True)
plt.show()

# Step 4: the hour (or hours, on a tie) holding the highest count.
busiest_count = hourly_publications.max()
peaks = hourly_publications[hourly_publications == busiest_count]

print("\nPeak Publication Hours:", peaks, sep="\n")


## Publisher Analysis:

In [None]:
# Which publishers contribute most to the news feed? Is there a difference in the type of news they report?

import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

# Read the dataset and verify the two columns this analysis relies on.
file_path = "../assets/data/raw_analyst_ratings.csv"
df = pd.read_csv(file_path)

required_columns = ('publisher', 'headline')
if any(column not in df.columns for column in required_columns):
    raise ValueError("The dataset must contain 'publisher' and 'headline' columns.")

# Step 1: article totals per publisher (value_counts lists the largest first).
publisher_counts = df['publisher'].value_counts()

# Bar chart of the ten publishers with the most articles.
top_ten_publishers = publisher_counts.head(10)
plt.figure(figsize=(12, 6))
top_ten_publishers.plot(kind='bar', color='b')
plt.title('Top 10 Publishers by Number of Articles')
plt.xlabel('Publisher')
plt.ylabel('Number of Articles Published')
plt.xticks(rotation=45, ha='right')
plt.grid(True)
plt.show()

# Step 2: Sentiment Analysis by Publisher
def analyze_sentiment(headline):
    """Label a headline "Positive", "Negative" or "Neutral".

    The label follows the sign of the polarity score TextBlob assigns to the
    text: above zero is positive, below zero is negative, anything else is
    neutral.
    """
    score = TextBlob(headline).polarity
    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"

# Label every headline with its sentiment class.
df['sentiment'] = df['headline'].apply(analyze_sentiment)

# Publisher x sentiment table of article counts; combinations that never
# occur are filled with 0.
counts_by_pair = df.groupby(['publisher', 'sentiment']).size()
sentiment_by_publisher = counts_by_pair.unstack().fillna(0)

# Stacked bars: one bar per publisher, one segment per sentiment label.
sentiment_by_publisher.plot(kind='bar', stacked=True, figsize=(12, 6))
plt.title('Sentiment Distribution by Publisher')
plt.xlabel('Publisher')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Sentiment')
plt.grid(True)
plt.show()

# Step 3: Keyword Extraction by Publisher (using TF-IDF)
def get_top_keywords_by_publisher(df, publisher_name, top_n=10):
    """Return up to ``top_n`` vocabulary terms from one publisher's headlines.

    Args:
        df: DataFrame with 'publisher' and 'headline' columns.
        publisher_name: Value compared for equality against ``df['publisher']``.
        top_n: Vocabulary cap handed to ``TfidfVectorizer`` as ``max_features``.

    Returns:
        The array produced by ``get_feature_names_out()`` after fitting the
        vectorizer on the selected headlines (English stop words removed).

    NOTE(review): scikit-learn documents ``max_features`` as keeping the terms
    with the highest term frequency across the corpus, so the result is
    frequency-selected rather than ranked by TF-IDF weight - confirm that this
    matches the intent of "top keywords".
    """
    publisher_headlines = df.loc[df['publisher'] == publisher_name, 'headline']
    vectorizer = TfidfVectorizer(stop_words='english', max_features=top_n)
    # Only the fitted vocabulary is used, so fit() is enough; the TF-IDF matrix
    # that fit_transform() would return was never read.
    vectorizer.fit(publisher_headlines)
    return vectorizer.get_feature_names_out()

# publisher_counts is ordered by article count, so its first label is the
# publisher with the most articles; extract that publisher's keywords.
top_publisher = publisher_counts.index[0]
top_keywords = get_top_keywords_by_publisher(df, top_publisher)

# Heading line followed by the keyword array.
print(f"Top 10 keywords for {top_publisher}:", top_keywords, sep="\n")

# Step 4: Analyze topic distribution (e.g., using TF-IDF or Named Entity Recognition)
# You can follow a similar method to extract entities or noun phrases for more advanced topic analysis.


In [None]:
# If email addresses are used as publisher names, identify unique domains to see if certain organizations contribute more frequently.

import pandas as pd
import matplotlib.pyplot as plt
import re

# Read the raw analyst-ratings CSV.
file_path = "../assets/data/raw_analyst_ratings.csv"
df = pd.read_csv(file_path)

# The domain analysis reads the 'publisher' column; stop early without it.
publisher_present = 'publisher' in df.columns
if not publisher_present:
    raise ValueError("The dataset must contain a 'publisher' column.")

# Step 1: Extract domain from email addresses
def extract_domain(email):
    """Return the domain part of an e-mail style publisher name, or None.

    Args:
        email: Publisher value, normally a string. Non-string values (for
            example the NaN that pandas stores for an empty cell, or None)
            are treated as having no domain instead of raising TypeError.

    Returns:
        The run of letters, digits, dots and hyphens that follows the first
        '@' immediately followed by at least one such character, or None
        when the value is not a string or contains no such '@'.
    """
    if not isinstance(email, str):
        return None
    # Capture everything after '@' that is a letter, digit, dot or hyphen.
    match = re.search(r'@([A-Za-z0-9.-]+)', email)
    return match.group(1) if match else None

# Derive a 'domain' column by running extract_domain over every publisher value.
df['domain'] = df['publisher'].apply(extract_domain)

# Step 2: occurrences of each domain (value_counts leaves out missing values).
domain_counts = df['domain'].value_counts()

# Step 3: bar chart of the ten most frequent domains.
top_domains = domain_counts.head(10)
plt.figure(figsize=(12, 6))
top_domains.plot(kind='bar', color='b')
plt.title('Top 10 Domains Contributing Most to the News Feed')
plt.xlabel('Domain (Organization)')
plt.ylabel('Number of Articles Published')
plt.xticks(rotation=45, ha='right')
plt.grid(True)
plt.show()

# Optional: print the same ten domains with their counts.
print("\nTop Domains by Frequency:", top_domains, sep="\n")
