In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
def detect_spikes(df, column='price_change', threshold=2):
    mean = df[column].mean()
    std = df[column].std()
    print(f"Mean of {column}: {mean}")
    print(f"Standard deviation of {column}: {std}")

    df['z_score'] = (df[column] - mean) / std
    df['is_spike'] = df['z_score'].abs() > threshold

    print(f"Number of rows where is_spike is True: {df['is_spike'].sum()}")
    print(f"Number of rows where is_spike is False: {(~df['is_spike']).sum()}")

    return df


In [None]:

# Load the data
df = pd.read_csv('AAPL_Events_and_News_Data.csv')


In [None]:
print("Original dataframe shape:", df.shape)
print("\nFirst few rows of the original dataframe:")
print(df.head())

Original dataframe shape: (250, 10)

First few rows of the original dataframe:
  ticker  event_date  price_change spike   news_date  \
0   AAPL  2023-01-13           3.7   yes  2023-01-12   
1   AAPL  2021-03-03          -4.5   yes  2021-03-02   
2   AAPL  2021-04-13          -4.2   yes  2021-04-12   
3   AAPL  2022-05-02           0.8    no  2022-05-01   
4   AAPL  2022-04-11          -0.7    no  2022-04-10   

                                            headline  \
0          Apple Announces Stock Split (Simulated 1)   
1  Apple Secures Major Government Contract (Simul...   
2          Apple Announces Stock Split (Simulated 3)   
3      Apple Introduces New AI Feature (Simulated 4)   
4      Apple Introduces New AI Feature (Simulated 5)   

                                             content sentiment  \
0  Apple announces a 4-for-1 stock split, making ...   neutral   
1  Apple has won a multi-million dollar contract ...  positive   
2  Apple announces a 4-for-1 stock split, making 

In [None]:
print("\nData types of the columns:")
print(df.dtypes)

print("\nChecking for missing values:")
print(df.isnull().sum())


Data types of the columns:
ticker           object
event_date       object
price_change    float64
spike            object
news_date        object
headline         object
content          object
sentiment        object
keywords         object
entity_count      int64
dtype: object

Checking for missing values:
ticker          0
event_date      0
price_change    0
spike           0
news_date       0
headline        0
content         0
sentiment       0
keywords        0
entity_count    0
dtype: int64


In [None]:
# Detect spikes
df = detect_spikes(df, threshold=1.5)

Mean of price_change: 0.9448
Standard deviation of price_change: 3.507924614412574
Number of rows where is_spike is True: 23
Number of rows where is_spike is False: 227


In [None]:
print("\nDataframe after spike detection:")
print(df[['event_date', 'price_change', 'z_score', 'is_spike']].head(10))


Dataframe after spike detection:
   event_date  price_change   z_score  is_spike
0  2023-01-13           3.7  0.785422     False
1  2021-03-03          -4.5 -1.552143      True
2  2021-04-13          -4.2 -1.466622     False
3  2022-05-02           0.8 -0.041278     False
4  2022-04-11          -0.7 -0.468881     False
5  2021-04-13          -0.5 -0.411867     False
6  2021-12-23          -2.1 -0.867978     False
7  2022-01-25          -1.8 -0.782457     False
8  2022-08-09          -3.8 -1.352595     False
9  2022-10-09          -2.8 -1.067526     False


In [None]:
# Print spikes
spike_df = df[df['is_spike']]
print("\nDetected spikes:")
print(spike_df[['event_date', 'price_change', 'z_score']])



Detected spikes:
     event_date  price_change   z_score
1    2021-03-03          -4.5 -1.552143
16   2023-03-13           6.4  1.555108
50   2021-11-02          -4.6 -1.580650
68   2020-03-24          -4.4 -1.523636
71   2022-04-29           6.3  1.526601
90   2021-04-29          -4.6 -1.580650
102  2020-01-12          -5.0 -1.694677
106  2021-10-16          -4.9 -1.666170
122  2019-04-17           6.8  1.669135
126  2020-05-26           6.9  1.697642
153  2022-09-24           6.8  1.669135
158  2021-05-09          -4.5 -1.552143
162  2022-03-03          -4.5 -1.552143
167  2022-02-15           6.5  1.583614
206  2022-08-02           6.9  1.697642
209  2019-01-06           6.8  1.669135
219  2020-08-19           6.8  1.669135
222  2020-04-13          -4.6 -1.580650
223  2019-03-14           6.6  1.612121
236  2019-11-16           6.9  1.697642
242  2019-09-29          -4.5 -1.552143
245  2020-02-02           6.9  1.697642
249  2020-04-11           6.8  1.669135


In [None]:
print(f"\nNumber of spikes detected: {len(spike_df)}")


Number of spikes detected: 23


In [None]:
# Print some statistics
print(f"\nMin price change: {df['price_change'].min():.2f}")
print(f"Max price change: {df['price_change'].max():.2f}")
print(f"Mean price change: {df['price_change'].mean():.2f}")
print(f"Standard deviation of price change: {df['price_change'].std():.2f}")
print(f"Threshold for spike detection (1.5 standard deviations): {1.5 * df['price_change'].std():.2f}")


Min price change: -5.00
Max price change: 6.90
Mean price change: 0.94
Standard deviation of price change: 3.51
Threshold for spike detection (1.5 standard deviations): 5.26


In [None]:
from datetime import datetime, timedelta

In [None]:
def gather_news_around_spikes(df, days=7):
    spike_dates = df[df['is_spike']]['event_date'].tolist()
    relevant_news = []

    for spike_date in spike_dates:
        spike_date = datetime.strptime(spike_date, '%Y-%m-%d')
        start_date = spike_date - timedelta(days=days)
        end_date = spike_date + timedelta(days=days)

        relevant_news.extend(df[(df['news_date'] >= start_date.strftime('%Y-%m-%d')) &
                                (df['news_date'] <= end_date.strftime('%Y-%m-%d'))]['content'].tolist())

    return relevant_news

In [None]:
relevant_news = gather_news_around_spikes(df)

In [None]:
print(f"Number of relevant news articles: {len(relevant_news)}")

Number of relevant news articles: 88


# **Text Preprocessing and Vectorizaton**


In [None]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in tokens if word.isalnum() and word not in stop_words])


In [None]:
# Preprocess the relevant news
preprocessed_news = [preprocess_text(news) for news in relevant_news]


In [None]:
# Create TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(preprocessed_news)


In [None]:
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

TF-IDF matrix shape: (88, 125)


# **Clustering - DBSCAN**

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:


# Convert similarity to distance
distance_matrix = np.sqrt(2 * (1 - cosine_sim))

  distance_matrix = np.sqrt(2 * (1 - cosine_sim))


In [None]:
# Check for NaN values
print(f"Number of NaN values in distance matrix: {np.isnan(distance_matrix).sum()}")


Number of NaN values in distance matrix: 55


In [None]:
# Replace NaN values with the maximum distance
max_distance = np.nanmax(distance_matrix)
distance_matrix = np.nan_to_num(distance_matrix, nan=max_distance)

In [None]:
# Verify no NaN values remain
print(f"Number of NaN values after replacement: {np.isnan(distance_matrix).sum()}")


Number of NaN values after replacement: 0


In [None]:
# Run DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=1, metric='precomputed')
clusters = dbscan.fit_predict(distance_matrix)


In [None]:
print(f"Number of clusters: {len(set(clusters)) - (1 if -1 in clusters else 0)}")



Number of clusters: 45


In [None]:
print(f"Unique cluster labels: {np.unique(clusters)}")
print(f"Number of points in each cluster: {np.bincount(clusters + 1)}")
print(f"Shape of distance matrix: {distance_matrix.shape}")
print(f"Min distance: {distance_matrix.min()}, Max distance: {distance_matrix.max()}")

Unique cluster labels: [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
Number of points in each cluster: [19  1  1  1  1  1 11  1  1  1  1  1  1  1  1  1  2 13  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  2  1  1  1  1  1  1  1  1]
Shape of distance matrix: (88, 88)
Min distance: 0.0, Max distance: 1.3520366034210365


# **Cluster Analysis**

In [None]:
from collections import Counter

In [None]:
def analyze_clusters(clusters, news, vectorizer):
    cluster_news = {}
    for i, cluster in enumerate(clusters):
        if cluster != -1:  # Ignore noise points
            if cluster not in cluster_news:
                cluster_news[cluster] = []
            cluster_news[cluster].append(news[i])

    for cluster, texts in cluster_news.items():
        print(f"\nCluster {cluster}:")
        print(f"Number of articles: {len(texts)}")

        # Get top terms for the cluster
        tfidf_cluster = vectorizer.transform(texts)
        feature_array = np.array(vectorizer.get_feature_names_out())
        tfidf_sorting = np.argsort(tfidf_cluster.toarray()).flatten()[::-1]
        top_terms = feature_array[tfidf_sorting][:10]

        print(f"Top terms: {', '.join(top_terms)}")

        # Print a sample headline
        print(f"Sample headline: {texts[0][:100]}...")


In [None]:
# Analyze the clusters
analyze_clusters(clusters, relevant_news, vectorizer)


Cluster 0:
Number of articles: 1
Top terms: us, contract, dollar, government, event, generation, data, purposes, apple, simulated
Sample headline: Apple has won a multi-million dollar contract with the US government. This is a simulated event for ...

Cluster 1:
Number of articles: 1
Top terms: 47, facing, store, legal, europe, app, policies, challenges, event, new
Sample headline: Apple is facing new legal challenges in Europe over its app store policies. This is a simulated even...

Cluster 2:
Number of articles: 1
Top terms: 108, excitement, siri, increased, community, tech, features, event, new, data
Sample headline: Apple's new AI-driven features for Siri have increased excitement in the tech community. This is a s...

Cluster 3:
Number of articles: 1
Top terms: 17, siri, community, features, increased, tech, excitement, event, new, simulated
Sample headline: Apple's new AI-driven features for Siri have increased excitement in the tech community. This is a s...

Cluster 4:
Number

# **To identify most impactful clusters**

In [128]:
def associate_clusters_with_spikes(spike_dates, news_dates, clusters, spike_window=7):
    cluster_spike_map = {i: [] for i in set(clusters)}

    for i, (news_date, cluster) in enumerate(zip(news_dates, clusters)):
        news_date = datetime.strptime(news_date, '%Y-%m-%d')
        for spike_date in spike_dates:
            spike_date = datetime.strptime(spike_date, '%Y-%m-%d')
            if abs((news_date - spike_date).days) <= spike_window:
                cluster_spike_map[cluster].append(spike_date.strftime('%Y-%m-%d'))

    return cluster_spike_map

## **Impact of each cluster based on the number of articles and associated spike dates.**

In [129]:
def calculate_cluster_impact(cluster_spike_map, clusters):
    cluster_impact = {}
    for cluster, spike_dates in cluster_spike_map.items():
        cluster_size = sum(1 for c in clusters if c == cluster)
        impact_score = cluster_size * len(set(spike_dates))
        cluster_impact[cluster] = impact_score
    return cluster_impact


# **Uses the top terms from each cluster to summarize its key theme or cause.**

In [130]:
def extract_key_themes(vectorizer, tfidf_matrix, clusters, n_terms=5):
    cluster_themes = {}
    for cluster in set(clusters):
        if cluster != -1:
            cluster_docs = tfidf_matrix[clusters == cluster]
            if cluster_docs.shape[0] > 0:
                centroid = cluster_docs.mean(axis=0)
                ordered_centroid = centroid.argsort()[::-1]
                top_terms = [vectorizer.get_feature_names_out()[i] for i in ordered_centroid[0, :n_terms]]
                cluster_themes[cluster] = ", ".join(top_terms)
    return cluster_themes

# **Function creates a mapping between spike dates and their corresponding clusters, including the themes**

In [131]:
def create_spike_cluster_map(spike_dates, cluster_spike_map, cluster_themes):
    spike_cluster_map = {}
    for spike_date in spike_dates:
        relevant_clusters = []
        for cluster, dates in cluster_spike_map.items():
            if spike_date in dates:
                relevant_clusters.append((cluster, cluster_themes.get(cluster, "No theme")))
        spike_cluster_map[spike_date] = relevant_clusters
    return spike_cluster_map



# **Ranks the clusters for each spike based on their impact score, which considers time proximity,**

In [132]:
def rank_clusters(spike_cluster_map, cluster_impact):
    ranked_clusters = {}
    for spike_date, cluster_list in spike_cluster_map.items():
        ranked = sorted(cluster_list, key=lambda x: cluster_impact.get(x[0], 0), reverse=True)
        ranked_clusters[spike_date] = ranked[:3]  # Top 3 clusters
    return ranked_clusters

# **Function creates a summary for each spike, including the top 3 clusters, their themes, and sample headlines**

In [133]:
def generate_spike_summary(ranked_clusters, relevant_news, clusters):
    summaries = {}
    for spike_date, top_clusters in ranked_clusters.items():
        summary = f"Spike Date: {spike_date}\n"
        for i, (cluster, theme) in enumerate(top_clusters, 1):
            summary += f"Cluster {cluster}: {theme}\n"
            cluster_news = [news for j, news in enumerate(relevant_news) if clusters[j] == cluster]
            if cluster_news:
                summary += f"Sample Headline: {cluster_news[0][:100]}...\n"
        summaries[spike_date] = summary
    return summaries

In [134]:
df = detect_spikes(df, threshold=1.5)
relevant_news = gather_news_around_spikes(df)
preprocessed_news = [preprocess_text(news) for news in relevant_news]


Mean of price_change: 0.9448
Standard deviation of price_change: 3.507924614412574
Number of rows where is_spike is True: 23
Number of rows where is_spike is False: 227


In [135]:
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(preprocessed_news)


In [136]:
cosine_sim = cosine_similarity(tfidf_matrix)
distance_matrix = np.sqrt(2 * (1 - cosine_sim))


  distance_matrix = np.sqrt(2 * (1 - cosine_sim))


In [125]:
np.fill_diagonal(distance_matrix, 0)  # Set diagonal to 0
distance_matrix = np.nan_to_num(distance_matrix, nan=np.nanmax(distance_matrix))

In [138]:
dbscan = DBSCAN(eps=0.5, min_samples=1, metric='precomputed')
clusters = dbscan.fit_predict(distance_matrix)

In [139]:
# Get relevant dates
spike_dates = df[df['is_spike']]['event_date'].tolist()
news_dates = df[df['event_date'].isin(spike_dates)]['news_date'].tolist()


In [143]:
cluster_spike_map = associate_clusters_with_spikes(spike_dates, news_dates, clusters)


In [144]:
cluster_impact = calculate_cluster_impact(cluster_spike_map, clusters)


In [142]:
# Print diagnostic information
print(f"Number of clusters: {len(set(clusters))}")
print(f"Number of relevant news articles: {len(relevant_news)}")
print(f"Shape of distance matrix: {distance_matrix.shape}")
print(f"Number of spike dates: {len(spike_dates)}")
print(f"Number of news dates: {len(news_dates)}")

Number of clusters: 64
Number of relevant news articles: 88
Shape of distance matrix: (88, 88)
Number of spike dates: 23
Number of news dates: 25
