Group data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Input CSV locations (absolute paths on the author's machine).
# NOTE(review): df1 is presumably the per-keyword CPC/date table and df2 the
# keyword -> Cluster assignment — confirm against the files' headers.
data_csv = r'D:\CODING\Project\NVIDIA-Stock-prediction\data\synthesis\data.csv'
classification_csv = r'D:\CODING\Project\NVIDIA-Stock-prediction\data\synthesis\classification\BERT_EmbeddingText_classification.csv'

# Read both files into DataFrames; they are joined on 'keyword' further down.
df1 = pd.read_csv(data_csv)
df2 = pd.read_csv(classification_csv)

# Inner-join the two frames on their shared 'keyword' column (only keywords
# present in both survive), then average 'cpc' within each (date, Cluster)
# group. The result has exactly three columns: 'date', 'Cluster', 'cpc'.
#
# The reducer is given as the string 'mean' rather than np.mean: pandas 2.1+
# emits a FutureWarning when a NumPy callable is passed to a groupby
# aggregation and recommends the string form to keep the current behavior.
# The per-group keyword list that used to be built here was dropped on the
# very next statement, so it is no longer computed at all.
df = (
    pd.merge(df1, df2, on='keyword', how='inner')
    .groupby(['date', 'Cluster'])['cpc']
    .mean()
    .reset_index()
)

# Ensure the date is in datetime format for correct plotting
df['date'] = pd.to_datetime(df['date'])

# Reshape to wide form: one row per date, one column per Cluster, cells hold
# the mean 'cpc'. Rows that share a date after the datetime conversion are
# averaged together by the pivot.
# 'mean' is passed as a string because pandas 2.1+ warns (FutureWarning) when
# a NumPy callable such as np.mean is used as aggfunc.
df_pivot = df.pivot_table(index='date', columns='Cluster', values='cpc', aggfunc='mean')

# Draw one line per cluster on a wide (24x8 inch) figure, with the pivot's
# date index on the x-axis and the averaged CPC on the y-axis.
plt.figure(figsize=(24, 8))

# Each column of the pivot is one cluster's CPC series over time.
for cluster_id, cpc_series in df_pivot.items():
    plt.plot(df_pivot.index, cpc_series, label=f'Cluster {cluster_id}')

# Titles, axis labels and legend; tick labels are rotated so dates don't overlap.
plt.title('CPC Trends by Cluster Over Time')
plt.xlabel('Date')
plt.ylabel('Average CPC')
plt.xticks(rotation=45)
plt.legend(title='Cluster')
plt.show()
