Trend of groups

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the CPC history and the keyword -> cluster assignments
df1 = pd.read_csv(r'data.csv')
df2 = pd.read_csv(r'BERT_EmbeddingText_classification.csv')

# Merge the DataFrames on 'keyword'; the inner join keeps only keywords
# that appear in both files
df = pd.merge(df1, df2, on='keyword', how='inner')

# Convert 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Group by month and cluster, then calculate mean CPC.
# The aggregation is named ('mean') rather than passed as the np.mean
# callable: recent pandas versions deprecate NumPy callables in agg, and the
# string alias produces the same result.
df['month'] = df['date'].dt.to_period('M')  # Extract month from the date
df_monthly = df.groupby(['month', 'Cluster']).agg({'cpc': 'mean'}).reset_index()

# Convert 'month' back to datetime for plotting
df_monthly['month'] = df_monthly['month'].dt.to_timestamp()

# Pivot to one column per cluster, indexed by month
df_pivot = df_monthly.pivot(index='month', columns='Cluster', values='cpc')

# Plot the monthly mean CPC for each cluster
plt.figure(figsize=(14, 8))
for cluster in df_pivot.columns:
    plt.plot(df_pivot.index, df_pivot[cluster], label=f'Cluster {cluster}')

plt.xlabel('Month')
plt.ylabel('Mean CPC')
plt.title('Monthly Mean CPC for Each Cluster')
plt.xticks(rotation=45)
plt.legend(title='Cluster')
plt.grid(True)  # explicit on: a bare grid() call toggles the current state
plt.show()


trend of keyword

In [None]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv('data.csv')

# Convert 'date' column to datetime format and set it as the index
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# Input keyword to filter data
keyword = input('Enter keyword: ')

# Number of trailing days held out of training and then forecast
holdout_days = 30

# Filter the data for the specific keyword
data = df[df['keyword'] == keyword]

# Check if data exists for the specified keyword
if data.empty:
    print(f"No data found for keyword '{keyword}'")
else:
    # Resample to a continuous daily series; days without observations are
    # filled by linear interpolation of 'cpc'
    data = data.drop('keyword', axis=1)
    data = data.resample('D').mean()
    data['cpc'] = data['cpc'].interpolate(method='linear')
    print(data)

    if len(data) <= holdout_days:
        # With this few rows the training slice below would be empty and the
        # model fit would fail with an unhelpful error
        print(
            f"Not enough data for keyword '{keyword}': "
            f"need more than {holdout_days} days, got {len(data)}"
        )
    else:
        # Split data into train and test sets (test = last `holdout_days` days)
        train_data = data[:-holdout_days]
        test_data = data[-holdout_days:]

        # Define and fit the SARIMA model
        # NOTE(review): the seasonal period is 12 although the series is
        # daily -- confirm this is intended (a weekly cycle would be 7)
        model = SARIMAX(train_data['cpc'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
        model_fit = model.fit()

        # Forecast the held-out days that follow the training window
        forecast = model_fit.forecast(steps=holdout_days)

        # Create forecast index aligned with the date range
        forecast_index = pd.date_range(
            start=train_data.index[-1] + pd.Timedelta(days=1),
            periods=holdout_days,
            freq='D',
        )
        forecast_series = pd.Series(forecast, index=forecast_index)

        # Keep only the trailing 180 days for plotting. Equivalent to the
        # deprecated DataFrame.last('180D'): rows strictly after (last - 180 days).
        data = data.loc[data.index > data.index[-1] - pd.Timedelta(days=180)]

        # Plot actual vs forecasted values
        plt.figure(figsize=(10, 5))
        plt.plot(data.index, data['cpc'], label='Actual CPC', linestyle='solid')
        plt.plot(forecast_index, forecast_series, label='Forecasted CPC', linestyle='solid', color='orange')
        plt.title(f'CPC Forecast for "{keyword}"', fontsize=14)
        plt.xlabel('Date', fontsize=12)
        plt.ylabel('CPC', fontsize=12)
        plt.axvline(x=train_data.index[-1], color='red', linestyle='--', label='Train/Test Split')
        plt.legend()
        plt.grid(True)  # explicit on: a bare grid() call toggles the current state
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Print forecast values
        print(f"Forecasted CPC for '{keyword}':\n", forecast_series)


trend of keyword group

In [None]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt

# Load the CPC history and the keyword -> cluster assignments
df = pd.read_csv('data.csv')
df_cluster = pd.read_csv(r'BERT_EmbeddingText_classification.csv')

# Attach the cluster label by matching on 'keyword' instead of copying the
# column by row position: positional assignment is only correct when both
# files have the same rows in the same order. Duplicated keywords are dropped
# from the lookup so the left join never multiplies rows of `df`.
# NOTE(review): assumes each keyword maps to a single cluster -- confirm
cluster_lookup = (
    df_cluster[['keyword', 'Cluster']]
    .drop_duplicates(subset='keyword')
    .rename(columns={'Cluster': 'cluster'})
)
df = df.merge(cluster_lookup, on='keyword', how='left')

print(df)

df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# Display names of the clusters, indexed by cluster number
map_cluster = ['cooking', 'sport', 'space', 'education', 'travel']

# Number of trailing days held out of training and then forecast
holdout_days = 30

cluster_num = int(input('Choose group of keyword: '))

if not 0 <= cluster_num < len(map_cluster):
    # Reject up front: an out-of-range value would otherwise only fail when
    # the plot title is built, and a negative one would silently wrap around
    print(f"Cluster must be between 0 and {len(map_cluster) - 1}, got {cluster_num}")
else:
    data = df[df['cluster'] == cluster_num]
    print(data)

    if data.empty:
        print(f"No data found for cluster {cluster_num}")
    else:
        # Group by date and calculate the mean CPC for each date
        data = data.groupby('date').agg({
            'cpc': 'mean'
        })

        # Resample data daily and interpolate missing CPC values
        data = data.resample('D').mean()
        data['cpc'] = data['cpc'].interpolate(method='linear')

        if len(data) <= holdout_days:
            # With this few rows the training slice below would be empty
            print(
                f"Not enough data for cluster {cluster_num}: "
                f"need more than {holdout_days} days, got {len(data)}"
            )
        else:
            # Split data into train and test sets (test = last `holdout_days` days)
            train_data = data[:-holdout_days]
            test_data = data[-holdout_days:]

            # Define and fit the SARIMA model
            # NOTE(review): the seasonal period is 12 although the series is
            # daily -- confirm this is intended (a weekly cycle would be 7)
            model = SARIMAX(train_data['cpc'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
            model_fit = model.fit()

            # Forecast the held-out days that follow the training window
            forecast = model_fit.forecast(steps=holdout_days)

            forecast_index = pd.date_range(
                start=train_data.index[-1] + pd.Timedelta(days=1),
                periods=holdout_days,
                freq='D',
            )
            forecast_series = pd.Series(forecast, index=forecast_index)

            # Keep only the trailing 90 days for plotting. Equivalent to the
            # deprecated DataFrame.last('90D'): rows strictly after (last - 90 days).
            data = data.loc[data.index > data.index[-1] - pd.Timedelta(days=90)]

            plt.figure(figsize=(10, 5))
            plt.plot(data.index, data['cpc'], label='Actual CPC', linestyle='solid')
            plt.plot(forecast_index, forecast_series, label='Forecasted CPC', linestyle='solid', color='orange')
            plt.title(f'CPC Forecast for "{map_cluster[cluster_num]}" related keyword', fontsize=14)
            plt.xlabel('Date', fontsize=12)
            plt.ylabel('CPC', fontsize=12)
            plt.axvline(x=train_data.index[-1], color='red', linestyle='--', label='Train/Test Split')
            plt.legend()
            plt.grid(True)  # explicit on: a bare grid() call toggles the current state
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

            # Print forecast values
            print(f'Forecasted CPC for "{map_cluster[cluster_num]}" related keyword:\n', forecast_series)
