In [3]:
import pandas as pd

# Load the CSV file to check its structure and contents
file_path = 'rank_keywords_studylink_cleaned_imputed.csv'
data = pd.read_csv(file_path)

# Display the first few rows and column information of the data
data_info = data.info()
data_preview = data.head()

data_info, data_preview


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6569 entries, 0 to 6568
Data columns (total 50 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Keyword       6569 non-null   object 
 1   EndRank       6569 non-null   int64  
 2   EndClicks     6569 non-null   int64  
 3   SearchVolume  6569 non-null   int64  
 4   description   6569 non-null   object 
 5   TFIDF_score   6569 non-null   float64
 6   May-21        6569 non-null   float64
 7   Jun-21        6569 non-null   float64
 8   Jul-21        6569 non-null   float64
 9   Aug-21        6569 non-null   float64
 10  Sept-21       6569 non-null   float64
 11  Oct-21        6569 non-null   float64
 12  Nov-21        6569 non-null   float64
 13  Dec-21        6569 non-null   float64
 14  Jan-22        6569 non-null   float64
 15  Feb-22        6569 non-null   float64
 16  Mar-22        6569 non-null   float64
 17  Apr-22        6569 non-null   float64
 18  May-22        6569 non-null 

(None,
       Keyword  EndRank  EndClicks  SearchVolume  \
 0  calculator       34       1919        358000   
 1        nzqa       62        302        101000   
 2   studylink        1      13333         81300   
 3        myir       90        113         54300   
 4   ird login       49        182         48600   
 
                                          description  TFIDF_score  May-21  \
 0  We help students make informed choices about t...     0.002490    32.0   
 1  We help students make informed choices about t...     0.002157    54.0   
 2  We help students make informed choices about t...     0.002225     1.0   
 3  We help students make informed choices about t...     0.002112    53.8   
 4  We help students make informed choices about t...     0.002396    40.6   
 
    Jun-21  Jul-21  Aug-21  ...  Mar-24  Apr-24  May-24  Jun-24  Jul-24  \
 0    56.0    34.6    50.0  ...    29.0    22.6    30.0    20.2    22.0   
 1    50.8    57.4    62.8  ...    76.0    60.2    63.0    

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
import numpy as np

# Selecting the relevant columns for time series data
time_columns = [col for col in data.columns if '-' in col]  # Monthly columns
ranks = data[time_columns]

# Prepare training data for Linear Regression
X = np.arange(len(time_columns)).reshape(-1, 1)  # Time steps
y = ranks.values  # Ranks

# Train Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X, y.mean(axis=0))  # Fit to average ranks across keywords
lr_predictions = lr_model.predict(np.arange(len(time_columns), len(time_columns) + 6).reshape(-1, 1))

# Prepare ARIMA Model for one keyword (e.g., "calculator")
arima_keyword_idx = 0  # Selecting the first keyword for simplicity
arima_model = ARIMA(ranks.iloc[arima_keyword_idx], order=(1, 1, 1))
arima_fitted = arima_model.fit()
arima_forecast = arima_fitted.forecast(steps=6)

# Evaluate Models
true_values = ranks.iloc[arima_keyword_idx, -6:]  # True values for the last 6 months

# Calculate errors for Linear Regression
lr_mae = mean_absolute_error(true_values, lr_predictions)
lr_mse = mean_squared_error(true_values, lr_predictions)

# Calculate errors for ARIMA
arima_mae = mean_absolute_error(true_values, arima_forecast)
arima_mse = mean_squared_error(true_values, arima_forecast)

# Comparison results
{
    "Linear Regression": {"MAE": lr_mae, "MSE": lr_mse},
    "ARIMA": {"MAE": arima_mae, "MSE": arima_mse},
    "LR Predictions": lr_predictions,
    "ARIMA Forecast": arima_forecast
}


  _index = to_datetime(index)
  self._init_dates(dates, freq)
  _index = to_datetime(index)
  self._init_dates(dates, freq)
  _index = to_datetime(index)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


{'Linear Regression': {'MAE': 29.531267475228002, 'MSE': 894.3928123736574},
 'ARIMA': {'MAE': 5.207984182837892, 'MSE': 43.44750686148626},
 'LR Predictions': array([53.84367803, 53.95871381, 54.07374959, 54.18878537, 54.30382115,
        54.41885693]),
 'ARIMA Forecast': 44    31.693698
 45    24.272023
 46    29.839330
 47    25.663061
 48    28.795855
 49    26.445815
 Name: predicted_mean, dtype: float64}

In [7]:
# Update the Python code to clearly display the keyword used in the analysis
# and include a table format for comparison

# Display the keyword being analyzed
keyword_analyzed = data.iloc[arima_keyword_idx]['Keyword']

# Create a comparison table
comparison_table = pd.DataFrame({
    "Metric": ["MAE (Mean Absolute Error)", "MSE (Mean Squared Error)", "Predicted Ranks (Next 6 Months)"],
    "Linear Regression": [lr_mae, lr_mse, list(np.round(lr_predictions, 2))],
    "ARIMA": [arima_mae, arima_mse, list(np.round(arima_forecast.values, 2))]
})

# Display the keyword analyzed and the comparison table
keyword_analyzed, comparison_table


('calculator',
                             Metric                          Linear Regression  \
 0        MAE (Mean Absolute Error)                                  29.531267   
 1         MSE (Mean Squared Error)                                 894.392812   
 2  Predicted Ranks (Next 6 Months)  [53.84, 53.96, 54.07, 54.19, 54.3, 54.42]   
 
                                        ARIMA  
 0                                   5.207984  
 1                                  43.447507  
 2  [31.69, 24.27, 29.84, 25.66, 28.8, 26.45]  )

In [9]:
# Update the table to ensure values are presented clearly without line breaks
comparison_table['ARIMA'] = comparison_table['ARIMA'].apply(lambda x: str(x))  # Convert list to string for better display
comparison_table


Unnamed: 0,Metric,Linear Regression,ARIMA
0,MAE (Mean Absolute Error),29.531267,5.207984182837892
1,MSE (Mean Squared Error),894.392812,43.44750686148626
2,Predicted Ranks (Next 6 Months),"[53.84, 53.96, 54.07, 54.19, 54.3, 54.42]","[31.69, 24.27, 29.84, 25.66, 28.8, 26.45]"
