<a href="https://colab.research.google.com/github/leduong04/Time-Series/blob/main/Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
!pip install filterpy

Collecting filterpy
  Downloading filterpy-1.4.5.zip (177 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/178.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m143.4/178.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.0/178.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: filterpy
  Building wheel for filterpy (setup.py) ... [?25l[?25hdone
  Created wheel for filterpy: filename=filterpy-1.4.5-py3-none-any.whl size=110460 sha256=d35054609ab19cba2504d316307b136b5c34c399795a5b367462d3761d872d3e
  Stored in directory: /root/.cache/pip/wheels/12/dc/3c/e12983eac132d00f82a20c6cbe7b42ce6e96190ef8fa2d15e1
Successfully built filterpy
Installing collected packages: filterpy
Successfully installed filterpy-1.4.5


In [42]:
import pandas as pd
import numpy as np
from filterpy.kalman import KalmanFilter
import plotly.graph_objects as go

# Build the basic Kalman filter
def initialize_kalman_filter_basic():
    """Create the baseline filter with 2 state variables and 1 measurement.

    The state vector is laid out as [Listening_Time, velocity]; H selects
    only the first component as the observed quantity.

    Returns:
        The configured KalmanFilter instance (x, F, H, P, R and Q are set).
    """
    n_states = 2
    kf = KalmanFilter(dim_x=n_states, dim_z=1)

    # Noise model.
    kf.R = 5  # measurement noise covariance
    kf.Q = np.array([
        [0.1, 0.01],
        [0.01, 0.1],
    ])  # process noise covariance

    # Transition: each step adds the second state to the first and keeps
    # the second state unchanged.
    kf.F = np.array([
        [1., 1.],
        [0., 1.],
    ])
    # Observation: only the first state is measured.
    kf.H = np.array([[1., 0.]])

    # Start from an all-zero state and scale the covariance the library
    # created at construction time by 1000.
    kf.x = np.zeros((n_states, 1))
    kf.P *= 1000.
    return kf

# Build the Kalman filter that carries extra feature states
def initialize_kalman_filter_features():
    """Create the 4-state, 1-measurement filter used with extra features.

    The state vector is laid out as
    [Listening_Time, velocity, host_effect, guest_ads_effect]; H selects
    only the first component as the observed quantity.

    Returns:
        The configured KalmanFilter instance (x, F, H, P, R and Q are set).
    """
    n_states = 4
    kf = KalmanFilter(dim_x=n_states, dim_z=1)

    # Noise model.
    kf.R = 5  # measurement noise covariance
    kf.Q = np.array([
        [0.1, 0.01, 0., 0.],
        [0.01, 0.1, 0., 0.],
        [0., 0., 0.1, 0.],
        [0., 0., 0., 0.1],
    ])  # process noise covariance

    # Transition: the first state gains the second each step; the two
    # feature states are carried over unchanged.
    kf.F = np.array([
        [1., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
    ])
    # Observation: only the first state is measured.
    kf.H = np.array([[1., 0., 0., 0.]])

    # Start from an all-zero state and scale the covariance the library
    # created at construction time by 1000.
    kf.x = np.zeros((n_states, 1))
    kf.P *= 1000.
    return kf

# Build the Kalman filter used with a dynamic transition
def initialize_kalman_filter_dynamic():
    """Create the 2-state, 1-measurement filter for the dynamic variant.

    The state vector is laid out as [Listening_Time, dynamic_factor]; H
    selects only the first component as the observed quantity. The values
    set here are starting values only.

    Returns:
        The configured KalmanFilter instance (x, F, H, P, R and Q are set).
    """
    kf = KalmanFilter(dim_x=2, dim_z=1)

    # Initial state (all zeros) and covariance (library default scaled by 1000).
    kf.x = np.zeros((2, 1))
    kf.P *= 1000.

    # Transition and observation matrices.
    transition = np.array([
        [1., 1.],
        [0., 1.],
    ])
    observation = np.array([[1., 0.]])
    kf.F = transition
    kf.H = observation

    # Measurement and process noise covariances.
    process_noise = np.array([
        [0.1, 0.01],
        [0.01, 0.1],
    ])
    kf.R = 5
    kf.Q = process_noise
    return kf

# Run a Kalman filter over a sequence and collect its estimates
def run_kalman_filter(kf, measurements, is_training=True):
    """Step the filter once per measurement and record the first state.

    Args:
        kf: Filter object exposing predict(), update(z) and a 2-D state
            array ``x``. It is advanced in place.
        measurements: Iterable of observations. When ``is_training`` is
            False the values are not used; only the number of items
            determines how many steps are taken.
        is_training: If True, each step calls ``kf.update(z)`` after
            ``kf.predict()``; if False, only ``kf.predict()`` is called.

    Returns:
        list: ``kf.x[0, 0]`` captured at the end of every step (after the
        update when ``is_training`` is True).
    """
    def _advance(observation):
        # One filter step; returns the first state component afterwards.
        kf.predict()
        if is_training:
            kf.update(observation)
        return kf.x[0, 0]

    return [_advance(observation) for observation in measurements]

# Run the feature-augmented Kalman filter and collect its estimates
def run_kalman_filter_features(kf, measurements, host_popularity, guest_popularity, num_ads, is_training=True):
    """Step the filter per row, nudging states 2 and 3 from the features.

    Args:
        kf: Filter object exposing predict(), update(z) and a 2-D state
            array ``x`` with at least four rows. It is advanced in place.
        measurements: Iterable of observations; ignored value-wise when
            ``is_training`` is False.
        host_popularity: Per-row host popularity values.
        guest_popularity: Per-row guest popularity values.
        num_ads: Per-row ad counts.
        is_training: If True, ``kf.update(z)`` is called after each
            ``kf.predict()``.

    Returns:
        list: ``kf.x[0, 0]`` captured at the end of every step. The inputs
        are zipped, so the number of steps equals the shortest input.
    """
    estimates = []
    rows = zip(measurements, host_popularity, guest_popularity, num_ads)
    for observation, host, guest, ads in rows:
        kf.predict()
        if is_training:
            kf.update(observation)

        # Feature-driven increments are added directly to the state after
        # the predict/update step.
        # NOTE(review): whether these two states influence the recorded
        # x[0, 0] depends on the F and H of the filter passed in - confirm
        # they are coupled to the first state if that is the intent.
        host_increment = 0.05 * host
        guest_ads_increment = 0.03 * guest - 0.1 * ads
        kf.x[2] += host_increment
        kf.x[3] += guest_ads_increment

        estimates.append(kf.x[0, 0])
    return estimates

# Run the Kalman filter whose transition depends on the episode length
def run_kalman_filter_dynamic(kf, measurements, episode_lengths, is_training=True):
    """Step the filter per row with a row-specific transition matrix.

    For every row, ``kf.F[0, 1]`` is set to ``0.01 * length`` *before*
    ``kf.predict()`` is called, so the prediction for a row is driven by
    that row's own episode length (setting it after the predict step would
    apply each length to the following row instead).

    Args:
        kf: Filter object exposing predict(), update(z), a 2-D state array
            ``x`` and a transition matrix ``F``. It is advanced in place,
            and ``F[0, 1]`` is overwritten, so after the call ``kf.F``
            holds the value derived from the last processed row.
        measurements: Iterable of observations; ignored value-wise when
            ``is_training`` is False.
        episode_lengths: Per-row episode lengths.
        is_training: If True, ``kf.update(z)`` is called after each
            ``kf.predict()``.

    Returns:
        list: ``kf.x[0, 0]`` captured at the end of every step. The inputs
        are zipped, so the number of steps equals the shorter input.
    """
    predictions = []
    for z, length in zip(measurements, episode_lengths):
        # Configure the transition for this row first, then predict with it.
        kf.F[0, 1] = 0.01 * length
        kf.predict()
        if is_training:
            kf.update(z)
        predictions.append(kf.x[0, 0])
    return predictions

# Print the parameters of a model
def print_model_parameters(kf, model_name):
    """Print the F, H, R, Q and P attributes of a filter under a heading.

    The values are read from ``kf`` at call time, so the report reflects
    whatever the attributes hold when this function is invoked.

    Args:
        kf: Object exposing the attributes F, H, R, Q and P.
        model_name: Label used in the heading line.

    Returns:
        None. The report is written to standard output.
    """
    print(
        f"\nParameters for {model_name}:",
        f"State Transition Matrix (F):\n{kf.F}",
        f"Observation Matrix (H):\n{kf.H}",
        f"Measurement Noise Covariance (R): {kf.R}",
        f"Process Noise Covariance (Q):\n{kf.Q}",
        f"Initial State Covariance (P):\n{kf.P}",
        sep="\n",
    )

# Load the train and test sets (change the paths here if the files live elsewhere).
# DataFrame.info() writes its report itself and returns None, so each label is
# printed on its own line first instead of passing info()'s result to print().
train_data = pd.read_csv('/content/train.csv')
print("TRAIN.INFO")
train_data.info()
test_data = pd.read_csv('/content/test.csv')
print("TEST.INFO")
test_data.info()

# Impute missing feature values with the training-set mean (the test set is
# filled with the *training* mean as well, so no test statistics are used).
# Every feature column that is later fed to a filter is covered, including
# Number_of_Ads: a single NaN added to the filter state by
# run_kalman_filter_features would turn all subsequent estimates into NaN.
imputed_columns = [
    'Episode_Length_minutes',
    'Guest_Popularity_percentage',
    'Host_Popularity_percentage',
    'Number_of_Ads',
]
for column in imputed_columns:
    train_mean = train_data[column].mean()
    train_data[column] = train_data[column].fillna(train_mean)
    test_data[column] = test_data[column].fillna(train_mean)

# Keep only the rows published on Saturday, ordered by id.
train_saturday = train_data[train_data['Publication_Day'] == 'Saturday'].sort_values('id')
print("train_saturday.INFO")
train_saturday.info()

test_saturday = test_data[test_data['Publication_Day'] == 'Saturday'].sort_values('id')
print("test_saturday.INFO")
test_saturday.info()

# Training arrays.
train_listening_times = train_saturday['Listening_Time_minutes'].values
train_host_popularity = train_saturday['Host_Popularity_percentage'].values
train_guest_popularity = train_saturday['Guest_Popularity_percentage'].values
train_num_ads = train_saturday['Number_of_Ads'].values
train_episode_lengths = train_saturday['Episode_Length_minutes'].values

# Test arrays.
test_host_popularity = test_saturday['Host_Popularity_percentage'].values
test_guest_popularity = test_saturday['Guest_Popularity_percentage'].values
test_num_ads = test_saturday['Number_of_Ads'].values
test_episode_lengths = test_saturday['Episode_Length_minutes'].values
# Placeholder measurements for the test set (it has no target column here):
# one copy of the training mean per test row.
test_measurements = np.full(len(test_saturday), train_listening_times.mean())

# Fit each filter on the training series and score it against the targets.
# NOTE(review): with is_training=True the recorded value is taken after the
# filter has been updated with the same row's measurement, so these MSE values
# describe filtered estimates rather than out-of-sample forecasts - confirm
# that this is the intended metric.

# Model 1: basic Kalman filter
kf_basic = initialize_kalman_filter_basic()
predictions_basic_train = run_kalman_filter(
    kf_basic,
    train_listening_times,
    is_training=True,
)
residuals_basic_train = train_listening_times - predictions_basic_train
mse_basic_train = np.mean(residuals_basic_train ** 2)
print_model_parameters(kf_basic, "Basic Kalman Filter")

# Model 2: Kalman filter with extra feature states
kf_features = initialize_kalman_filter_features()
predictions_features_train = run_kalman_filter_features(
    kf_features,
    train_listening_times,
    train_host_popularity,
    train_guest_popularity,
    train_num_ads,
    is_training=True,
)
residuals_features_train = train_listening_times - predictions_features_train
mse_features_train = np.mean(residuals_features_train ** 2)
print_model_parameters(kf_features, "Features Kalman Filter")

# Model 3: Kalman filter with a dynamic transition
kf_dynamic = initialize_kalman_filter_dynamic()
predictions_dynamic_train = run_kalman_filter_dynamic(
    kf_dynamic,
    train_listening_times,
    train_episode_lengths,
    is_training=True,
)
residuals_dynamic_train = train_listening_times - predictions_dynamic_train
mse_dynamic_train = np.mean(residuals_dynamic_train ** 2)
print_model_parameters(kf_dynamic, "Dynamic Kalman Filter")

# Report the training-set performance.
print("\nPerformance on Train Set (MSE):")
for summary_line in (
    f"Basic Kalman Filter: {mse_basic_train:.2f}",
    f"Features Kalman Filter: {mse_features_train:.2f}",
    f"Dynamic Kalman Filter: {mse_dynamic_train:.2f}",
):
    print(summary_line)

# Save the training-set estimates.
train_columns = {
    'id': train_saturday['id'],
    'Actual': train_listening_times,
    'Basic_Prediction': predictions_basic_train,
    'Features_Prediction': predictions_features_train,
    'Dynamic_Prediction': predictions_dynamic_train
}
train_results = pd.DataFrame(train_columns)
train_results.to_csv('kalman_predictions_train.csv', index=False)

# Predict on the test set. The filters continue from the state they reached
# at the end of training and take predict-only steps (is_training=False), so
# test_measurements only determines how many steps are taken.
predictions_basic_test = run_kalman_filter(
    kf_basic,
    test_measurements,
    is_training=False,
)
predictions_features_test = run_kalman_filter_features(
    kf_features,
    test_measurements,
    test_host_popularity,
    test_guest_popularity,
    test_num_ads,
    is_training=False,
)
predictions_dynamic_test = run_kalman_filter_dynamic(
    kf_dynamic,
    test_measurements,
    test_episode_lengths,
    is_training=False,
)

# Save the test-set predictions.
test_columns = {
    'id': test_saturday['id'],
    'Basic_Prediction': predictions_basic_test,
    'Features_Prediction': predictions_features_test,
    'Dynamic_Prediction': predictions_dynamic_test
}
test_results = pd.DataFrame(test_columns)
test_results.to_csv('kalman_predictions_test.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 68.7+ MB
