In [11]:
import pandas as pd
from sklearn import mixture
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import acf
import numpy as np

# Read the stock CSV; the 'date' column is parsed and becomes the index
df = pd.read_csv(
    'stock_data_baba.csv',
    index_col='date',
    parse_dates=True,
)

# Columns fed to the clustering step
features = ["open", "low", "high", "close", "volume"]
data = df.loc[:, features]

# Define a function to test whether a series is random
def is_random(series, threshold=0.1, nlags=30):
    """Return whether a series shows no notable autocorrelation.

    The series is considered random when the absolute sample
    autocorrelation at every lag from 1 to ``nlags`` is strictly below
    ``threshold``.

    Parameters
    ----------
    series : array-like
        Observations in time order (a pandas Series or NumPy array).
    threshold : float, default 0.1
        Exclusive upper bound on the absolute autocorrelation.
    nlags : int, default 30
        Highest lag that is checked.

    Returns
    -------
    bool
        True if every checked lag is below ``threshold``, else False.
    """
    # acf() returns lag 0 first; it is skipped so only lags 1..nlags count.
    acf_values = np.abs(acf(series, nlags=nlags)[1:])
    # A NaN autocorrelation fails the `<` comparison, so a window that
    # produces NaNs is reported as not random.
    return bool(np.all(acf_values < threshold))

# Flag every 300-row window of the open price as random or not.
# rolling().apply() stores the result as float: 1.0 (random), 0.0 (not
# random) and NaN for the first 299 rows, where the window is incomplete.
# raw=True passes each window as a NumPy array instead of building a
# Series per window.
df['is_random'] = df['open'].rolling(window=300).apply(is_random, raw=True)

# Rows whose window was judged non-random; NaN rows (incomplete windows)
# compare unequal to 0 and are therefore left out.
non_random = df['is_random'] == 0

# It is a good practice to scale the features before clustering
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data[non_random])

# Estimate the optimal number of components with BIC.
# A mixture cannot have more components than samples, so cap the search.
max_components = min(20, len(scaled_data))
n_components = np.arange(1, max_components + 1)
models = [
    mixture.GaussianMixture(n, covariance_type='full', random_state=0).fit(scaled_data)
    for n in n_components
]
bics = [model.bic(scaled_data) for model in models]
best_idx = int(np.argmin(bics))
best_n = int(n_components[best_idx])

# Reuse the model that achieved the lowest BIC. Refitting without
# random_state would produce a different, non-reproducible model than the
# one the BIC comparison selected.
gmm = models[best_idx]

# Predict the labels for the data points; other rows keep NaN in 'cluster'
df.loc[non_random, 'cluster'] = gmm.predict(scaled_data)

print(df)
df.to_csv('cluster.csv')


                    stock_id     open     low    high    close  volume  \
date                                                                     
2023-06-15 09:30:00     BABA  91.4600  91.450  92.215  92.0700  921881   
2023-06-15 09:31:00     BABA  92.0694  91.260  92.080  91.2900  373924   
2023-06-15 09:32:00     BABA  91.3000  91.230  91.500  91.3246  187062   
2023-06-15 09:33:00     BABA  91.3300  91.300  91.746  91.6500  130774   
2023-06-15 09:34:00     BABA  91.6500  91.645  92.160  92.1269  290249   
...                      ...      ...     ...     ...      ...     ...   
2023-07-03 12:59:00     BABA  84.1050  84.080  84.160  84.1100  154122   
2023-07-03 13:02:00     BABA  84.0700  84.070  84.070  84.0700  192221   
2023-07-03 13:10:00     BABA  84.0700  84.070  84.070  84.0700       0   
2023-07-03 15:30:00     BABA  84.0700  84.070  84.070  84.0700       0   
2023-07-03 16:00:00     BABA  84.1900  84.190  84.190  84.1900     161   

                     is_random  clust