In [20]:
import os
import pandas as pd
import numpy as np

In [27]:
#part (ab)
folder_paths = ["bending1", "bending2", "cycling", "lying", 
                "sitting", "standing", "walking"]

columns = ['time', 'avg_rss12', 'var_rss12', 'avg_rss13', 
           'var_rss13', 'avg_rss23', 'var_rss23']

train_data = []
test_data = []
expected_columns = 7
problematic_lines = []

def detect_separator(file_path):
    with open(file_path, 'r') as file:
        for _ in range(5):
            next(file)
        sample = file.read(200)  
    
    if ',' in sample:
        return ','
    else:
        return '\s+' 

def process_file(file_path):
    sep = detect_separator(file_path)
    df = pd.read_csv(file_path, comment='#', header=None, usecols=range(7), sep=sep)
    df.columns = columns
    
    return df


for folder in folder_paths:
    overall_path = os.getcwd()
    new_path = os.path.dirname(overall_path)
    path = os.path.join(new_path, 'data', 'AReM', folder)
    
    files = os.listdir(path)
    
    for idx, file in enumerate(files):
        
        file_path = os.path.join(path, file)    
        df = process_file(file_path)
        if folder == "bending1" or folder == "bending2":
            if idx < 2:
                test_data.append(df)
            else:
                train_data.append(df)
        else:
            if idx < 3:
                test_data.append(df)
            else:
                train_data.append(df)

print(f"Test Data: {len(test_data)} files")
print(f"Train Data: {len(train_data)} files")

Test Data: 19 files
Train Data: 69 files


c(i)
 Research what types of time-domain features are usually used in time series classification and list them (examples are minimum, maximum, mean, etc)
 
Based on my research:
"In time series classification, time-domain features are extracted directly from the raw time series data. These features help capture the statistical properties and patterns of the data over time."

Here are some time-domain features:
Mean, Standard Deviation, Minimum and Maximum values, range, median, skewness, quartiles and IQR, Root Mean Square, Absolute Mean Difference, Entropy, trend

In [24]:
#2(ii)
def extract_features(df):
    features = {}
    for i, col in enumerate(df.columns[1:]): 
        features[f'min_{i+1}'] = df[col].min()
        features[f'max_{i+1}'] = df[col].max()
        features[f'mean_{i+1}'] = df[col].mean()
        features[f'median_{i+1}'] = df[col].median()
        features[f'std_{i+1}'] = df[col].std()
        features[f'1st_quart_{i+1}'] = df[col].quantile(0.25)
        features[f'3rd_quart_{i+1}'] = df[col].quantile(0.75)
    return features

train_features = pd.DataFrame([extract_features(df) for df in train_data])
test_features = pd.DataFrame([extract_features(df) for df in test_data])

total_features = pd.concat([train_features, test_features], axis=0).round(2)
print(total_features.head())


   min_1  max_1  mean_1  median_1  std_1  1st_quart_1  3rd_quart_1  min_2  \
0  35.00  47.40   43.95     44.33   1.56        43.00        45.00    0.0   
1  33.00  47.75   42.18     43.50   3.67        39.15        45.00    0.0   
2  33.00  45.75   41.68     41.75   2.24        41.33        42.75    0.0   
3  37.00  48.00   43.45     43.25   1.39        42.50        45.00    0.0   
4  36.25  48.00   43.97     44.50   1.62        43.31        44.67    0.0   

   max_2  mean_2  ...  std_5  1st_quart_5  3rd_quart_5  min_6  max_6  mean_6  \
0   1.70    0.43  ...   2.00        35.36        36.50    0.0   1.79    0.49   
1   3.00    0.70  ...   3.85        30.46        36.33    0.0   2.18    0.61   
2   2.83    0.54  ...   2.41        28.46        31.25    0.0   1.79    0.38   
3   1.58    0.38  ...   2.49        22.25        24.00    0.0   5.26    0.68   
4   1.50    0.41  ...   3.32        20.50        23.75    0.0   2.96    0.56   

   median_6  std_6  1st_quart_6  3rd_quart_6  
0      0.

In [25]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler_min_max = MinMaxScaler()
normalized_min_max = scaler_min_max.fit_transform(total_features)
normalized_data_min_max = pd.DataFrame(normalized_min_max, 
                                       columns=total_features.columns).round(2)

scaler_z_score = StandardScaler()
normalized_z_score = scaler_z_score.fit_transform(total_features)
normalized_data_z_score = pd.DataFrame(normalized_z_score, 
                                       columns=total_features.columns).round(2)

print("Min-Max Normalized Features DataFrame:\n")
print(normalized_data_min_max.head())

print("Z-score Normalized Features DataFrame:")
print(normalized_data_z_score.head())


Min-Max Normalized Features DataFrame:

   min_1  max_1  mean_1  median_1  std_1  1st_quart_1  3rd_quart_1  min_2  \
0   0.73   0.66    0.83      0.85   0.20         0.80         0.67    0.0   
1   0.69   0.68    0.75      0.81   0.48         0.64         0.67    0.0   
2   0.69   0.60    0.73      0.74   0.29         0.73         0.59    0.0   
3   0.77   0.69    0.81      0.80   0.18         0.78         0.67    0.0   
4   0.76   0.69    0.83      0.85   0.21         0.81         0.66    0.0   

   max_2  mean_2  ...  std_5  1st_quart_5  3rd_quart_5  min_6  max_6  mean_6  \
0   0.08    0.09  ...   0.04         1.00         1.00    0.0   0.00    0.03   
1   0.15    0.15  ...   0.34         0.86         0.99    0.0   0.03    0.07   
2   0.14    0.12  ...   0.11         0.80         0.83    0.0   0.00    0.00   
3   0.07    0.08  ...   0.12         0.62         0.60    0.0   0.29    0.10   
4   0.06    0.09  ...   0.26         0.57         0.59    0.0   0.10    0.06   

   median_6  std

In [26]:
import numpy as np
import pandas as pd

std_estimates = total_features.std()
confidence_intervals = {}

for column in total_features.columns:
    bootstrap_stds = []
    for i in range(1000):
        sample = total_features[column].sample(frac=1, replace=True)
        bootstrap_stds.append(sample.std())

    lower_percentile = (1 - 0.9) / 2
    upper_percentile = 1 - lower_percentile
    lower = np.percentile(bootstrap_stds, lower_percentile * 100)
    upper = np.percentile(bootstrap_stds, upper_percentile * 100)

    confidence_intervals[column] = (std_estimates[column], lower, upper)

result_df = pd.DataFrame(confidence_intervals, 
                                      index=['Standard Deviation', 
                                             'CI Lower Bound', 
                                             'CI Upper Bound']).T.round(4)

result_df['CI Length'] = result_df['CI Upper Bound'] - result_df['CI Lower Bound']

print("The 90% bootstrap confidence interval for the standard deviation of each feature is:")
print(result_df)


The 90% bootstrap confidence interval for the standard deviation of each feature is:
             Standard Deviation  CI Lower Bound  CI Upper Bound  CI Length
min_1                    9.5700          8.3447         10.8194     2.4747
max_1                    4.3944          3.3122          5.3073     1.9951
mean_1                   5.3352          4.6838          5.8540     1.1702
median_1                 5.4402          4.7990          6.0415     1.2425
std_1                    1.7721          1.5605          1.9458     0.3853
1st_quart_1              6.1535          5.5601          6.6853     1.1252
3rd_quart_1              5.1390          4.4030          5.8254     1.4224
min_2                    0.0000          0.0000          0.0000     0.0000
max_2                    5.0627          4.6144          5.3894     0.7750
mean_2                   1.5743          1.3924          1.7046     0.3122
median_2                 1.4121          1.2463          1.5315     0.2852
std_2          

I would choose the following three most important time domain features:

By side-by-side comparison:
Among min, min_1 is the most important feature because its standard deviation is larger and the confidence interval is shorter, indicating significant changes in the data.
Among max, max_5 is the most important feature because its standard deviation is high and the confidence interval is small, indicating that the data changes greatly and the estimate is stable.
In mean, mean_1 is the most important feature because its standard deviation is large and the confidence interval is narrow, indicating that its estimation accuracy is high and there is large variation in the data.
Among the median, median_1 is the most important feature because its standard deviation is high and the confidence interval is short, indicating better discrimination in the data.
In Std, std_5 is the most important feature because its standard deviation is higher and the confidence interval is shorter, indicating greater data variation and better estimation accuracy.
In Q1, 1st_quart_1 is the most important feature because it has a larger standard deviation and shorter confidence interval.
In Q3, 3rd_quart_1 is the most important feature, exhibiting higher standard deviation and shorter confidence interval.

From these seven statistics, I will further select the three most important time domain features. I will consider them based on the size of the standard deviation, the length of the confidence interval, and the representativeness of the statistics. Among them: mean_1, min_5 , and max_1 are the most representative

The standard deviation of mean_1 is 5.3352 and the confidence interval is [4.7079, 5.8833]. The confidence interval is relatively narrow, indicating that the estimate is stable, and the mean is an important feature in describing the central trend of the data.

min_5 (minimum value of the fifth time series): min_5 has a standard deviation of 6.1240 and a confidence interval of [4.3976, 7.5035]. Although the confidence interval is slightly wider, its standard deviation is larger, indicating that there is more variation in the data. The minimum value can reflect the lower bound of the data and is an important statistical feature.

max_1 (the maximum value of the first time series): max_1 has a standard deviation of 4.3944 and a confidence interval of [3.3320, 5.2827]. The confidence intervals are narrow and the estimates are stable. The maximum value reflects the upper bound of the data, and the combination of the minimum value and the mean value can fully describe the range and distribution of the data.

ISLR 3.7.4
(a) Since the cubic regression model is more complex and has more parameters than the linear regression model, the cubic regression model is able to fit the data more flexibly. So even if the true relationship is linear, the cubic model may still "overfit" the training data, resulting in a lower training RSS. So I would expect the polynomial regression have lower training RSS than the linear regression.
(b) For the test data, when the true relationship is linear, the cubic model introduces unnecessary complexity because it overfits the training data, so for the test set, the cubic model will have a higher test RSS, while the test RSS of the linear regression is expected to be lower than the test RSS of the cubic regression because the linear regression model is simpler and better reflects the true linear relationship.
(c) Polynomial regression has lower training RSS than the linear fit. Since the cubic regression model is more complex and has more parameters than the linear regression model, the cubic regression model is able to fit the data more flexibly. And when we assume that the true relationship between x and y is not linear, we need a more complex and flexible cubic regression model to give us a better fit for nonlinear data.
(d) This depends on the degree of nonlinearity, so we do not have enough information to confirm which one is better. For more complex linear relationships between x and y, the cubic regression model may perform better than the linear regression model on the test set, so the test RSS of cubic regression may be lower than that of linear regression. However, if the degree of nonlinearity between x and y is very low, the cubic regression model will be similar to the answer in (b). They will overfit the training data, so in this case the test RSS of linear regression is expected to be lower than that of cubic regression.