In [15]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

### 1 - Random Split

- **`X`**: Represents the **features (input data)** of the dataset.
    - It could be a matrix where each row corresponds to a sample, and each column corresponds to a feature.
- **`y`**: Represents the **target labels (output data)** corresponding to the `X` samples.
    - Classification: discrete class labels.
    - Regression: continuous values.

In [18]:
# Load the diabetes dataset that ships with scikit-learn; its .data and .target
# attributes are read below to build X and y
diabetes = load_diabetes()

In [19]:
# Feature matrix (X) and target values (y) taken from the loaded diabetes dataset
X, y = diabetes.data, diabetes.target

In [20]:
# Split the dataset into 50% training and 50% testing (test_size=0.5)
# random_state is an arbitrary seed: its specific value has no deeper meaning, it only makes the
# shuffle, and therefore the resulting split, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [29]:
# (rows, columns) of the training split: number of samples and number of features
X_train.shape 

(221, 10)

In [21]:
# (rows, columns) of the test split: number of samples and number of features
X_test.shape

(221, 10)

### 2 - Stratified Split

In [56]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

In [58]:
# Load the wine dataset; its .target values are the labels the split below stratifies on
wine = load_wine()

In [60]:
# Take every feature column as X and the target (output) variable as y
X, y = wine.data, wine.target

# Hold out 20% of the samples for testing (80% train / 20% test).
# stratify=y keeps the class proportions of y the same in both subsets;
# random_state=42 pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

- The purpose of stratifying the split is to maintain the proportions of target classes (in y) in both the training and testing data
- Stratification is used to ensure that the distribution of classes in the **target variable y** is preserved across the training and testing sets.

In [63]:
# (samples, features) in the stratified training split
X_train.shape

(142, 13)

In [65]:
# (samples, features) in the stratified test split
X_test.shape

(36, 13)

In [52]:
# Another example: stratification on a synthetic, imbalanced binary target
from sklearn.model_selection import train_test_split
import numpy as np

# Example data. The features come from a seeded generator so that X is the same
# on every run (Restart & Run All reproduces the notebook exactly).
rng = np.random.default_rng(42)
X = rng.random((1000, 5))  # 1000 samples, 5 features, uniform in [0, 1)
y = np.array([0] * 700 + [1] * 300)  # 70% class 0, 30% class 1

# Split the data in half, stratifying on y so each half keeps the 70/30 class mix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y, random_state=42)

# Calculate the proportions of each class in y_train and y_test
# (np.bincount counts occurrences of each integer label; dividing by the length gives fractions)
train_class_proportions = np.bincount(y_train) / len(y_train)
test_class_proportions = np.bincount(y_test) / len(y_test)

# Print the proportions
print(f"Class proportions in y_train: {train_class_proportions}")
print(f"Class proportions in y_test: {test_class_proportions}")

# both y_train and y_test have the same class distribution as the original target variable y
# with 70% of class 0 and 30% of class 1 in both the training and test sets.
# Check that claim instead of relying on reading the printed output.
overall_class_proportions = np.bincount(y) / len(y)
assert np.allclose(train_class_proportions, overall_class_proportions)
assert np.allclose(test_class_proportions, overall_class_proportions)

Class proportions in y_train: [0.7 0.3]
Class proportions in y_test: [0.7 0.3]


### 3 - Time-based Split

In [None]:
import yfinance as yf

In [None]:
# Download AAPL price history for the window 2020-01-01 .. 2022-12-31.
# This cell needs network access, so a fresh run may return different or no data.
# NOTE(review): yfinance is believed to treat `end` as exclusive — confirm if the last day matters.
data = yf.download('AAPL', start='2020-01-01', end='2022-12-31')

In [None]:
# Chronological split: no shuffling, the past trains and the future tests.
# Label slices with .loc include both endpoints, and the two boundary dates are
# adjacent days, so no row can fall into both sets.
# (assumes `data` is indexed by date — TODO confirm against the download above)
train_end = '2020-12-31'
test_start = '2021-01-01'
train_data = data.loc[:train_end]
test_data = data.loc[test_start:]

In [None]:
# Show the first few rows of each split, each under its own heading.
# The leading "\n" on the second heading leaves a blank line between the two previews.
for heading, frame in (("Training data:", train_data), ("\nTesting data:", test_data)):
    print(heading)
    print(frame.head())