# Detecting Reversal Points in US Equities (Lightweight Local Pipeline)

This notebook demonstrates a fully local, dependency-free workflow for the competition.
The environment lacks pandas/NumPy/LightGBM, so the solution relies on Python's standard library
for data loading, exploratory analysis, and a scratch-built decision-tree classifier.


In [1]:
import csv
import random
from collections import Counter, defaultdict
from pathlib import Path
from statistics import mean, median

# Locations of the competition files; later cells open them through these names.
DATA_DIR = Path('Dataset/competition_data')
train_path = DATA_DIR.joinpath('train.csv')
test_path = DATA_DIR.joinpath('test.csv')
print(f'Train path: {train_path}')
print(f'Test path: {test_path}')


Train path: Dataset/competition_data/train.csv
Test path: Dataset/competition_data/test.csv


In [2]:
# Collapse the four raw reversal tags into two classes; a blank tag (or any
# tag not listed here) is counted as 'None'.
label_map = {'': 'None', 'HH': 'H', 'LH': 'H', 'HL': 'L', 'LL': 'L'}
with open(train_path) as f:
    reader = csv.reader(f)
    header = next(reader)
    cls_idx = header.index('class_label')
    # Tally the collapsed class of every data row in a single pass.
    class_counts = Counter(label_map.get(row[cls_idx], 'None') for row in reader)
print('Class distribution:', class_counts)
total = sum(class_counts.values())
for label in ('None', 'H', 'L'):
    share = 100 * class_counts[label] / total
    print(f"{label}: {class_counts[label]} samples ({share:.2f}% of training data)")


Class distribution: Counter({'None': 1820, 'H': 58, 'L': 54})
None: 1820 samples (94.15% of training data)
H: 58 samples (3.00% of training data)
L: 54 samples (2.79% of training data)


In [3]:
# Per-column summary statistics for the four numeric features.
numeric_cols = ['momentum', 'sm_momentum', 'ratio', 'sm_ratio']
values = {col: [] for col in numeric_cols}
# newline='' lets the csv module do its own newline handling, as its docs recommend.
with open(train_path, newline='') as f:
    reader = csv.reader(f)
    header = next(reader)
    idx = {col: header.index(col) for col in numeric_cols}
    for row in reader:
        for col in numeric_cols:
            values[col].append(float(row[idx[col]]))
for col in numeric_cols:
    data = sorted(values[col])
    count = len(data)
    if count == 0:
        # A header-only file would otherwise fail with IndexError below.
        print(f"{col}: no data")
        continue
    # p25/p75 are order statistics taken at the truncated rank (no interpolation).
    p25 = data[int(0.25 * (count - 1))]
    p75 = data[int(0.75 * (count - 1))]
    # statistics.median averages the two middle values when count is even;
    # the truncated-rank lookup used before returned only the lower one.
    p50 = median(data)
    col_mean = sum(data) / count
    print(f"{col}: min={data[0]:.3f} | mean={col_mean:.3f} | median={p50:.3f} | p25={p25:.3f} | p75={p75:.3f} | max={data[-1]:.3f}")


momentum: min=96.088 | mean=99.992 | median=99.985 | p25=99.572 | p75=100.445 | max=102.465
sm_momentum: min=97.709 | mean=99.981 | median=99.965 | p25=99.519 | p75=100.434 | max=102.842
ratio: min=95.328 | mean=100.003 | median=99.982 | p25=99.402 | p75=100.634 | max=103.870
sm_ratio: min=98.494 | mean=99.969 | median=99.981 | p25=99.640 | p75=100.265 | max=101.501


In [4]:
from math import inf

def load_features(path, feature_cols=('momentum', 'sm_momentum', 'ratio', 'sm_ratio')):
    """Read feature rows and integer class targets from a labelled CSV file.

    Args:
        path: CSV file with a header row that contains every name in
            ``feature_cols`` plus a ``class_label`` column.
        feature_cols: Column names to parse as floats, in output order.

    Returns:
        ``(feats, labels)`` where ``feats`` is a list of float lists (one per
        data row) and ``labels`` is a parallel list of class indices:
        0 for a blank or unrecognised tag, 1 for 'HH'/'LH', 2 for 'HL'/'LL'.

    Raises:
        ValueError: if a required column is missing from the header or a
            feature cell is not a valid float.
    """
    # build_tree indexes a three-slot count list with each label, so targets
    # must be the ints 0..2 rather than the display strings in `label_map`.
    class_index = {'': 0, 'HH': 1, 'LH': 1, 'HL': 2, 'LL': 2}
    feats, labels = [], []
    # newline='' lets the csv module do its own newline handling.
    with open(path, newline='') as f:
        reader = csv.reader(f)
        header = next(reader)
        positions = [header.index(col) for col in feature_cols]
        cls_idx = header.index('class_label')
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; skip them.
                continue
            feats.append([float(row[pos]) for pos in positions])
            labels.append(class_index.get(row[cls_idx], 0))
    return feats, labels

# Parse the training file once; build_tree reads these module-level names directly.
features, labels = load_features(train_path)
if not features:
    # Fail with a clear message instead of an IndexError on features[0].
    raise ValueError(f"No training rows found in {train_path}")
num_features = len(features[0])

class Node:
    """One node of the decision tree.

    A leaf has ``feature`` set to None and carries its prediction in
    ``label``. An internal node stores the feature position and threshold
    it tests, plus its ``left`` and ``right`` children.
    """

    __slots__ = ('feature', 'threshold', 'left', 'right', 'label')

    def __init__(self, feature=None, threshold=None, left=None, right=None, label=None):
        # Split description (both None on a leaf).
        self.feature, self.threshold = feature, threshold
        # Child subtrees (both None on a leaf).
        self.left, self.right = left, right
        # Predicted class (None on an internal node).
        self.label = label

def gini(counts):
    """Return the Gini impurity of a node given its per-class sample counts.

    An empty node (all counts zero) is defined to have impurity 0.0.
    """
    n_samples = sum(counts)
    if not n_samples:
        return 0.0
    remaining = 1.0
    # Subtract each squared class share in turn: 1 - sum(p_k ** 2).
    for share in (class_count / n_samples for class_count in counts):
        remaining -= share * share
    return remaining

def build_tree(indices, depth, max_depth, min_samples):
    """Recursively grow a binary classification tree using Gini impurity.

    Reads the module-level ``features``, ``labels`` and ``num_features``
    instead of taking the data as arguments.

    Args:
        indices: Row positions (into ``features``/``labels``) reaching this node.
        depth: Depth of this node; recursive calls pass ``depth + 1``.
        max_depth: Once ``depth`` reaches this value the node becomes a leaf.
        min_samples: A node holding this many rows or fewer becomes a leaf,
            and no candidate split may leave fewer than this many rows on
            either side.

    Returns:
        A ``Node``: either a leaf carrying the majority class index, or an
        internal node with ``feature``/``threshold`` and two children.

    Labels must be the integers 0, 1 or 2 because they index the three-slot
    count lists used below.
    """
    # Class histogram of the rows at this node.
    counts = [0, 0, 0]
    for idx in indices:
        counts[labels[idx]] += 1
    # max() keeps the first maximum, so ties go to the lowest class index.
    majority = max(range(3), key=lambda c: counts[c])
    # Stop on depth limit, too few rows, or a pure node.
    if depth >= max_depth or len(indices) <= min_samples or max(counts) == len(indices):
        return Node(label=majority)
    current = gini(counts)
    # Starting from -inf means the first admissible split is always taken,
    # even if its gain is zero.
    best_gain = -inf
    best_feature = None
    best_threshold = None
    total = len(indices)
    for feature_idx in range(num_features):
        # Sweep the rows in ascending feature order, moving one row at a time
        # from the right partition to the left.
        sorted_idx = sorted(indices, key=lambda i: features[i][feature_idx])
        left_counts = [0, 0, 0]
        right_counts = counts.copy()
        for i in range(len(sorted_idx) - 1):
            idx = sorted_idx[i]
            lbl = labels[idx]
            # The counts are updated before the tie check so they stay in
            # sync with the sweep position even when no split is scored here.
            left_counts[lbl] += 1
            right_counts[lbl] -= 1
            value = features[idx][feature_idx]
            next_value = features[sorted_idx[i + 1]][feature_idx]
            # Equal neighbours cannot be separated by a threshold.
            if value == next_value:
                continue
            # NOTE(review): for two adjacent floats the midpoint can round to
            # next_value, in which case the `<=` partition further down would
            # not match the counts scored here - confirm whether that matters.
            threshold = (value + next_value) / 2
            left_total = i + 1
            right_total = total - left_total
            if left_total < min_samples or right_total < min_samples:
                continue
            # Size-weighted impurity of the two children.
            impurity = (left_total / total) * gini(left_counts) + (right_total / total) * gini(right_counts)
            gain = current - impurity
            if gain > best_gain:
                best_gain = gain
                best_feature = feature_idx
                best_threshold = threshold
    # No admissible split was found on any feature.
    if best_feature is None:
        return Node(label=majority)
    left_indices = [i for i in indices if features[i][best_feature] <= best_threshold]
    right_indices = [i for i in indices if features[i][best_feature] > best_threshold]
    left_node = build_tree(left_indices, depth + 1, max_depth, min_samples)
    right_node = build_tree(right_indices, depth + 1, max_depth, min_samples)
    return Node(feature=best_feature, threshold=best_threshold, left=left_node, right=right_node)

def predict(node, sample):
    """Route ``sample`` from ``node`` down to a leaf and return that leaf's label.

    At each internal node the sample goes left when its value for the tested
    feature is less than or equal to the node's threshold, otherwise right.
    """
    current = node
    while current.feature is not None:
        goes_left = sample[current.feature] <= current.threshold
        current = current.left if goes_left else current.right
    return current.label

# Stratified 5-fold evaluation
N_FOLDS = 5
CV_SEED = 42
# A dedicated, seeded generator makes the fold assignment reproducible on
# every Restart & Run All without touching the global random state.
fold_rng = random.Random(CV_SEED)

indices = list(range(len(features)))
per_class = defaultdict(list)
for idx, label in enumerate(labels):
    per_class[label].append(idx)
# Deal each class's shuffled rows round-robin so every fold gets a similar
# class mix.
folds = [[] for _ in range(N_FOLDS)]
for arr in per_class.values():
    fold_rng.shuffle(arr)
    for i, idx in enumerate(arr):
        folds[i % N_FOLDS].append(idx)

# NOTE: about 94% of the training rows are class 'None' (see the class
# distribution printed earlier), so plain accuracy is dominated by the
# majority class; read the scores below against that baseline.
cv_scores = []
for fold_idx, val_idx in enumerate(folds):
    # Set membership keeps the train/validation split linear in the row count.
    held_out = set(val_idx)
    train_idx = [i for i in indices if i not in held_out]
    tree = build_tree(train_idx, 0, 3, 5)
    correct = 0
    for idx in val_idx:
        if predict(tree, features[idx]) == labels[idx]:
            correct += 1
    acc = correct / len(val_idx)
    cv_scores.append(acc)
    print(f"Fold {fold_idx+1} accuracy: {acc:.4f}")
print('Average accuracy:', sum(cv_scores)/len(cv_scores))


Fold 1 accuracy: 0.9406
Fold 2 accuracy: 0.9354
Fold 3 accuracy: 0.9406
Fold 4 accuracy: 0.9404
Fold 5 accuracy: 0.9455
Average accuracy: 0.9404813041853648


In [5]:
# Train final tree on full data
full_indices = [*range(len(features))]
final_tree = build_tree(full_indices, 0, 3, 5)
# Each comparison is a bool, so summing them counts the rows the tree gets right.
train_correct = sum(predict(final_tree, features[row]) == labels[row] for row in full_indices)
print('Training accuracy on full data:', train_correct / len(features))

# Generate predictions for test set
# Class index -> submission string. `label_names` was never defined anywhere
# in the notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): assumes training labels are encoded 0='None', 1='H', 2='L' -
# confirm against the encoding produced by load_features.
label_names = ['None', 'H', 'L']
feature_cols = ['momentum', 'sm_momentum', 'ratio', 'sm_ratio']
pred_labels = []
pred_ids = []
# newline='' lets the csv module do its own newline handling.
with open(test_path, newline='') as f:
    reader = csv.reader(f)
    header = next(reader)
    idx = {col: header.index(col) for col in feature_cols}
    # Fall back to the first column when the file has no explicit 'id' header.
    id_idx = header.index('id') if 'id' in header else 0
    for row in reader:
        sample = [float(row[idx[col]]) for col in feature_cols]
        label_idx = predict(final_tree, sample)
        pred_labels.append(label_names[label_idx])
        pred_ids.append(row[id_idx])

# Write submission
with open('submission_tree.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'class_label'])
    # One (id, predicted label) record per test row, in file order.
    writer.writerows(zip(pred_ids, pred_labels))
print('Saved submission_tree.csv with', len(pred_labels), 'rows')
# Tally how often each class was predicted as a sanity check on the output.
counts = defaultdict(int)
for lbl in pred_labels:
    counts[lbl] = counts[lbl] + 1
print('Submission label distribution:', dict(counts))


Training accuracy on full data: 0.943064182194617
Saved submission_tree.csv with 828 rows
Submission label distribution: {'L': 3, 'None': 825}


In [6]:
# Show the header and the first four prediction rows of the written file.
with open('submission_tree.csv') as f:
    print(*(f.readline().strip() for _ in range(5)), sep='\n')


id,class_label
0,None
1,None
2,None
3,None
