In [71]:
# Import Required Libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [72]:
# Load Vodafone Group Dataset
# Adjust the file path if needed
csv_path = '../data/Vodafone_Group.csv'
data = pd.read_csv(csv_path, parse_dates=True)

# If there is a date column, parse it as datetime and set as index if appropriate
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'])
    data.set_index('Date', inplace=True)

data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-08-27 00:00:00+01:00,114.07917,114.178801,112.61258,113.262184,36763655,0.0,0.0
2020-08-28 00:00:00+01:00,114.178816,114.577347,111.030434,111.030434,53187807,0.0,0.0
2020-09-01 00:00:00+01:00,110.83115,111.149974,105.960119,106.766151,103105885,0.0,0.0
2020-09-02 00:00:00+01:00,107.98166,109.217108,106.626657,108.141075,58794875,0.0,0.0
2020-09-03 00:00:00+01:00,108.459901,110.07395,107.124822,107.304161,60581907,0.0,0.0


In [73]:
# Define Technical Indicator Functions

def safe_series_operation(data):
    if isinstance(data, pd.Series):
        return data, True
    elif isinstance(data, pd.DataFrame):
        if data.shape[1] == 1:
            return data.iloc[:, 0], True
        else:
            return data.iloc[:, 0], True
    elif isinstance(data, np.ndarray):
        if data.ndim == 1:
            return pd.Series(data), True
        elif data.ndim == 2 and data.shape[1] == 1:
            return pd.Series(data.flatten()), True
        else:
            return pd.Series(data.flatten()), True
    else:
        return pd.Series(data), True

def safe_calculate_sma(data, window):
    series, success = safe_series_operation(data)
    if success and len(series) > window:
        return series.rolling(window=window, min_periods=1).mean(), True
    else:
        return pd.Series(index=data.index if hasattr(data, 'index') else range(len(data))), False

def safe_calculate_rsi(data, window=14):
    series, success = safe_series_operation(data)
    if not success or len(series) < window * 2:
        return pd.Series(index=data.index if hasattr(data, 'index') else range(len(data))), False
    delta = series.diff()
    gain = delta.where(delta > 0, 0).rolling(window=window, min_periods=1).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window, min_periods=1).mean()
    loss_safe = loss.replace(0, 0.000001)
    rs = gain / loss_safe
    rsi = 100 - (100 / (1 + rs))
    rsi = rsi.fillna(50).replace([np.inf, -np.inf], 50)
    return rsi, True

def safe_calculate_macd(data, fast=12, slow=26, signal=9):
    series, success = safe_series_operation(data)
    if not success or len(series) < slow * 2:
        empty_series = pd.Series(index=data.index if hasattr(data, 'index') else range(len(data)))
        return empty_series, empty_series, empty_series, False
    exp1 = series.ewm(span=fast, adjust=False).mean()
    exp2 = series.ewm(span=slow, adjust=False).mean()
    macd_line = exp1 - exp2
    signal_line = macd_line.ewm(span=signal, adjust=False).mean()
    histogram = macd_line - signal_line
    macd_line = macd_line.fillna(0)
    signal_line = signal_line.fillna(0)
    histogram = histogram.fillna(0)
    return macd_line, signal_line, histogram, True

def safe_calculate_adx(high, low, close, window=14):
    high_series, high_ok = safe_series_operation(high)
    low_series, low_ok = safe_series_operation(low)
    close_series, close_ok = safe_series_operation(close)
    if not (high_ok and low_ok and close_ok) or len(close_series) < window * 3:
        empty_series = pd.Series(index=close.index if hasattr(close, 'index') else range(len(close)))
        return empty_series, empty_series, empty_series, False
    tr1 = high_series - low_series
    tr2 = abs(high_series - close_series.shift(1))
    tr3 = abs(low_series - close_series.shift(1))
    tr = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1, skipna=True)
    high_diff = high_series.diff()
    low_diff = low_series.shift(1) - low_series
    dm_plus = pd.Series(np.where((high_diff > low_diff) & (high_diff > 0), high_diff, 0), index=high_series.index)
    dm_minus = pd.Series(np.where((low_diff > high_diff) & (low_diff > 0), low_diff, 0), index=low_series.index)
    atr = tr.rolling(window=window, min_periods=1).mean()
    atr_safe = atr.replace(0, 0.000001)
    di_plus = 100 * (dm_plus.rolling(window=window, min_periods=1).mean() / atr_safe)
    di_minus = 100 * (dm_minus.rolling(window=window, min_periods=1).mean() / atr_safe)
    di_sum = di_plus + di_minus
    di_sum_safe = di_sum.replace(0, 0.000001)
    dx = 100 * abs(di_plus - di_minus) / di_sum_safe
    adx = dx.rolling(window=window, min_periods=1).mean()
    adx = adx.fillna(25).replace([np.inf, -np.inf], 25)
    di_plus = di_plus.fillna(25).replace([np.inf, -np.inf], 25)
    di_minus = di_minus.fillna(25).replace([np.inf, -np.inf], 25)
    return adx, di_plus, di_minus, True

def safe_calculate_obv(close, volume):
    close_series, close_ok = safe_series_operation(close)
    volume_series, volume_ok = safe_series_operation(volume)
    if not (close_ok and volume_ok) or len(close_series) < 2:
        return pd.Series(index=close.index if hasattr(close, 'index') else range(len(close))), False
    price_change = close_series.diff()
    obv_change = pd.Series(np.where(price_change > 0, volume_series,
                                   np.where(price_change < 0, -volume_series, 0)),
                          index=close_series.index)
    obv = obv_change.cumsum()
    obv = obv.fillna(0)
    return obv, True

In [74]:
# Calculate SMA Crossover
sma_50, sma_50_success = safe_calculate_sma(data['Close'], 50)
sma_200, sma_200_success = safe_calculate_sma(data['Close'], 200)

data['sma_50'] = sma_50
data['sma_200'] = sma_200

data['sma_crossover'] = data['sma_50'] - data['sma_200']
data['sma_crossover'] = data['sma_crossover'].fillna(0)

In [75]:
# Calculate Price/SMA Ratio
sma_200_safe = data['sma_200'].replace(0, np.nan)
data['price_sma_ratio'] = data['Close'] / sma_200_safe
data['price_sma_ratio'] = data['price_sma_ratio'].fillna(1.0)

In [76]:
# Calculate RSI
rsi, rsi_success = safe_calculate_rsi(data['Close'])
data['rsi'] = rsi

In [77]:
# Calculate MACD and MACD Histogram
macd, macd_signal, macd_hist, macd_success = safe_calculate_macd(data['Close'])
data['macd'] = macd
data['macd_hist'] = macd_hist

In [78]:
# Calculate ADX
adx, adx_pos, adx_neg, adx_success = safe_calculate_adx(data['High'], data['Low'], data['Close'])
data['adx'] = adx

In [79]:
# Calculate OBV
obv, obv_success = safe_calculate_obv(data['Close'], data['Volume'])
data['obv'] = obv

In [80]:
# Combine Features and Save Prepared Dataset
features = ['sma_crossover', 'price_sma_ratio', 'rsi', 'macd', 'macd_hist', 'adx', 'obv']

# Fill any remaining NaN values
for col in features:
    if col in data.columns:
        data[col] = data[col].fillna(0)

# Save the prepared dataset
prepared_csv_path = '../data/Vodafone_Group_prepared.csv'
data.to_csv(prepared_csv_path)
print(f"Prepared dataset saved to {prepared_csv_path}")
data[features].head()

Prepared dataset saved to ../data/Vodafone_Group_prepared.csv


Unnamed: 0_level_0,sma_crossover,price_sma_ratio,rsi,macd,macd_hist,adx,obv
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-08-27 00:00:00+01:00,0.0,1.0,0.0,0.0,0.0,0.0,0
2020-08-28 00:00:00+01:00,0.0,0.99005,0.0,-0.178031,-0.142425,50.0,-53187807
2020-09-01 00:00:00+01:00,0.0,0.967497,0.0,-0.655656,-0.49604,66.666667,-156293692
2020-09-02 00:00:00+01:00,0.0,0.984892,17.468318,-0.912712,-0.602476,75.0,-97498817
2020-09-03 00:00:00+01:00,0.0,0.981733,15.789437,-1.170469,-0.688187,75.435835,-158080724


In [84]:
import os

data_folder = '../data'
exclude_files = ['Airtel_Africa.csv', 'dataset.csv']

for fname in os.listdir(data_folder):
    if fname.endswith('.csv') and fname not in exclude_files:
        fpath = os.path.join(data_folder, fname)
        df = pd.read_csv(fpath)
        # Drop columns if they exist
        cols_to_drop = [col for col in ['Dividends', 'Stock Splits'] if col in df.columns]
        if cols_to_drop:
            df = df.drop(columns=cols_to_drop)
            try:
                df.to_csv(fpath, index=False)
            except PermissionError:
                print(f"Permission denied: Could not write to {fpath}. Is the file open elsewhere?")
            except Exception as e:
                print(f"Error writing to {fpath}: {e}")


In [86]:
import os

# Remove records with any NaN or 0 in any column
cleaned_data = data.dropna()
cleaned_data = cleaned_data[(cleaned_data != 0).all(axis=1)]
data_folder = '../data'
exclude_files = ['dataset.csv']

for fname in os.listdir(data_folder):
    if fname.endswith('.csv') and fname not in exclude_files:
        fpath = os.path.join(data_folder, fname)
        df = pd.read_csv(fpath)
        # Remove records with any NaN or 0 in any column
        cleaned_df = df.dropna()
        cleaned_df = cleaned_df[(cleaned_df != 0).all(axis=1)]
        # Save cleaned file (overwrite original)
        try:
            cleaned_df.to_csv(fpath, index=False)
        except PermissionError:
            print(f"Permission denied: Could not write to {fpath}. Is the file open elsewhere?")
        except Exception as e:
            print(f"Error writing to {fpath}: {e}")