# Real Time Crypto Data Processing
## Step 1: Load Dataset Safely (Skip Corrupted Rows)

In [1]:
import pandas as pd

df = pd.read_csv(
    "C:/Users/h11ba/Realtime_Crypto_project/data/raw/crypto_realtime_data.csv",
    on_bad_lines="skip"
)

print(df.shape)
df.head()


(11518, 7)


Unnamed: 0,Time,Coin,Price,Best_Bid,Best_Ask,Last_Size,Side
0,2026-01-24T15:26:50.379702Z,SOL-USD,127.05,127.04,127.05,0.07792208,buy
1,2026-01-24T15:26:50.388615Z,ETH-USD,2960.57,2960.56,2960.57,0.00334395,buy
2,2026-01-24T15:26:51.099457Z,BTC-USD,89285.78,89285.77,89285.78,1.663e-05,buy
3,2026-01-24T15:26:51.213773Z,BTC-USD,89285.78,89285.77,89285.78,1e-08,buy
4,2026-01-24T15:26:51.250405Z,BTC-USD,89285.78,89285.77,89285.78,0.00028309,buy


## STEP 2: Remove Duplicates and Empty Rows

In [2]:
# Remove duplicate records
df.drop_duplicates(inplace=True)

# Remove rows with missing values
df.dropna(inplace=True)


In [3]:
print("Remaining rows:", df.shape[0])
print("Null values per column:\n", df.isnull().sum())


Remaining rows: 11493
Null values per column:
 Time         0
Coin         0
Price        0
Best_Bid     0
Best_Ask     0
Last_Size    0
Side         0
dtype: int64


## STEP 3: Convert Time Column to DateTime

In [4]:
# Convert Time column safely
df["Time"] = pd.to_datetime(df["Time"], errors="coerce")

# Remove rows where Time conversion failed (corrupted timestamps)
df.dropna(subset=["Time"], inplace=True)

# Sort data by correct time order
df = df.sort_values("Time")


In [5]:
print("Time datatype:", df["Time"].dtype)
print("Any invalid Time left:", df["Time"].isnull().sum())
df["Time"].head()


Time datatype: datetime64[ns, UTC]
Any invalid Time left: 0


0   2026-01-24 15:26:50.379702+00:00
1   2026-01-24 15:26:50.388615+00:00
2   2026-01-24 15:26:51.099457+00:00
3   2026-01-24 15:26:51.213773+00:00
4   2026-01-24 15:26:51.250405+00:00
Name: Time, dtype: datetime64[ns, UTC]

## STEP 4: Convert Numeric Columns to Float

In [6]:
# Ensure numeric columns are in float type for ML
num_cols = ["Price", "Best_Bid", "Best_Ask", "Last_Size"]
df[num_cols] = df[num_cols].astype(float)


In [7]:
print(df[["Price","Best_Bid","Best_Ask","Last_Size"]].dtypes)


Price        float64
Best_Bid     float64
Best_Ask     float64
Last_Size    float64
dtype: object


## STEP 5: Encode Buy/Sell Column

In [8]:
# Convert Side column: buy → 1 , sell → 0
df["Side"] = df["Side"].map({"buy":1, "sell":0})


In [9]:
print("Side value counts:\n", df["Side"].value_counts())


Side value counts:
 Side
1    7288
0    4204
Name: count, dtype: int64


## STEP 6: Create Price Change Feature

In [10]:
# Difference between current price and previous price
df["Price_Change"] = df["Price"].diff()

# Remove first row created by diff()
df.dropna(inplace=True)


In [11]:
print(df[["Price","Price_Change"]].head())
print("Null values:", df["Price_Change"].isnull().sum())


      Price  Price_Change
1   2960.57       2833.52
2  89285.78      86325.21
3  89285.78          0.00
4  89285.78          0.00
5  89285.78          0.00
Null values: 0


## STEP 7: Create Lag Feature (Previous Price)

In [12]:
# Store previous price as a feature
df["Price_Lag1"] = df["Price"].shift(1)

# Remove NaN created by shift
df.dropna(inplace=True)


In [13]:
print(df[["Price","Price_Lag1"]].head())
print("Null values:", df["Price_Lag1"].isnull().sum())


      Price  Price_Lag1
2  89285.78     2960.57
3  89285.78    89285.78
4  89285.78    89285.78
5  89285.78    89285.78
6  89285.78    89285.78
Null values: 0


## STEP 8: Create Rolling Mean Feature

In [14]:
# Rolling average of last 10 prices to smooth short-term noise
df["Rolling_Mean_10"] = df["Price"].rolling(window=10).mean()

# Remove NaN created by rolling window
df.dropna(inplace=True)


In [15]:
print(df[["Price","Rolling_Mean_10"]].head())
print("Null values:", df["Rolling_Mean_10"].isnull().sum())


       Price  Rolling_Mean_10
11   2960.57        63104.865
12  89285.78        63104.865
13   2960.57        54472.344
14  89285.78        54472.344
15  89285.78        54472.344
Null values: 0


## STEP 9: Save Final Preprocessed Dataset

In [16]:
# Save clean ML-ready dataset
df.to_csv("processed_crypto_data.csv", index=False)

print("Preprocessing completed. File saved as processed_crypto_data.csv")


Preprocessing completed. File saved as processed_crypto_data.csv


## STEP 10: Remove Outliers (Abnormal Price Spikes)

In [17]:
# Remove extreme outliers in Price using IQR
#Crypto feeds sometimes log abnormal spikes.
#We remove them using IQR filtering.
Q1 = df["Price"].quantile(0.25)
Q3 = df["Price"].quantile(0.75)
IQR = Q3 - Q1

df = df[(df["Price"] >= Q1 - 1.5*IQR) & (df["Price"] <= Q3 + 1.5*IQR)]

print("After outlier removal:", df.shape)
#Prevents ML model from learning noise

After outlier removal: (11481, 10)


## STEP 11: Feature: Bid-Ask Spread

In [18]:
# Difference between ask and bid price
df["Spread"] = df["Best_Ask"] - df["Best_Bid"]

df.head()[["Best_Bid","Best_Ask","Spread"]]


Unnamed: 0,Best_Bid,Best_Ask,Spread
11,2960.56,2960.57,0.01
12,89285.77,89285.78,0.01
13,2960.56,2960.57,0.01
14,89285.77,89285.78,0.01
15,89285.77,89285.78,0.01


## STEP 12: Feature: Rolling Volatility

In [19]:
# Rolling standard deviation of price (volatility)
df["Rolling_Volatility"] = df["Price"].rolling(window=10).std()
df.dropna(inplace=True)


## STEP 13: Normalize Numeric Features

In [20]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scale_cols = ["Price","Best_Bid","Best_Ask","Last_Size",
              "Price_Change","Price_Lag1","Rolling_Mean_10",
              "Spread","Rolling_Volatility"]

df[scale_cols] = scaler.fit_transform(df[scale_cols])


## STEP 14: Save Scaler for Later Use

In [21]:
import joblib
joblib.dump(scaler, "scaler.save")


['scaler.save']

# ✅ Final Advanced Feature Set
## Save Final DataSet

In [22]:
df.to_csv("processed_crypto_data_final.csv", index=False)
print("Final preprocessing completed.")


Final preprocessing completed.


In [23]:
import os

# Get absolute path of project root (one level above notebooks)
PROJECT_ROOT = os.path.dirname(os.getcwd())

# Define outputs folder at project root
OUTPUT_PATH = os.path.join(PROJECT_ROOT, "outputs")

# Create folder if it does not exist
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Save dataset
final_path = os.path.join(OUTPUT_PATH, "crypto_realtime_preprocessed.csv")

df.to_csv(final_path, index=False)

print("Final preprocessed dataset saved at:", final_path)



Final preprocessed dataset saved at: c:\Users\h11ba\Realtime_Crypto_project\outputs\crypto_realtime_preprocessed.csv


In [24]:
print("Notebook working directory:", os.getcwd())
print("Project root:", PROJECT_ROOT)
print("Outputs folder exists:", os.path.exists(OUTPUT_PATH))


Notebook working directory: c:\Users\h11ba\Realtime_Crypto_project\notebooks
Project root: c:\Users\h11ba\Realtime_Crypto_project
Outputs folder exists: True
