<a href="https://colab.research.google.com/github/itsyashkhurana/Big-Data-Analystics/blob/main/exp_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import kagglehub

# Download the dataset; kagglehub returns the local directory that holds the files
path = kagglehub.dataset_download("borismarjanovic/price-volume-data-for-all-us-stocks-etfs")
print("Path to dataset files:", path)

# Specify the dataset path (the .txt files are read from <path>/Data/Stocks)
data_directory = os.path.join(path, "Data", "Stocks")


# List all .txt files in the directory.
# os.listdir() yields names in arbitrary order, so sort them: any subset taken
# by slicing this list is then the same on every machine and every run.
txt_files = sorted(name for name in os.listdir(data_directory) if name.endswith('.txt'))
print(f"Number of .txt files found: {len(txt_files)}")
print("Sample files:", txt_files[:5])

# Collect one DataFrame per file and concatenate once at the end; calling
# pd.concat inside the loop re-copies every previously loaded row each time.
frames = []

# Loop through and read .txt files
for file in txt_files[:100]:  # Adjust range as needed
    file_path = os.path.join(data_directory, file)
    try:
        frames.append(pd.read_csv(file_path, delimiter=','))  # Update delimiter if necessary
    except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        # Best-effort load: report files that are empty, unreadable or malformed
        # and carry on with the rest.
        print(f"Error reading {file}: {e}")

# pd.concat raises on an empty list, so fall back to an empty frame when
# nothing could be loaded.
combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# === Volume (row/column counts and in-memory footprint) ===
print("=== Volume ===")
print("Combined Data Shape:", combined_data.shape)
# Sum the per-column byte counts (deep=True) and convert bytes to MB
bytes_in_memory = combined_data.memory_usage(deep=True).sum()
size_mb = bytes_in_memory / (1024**2)
print(f"Total size of the dataset: {size_mb:.2f} MB")

# === Variety (Data types and unique values) ===
print("=== Variety ===")
print("Data Types:\n", combined_data.dtypes)

# Report how many distinct values each object-dtype column holds
# (the loop simply does nothing when there are no such columns)
categorical_columns = combined_data.select_dtypes(include='object').columns
for name in categorical_columns:
    distinct_count = combined_data[name].nunique()
    print(f"Unique values in '{name}': {distinct_count}")

# Clean up 'Date' column and analyze time velocity
combined_data['Date'] = pd.to_datetime(combined_data['Date'], errors='coerce')  # Unparseable dates become NaT
combined_data = combined_data.sort_values('Date')

# === Velocity (Average time between entries) ===
# combined_data pools rows from many files, so the same date appears many
# times. Differencing the raw sorted column counts each repeat as a zero gap
# and the mean collapses to (max - min) / (rows - 1), which says nothing about
# how often entries arrive. Measure the gap between distinct dates instead.
distinct_dates = combined_data['Date'].dropna().drop_duplicates()
velocity = distinct_dates.diff().mean()
print("=== Velocity (Average Time Between Entries) ===")
print("Average Time Between Entries (Velocity):", velocity)
print(f"Time range of the dataset: From {combined_data['Date'].min()} to {combined_data['Date'].max()}")

# === Veracity (Missing values) ===
print("=== Veracity (Missing Values) ===")
missing_per_column = combined_data.isna().sum()
print("Missing Values:\n", missing_per_column)

# Count outliers in each float64/int64 column using the 1.5 * IQR fences
numerical_columns = combined_data.select_dtypes(include=['float64', 'int64']).columns
for name in numerical_columns:
    series = combined_data[name]
    lower_quartile = series.quantile(0.25)
    upper_quartile = series.quantile(0.75)
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread
    # A value is an outlier when it lies strictly outside either fence
    outlier_count = (series.lt(low_fence) | series.gt(high_fence)).sum()
    print(f"Number of outliers in '{name}': {outlier_count}")

# === Value (Descriptive statistics and correlations) ===
print("=== Value (Descriptive Statistics) ===")
summary = combined_data.describe()
print("Descriptive Statistics:\n", summary)

# Pairwise correlations between the columns listed in numerical_columns
numeric_frame = combined_data[numerical_columns]
correlation_matrix = numeric_frame.corr()
print("Correlation Matrix:\n", correlation_matrix)


Path to dataset files: /root/.cache/kagglehub/datasets/borismarjanovic/price-volume-data-for-all-us-stocks-etfs/versions/3
Number of .txt files found: 7195
Sample files: ['tile.us.txt', 'ggn_b.us.txt', 'cwco.us.txt', 'vrna.us.txt', 'cwai.us.txt']
Error reading bxg.us.txt: No columns to parse from file
=== Volume ===
Combined Data Shape: (199348, 7)
Total size of the dataset: 21.86 MB
=== Variety ===
Data Types:
 Date        object
Open       float64
High       float64
Low        float64
Close      float64
Volume       int64
OpenInt      int64
dtype: object
Unique values in 'Date': 8366
=== Velocity (Average Time Between Entries) ===
Average Time Between Entries (Velocity): 0 days 01:27:31.690770365
Time range of the dataset: From 1984-09-07 00:00:00 to 2017-11-10 00:00:00
=== Veracity (Missing Values) ===
Missing Values:
 Date       0
Open       0
High       0
Low        0
Close      0
Volume     0
OpenInt    0
dtype: int64
Number of outliers in 'Open': 14162
Number of outliers in 'Hig