# python-setup-and-basics
Purpose: review Python fundamentals and set up initial data loading and analysis — CSV I/O with pandas, a reusable CSV loader, and mean/variance computed in pure Python versus NumPy.

In [None]:
# Check Python Version
# Ask the interpreter this kernel is actually running on. `!python --version`
# reports whichever `python` is first on the shell PATH, which can be a
# different installation from the one executing this notebook.
import platform

print(f"Python {platform.python_version()}")

In [None]:
import pandas as pd

# Column-oriented demo records: three rows, four columns
data = {
    "date_col": ["2025-10-01", "2025-10-02", "2025-10-03"],
    "colA": [10, 20, 30],
    "colB": [1.5, 2.5, 3.5],
    "colC": ["x", "y", "z"],
}
df = pd.DataFrame(data=data)

# Write the frame next to the notebook; the row index is left out of the file
df.to_csv(path_or_buf="sample_data.csv", index=False)
print("Sample CSV file created successfully!")

df.head()


In [None]:
# Fast I/O demo
# Reader options gathered in one place: load only date_col, colA and colB,
# read colA as int32, and parse date_col into datetimes while reading.
read_options = {
    "usecols": ["date_col", "colA", "colB"],
    "dtype": {"colA": "int32"},
    "parse_dates": ["date_col"],
    "low_memory": False,
}
df_fast = pd.read_csv("sample_data.csv", **read_options)

# Shape first, then the per-column dtypes, each on its own line
print(df_fast.shape, df_fast.dtypes, sep="\n")
df_fast.head()


In [None]:
# Baseline: time a plain read of the whole file using pandas' default options.
%timeit pd.read_csv("sample_data.csv")
# Same file with the options from the fast I/O demo: a column subset, an explicit
# int32 dtype for colA, date parsing for date_col, and low_memory=False.
# NOTE(review): the sample file has only three rows, so these timings probably
# reflect fixed per-call overhead more than parsing cost - confirm on a larger file.
%timeit pd.read_csv("sample_data.csv", usecols=["date_col","colA","colB"], dtype={"colA":"int32"}, parse_dates=["date_col"], low_memory=False)

In [None]:
from typing import Optional

import pandas as pd

def load_csv(path: str) -> Optional[pd.DataFrame]:
    """
    Load a CSV file into a cleaned pandas DataFrame.

    Steps:
    1. Reads CSV from the given path.
    2. Strips whitespace from column names.
    3. Drops duplicate rows and renumbers the index from 0.
    4. Prints shape and column types.
    5. Returns the cleaned DataFrame.

    Args:
        path: Location of the CSV file, passed unchanged to ``pd.read_csv``.

    Returns:
        The cleaned DataFrame, or ``None`` when no file exists at ``path``.
        In that case a message is printed instead of an exception being
        raised, so callers must check for ``None`` before using the result.

    Raises:
        Only ``FileNotFoundError`` is handled here; any other error raised by
        ``pd.read_csv`` propagates to the caller.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        # Deliberate best-effort: report the problem and signal it with None
        print(f"❌ File not found at path: {path}")
        return None

    # Clean-up operations
    df.columns = df.columns.str.strip()  # remove whitespace in headers
    # Renumber after dropping duplicates so the index has no gaps
    df = df.drop_duplicates().reset_index(drop=True)

    print(f"✅ Loaded '{path}' successfully!")
    print(f"Shape: {df.shape}")
    print("Column types:\n", df.dtypes)
    return df


In [None]:
# Load the CSV that this notebook wrote earlier (it is saved in the working
# directory as "sample_data.csv", not under ../data).
df_test = load_csv("sample_data.csv")

# load_csv returns None when the file is missing; fail with a clear message
# here rather than with an AttributeError on .head().
assert df_test is not None, "sample_data.csv not found - run the cell that creates it first"
df_test.head()

In [None]:
# ---- Pure Python implementation ----
def mean_py(data):
    """Return the arithmetic mean of ``data`` using only built-ins.

    ``data`` must support both ``sum()`` and ``len()``; an empty collection
    raises ``ZeroDivisionError``.
    """
    total = sum(data)
    count = len(data)
    return total / count

def variance_py(data, ddof=0):
    """Compute variance using pure Python.

    Args:
        data: Sized collection of numbers. It is traversed more than once
            (for the mean and for the squared deviations), so a one-shot
            iterator is not suitable.
        ddof: Delta degrees of freedom. The sum of squared deviations is
            divided by ``len(data) - ddof``. The default of 0 gives the
            population variance (the original behaviour of this function);
            1 gives the sample variance.

    Returns:
        The variance of ``data``.

    Raises:
        ZeroDivisionError: If ``data`` is empty (raised while computing the mean).
        ValueError: If ``len(data) - ddof`` is not positive.
    """
    m = mean_py(data)
    divisor = len(data) - ddof
    # Only reachable with a non-default ddof: non-empty data and ddof=0 always
    # give a positive divisor.
    if divisor <= 0:
        raise ValueError(
            f"ddof={ddof} leaves no degrees of freedom for {len(data)} value(s)"
        )
    return sum((x - m) ** 2 for x in data) / divisor

# Quick check on a small list whose answers are easy to verify by hand
sample_data = [10, 20, 30, 40, 50]
for label, value in (
    ("Data:", sample_data),
    ("Mean (Python):", mean_py(sample_data)),
    ("Variance (Python):", variance_py(sample_data)),
):
    print(label, value)


In [None]:
import numpy as np

# ---- NumPy implementation ----
# Same sample values, held as a float64 array
arr = np.array(sample_data, dtype=np.float64)
mean_np, var_np = np.mean(arr), np.var(arr)

for label, value in (("Mean (NumPy):", mean_np), ("Variance (NumPy):", var_np)):
    print(label, value)

In [None]:
# Generate large data for timing test
# One million integers (0 .. 999_999), held both as a Python list and as a
# float64 NumPy array. Both are built here, outside the timed statements, so
# construction cost is not part of either measurement.
big_list = list(range(1_000_000))
big_array = np.array(big_list, dtype=np.float64)

# Compare speed using Jupyter's %timeit magic
# Each %timeit line times the mean and the variance together as one statement.
print("⏱️ Pure Python:")
# Note: variance_py calls mean_py internally, so the mean is computed twice here.
%timeit mean_py(big_list); variance_py(big_list)

print("\n⚡ NumPy:")
%timeit np.mean(big_array); np.var(big_array)
