# Read and Write Files using Pandas

## Read csv files

- Relative paths are partial and depend on the current working directory.
    - A single dot (./ or just the filename) refers to the current working directory.
    - Two dots (../) move one level up in the directory hierarchy.
    - Directory names separated by forward slashes (/) navigate into subfolders
    - Depending on your IDE, you might need some extra configuration before relative paths work. For example, in PyCharm you need to mark the current working directory as _Sources Root_.
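- Absolute paths are complete and fixed: they start from the root of the file system, so they do not depend on the current working directory.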

In [None]:
import pandas as pd

# Load the whole CSV into a DataFrame. The relative path is resolved against
# the current working directory.
best_selling_books = pd.read_csv(
    filepath_or_buffer='data/data_reading_path/best_selling_books_2023_2025.csv',
)

print("DataFrame loaded from CSV:\n")
# Last expression of the cell, so the notebook renders the first five rows.
best_selling_books.head(n=5)

In [None]:
# A few of the optional parameters read_csv accepts

best_selling_books = pd.read_csv(
    'data/data_reading_path/best_selling_books_2023_2025.csv',
    # nrows: how many data rows to read. Reading only a few rows is a cheap
    # way to inspect a large file before loading all of it.
    nrows=10,
    # usecols: keep only this subset of columns
    usecols=['Book name', 'Author', 'reviews count'],
    # dtype: a single dtype for every column, or a {column: dtype} mapping
    dtype={'Book name': str, 'reviews count': int},
)

print("DataFrame loaded from CSV:\n")
# Last expression of the cell, so the notebook renders the first five rows.
best_selling_books.head(n=5)


## Read Excel files

Before you read an Excel file, make sure the openpyxl package is installed. openpyxl is a Python library for reading and writing Excel 2010 xlsx/xlsm/xltx/xltm files.

In [None]:
import pandas as pd

# sheet_name selects the worksheet to load: an integer position (0, the
# default, is the first sheet), a sheet name, or a list of them (a list
# returns a dict of DataFrames instead of a single DataFrame).
tools = pd.read_excel(
    io='data/data_reading_path/ONET_data_tools_used.xlsx',
    sheet_name=0,
)

print("DataFrame loaded from xlsx files:\n")
# Last expression of the cell, so the notebook renders the first five rows.
tools.head(n=5)


In [None]:
# If you don't know the sheet names and want to inspect them, open the
# workbook with pd.ExcelFile and read its `sheet_names` attribute.
import pandas as pd

# ExcelFile keeps the workbook open, so use it as a context manager: the file
# handle is closed as soon as the block ends, even if an error occurs.
with pd.ExcelFile('data/data_reading_path/ONET_data_tools_used.xlsx') as file_to_check:
    print(file_to_check.sheet_names)

## Read JSON files
JSON (JavaScript Object Notation) is a lightweight, text-based, language-independent data-interchange format designed to be human-readable and machine-parsable.

In [None]:
# Parse the JSON file into a DataFrame

iris = pd.read_json(path_or_buf='data/data_reading_path/iris.json')

print("DataFrame loaded from JSON:\n")
print(iris.head(n=5))

## Pickling
Pickling can be useful to preserve Python objects

### Example 1: pickling of basic Python objects

In [None]:
import pandas as pd

# Build a DataFrame whose cells hold nested Python objects (lists and dicts).
# Data shaped like this is typical of JSON payloads, API responses and other
# semi-structured sources.
# The table is written row by row here: one record (dict) per bird.

df = pd.DataFrame([
    {
        "species": "Sparrow",
        "observed_colors": ["brown", "gray"],             # a LIST in one cell
        "measurements": {"weight_g": 24, "wingspan_cm": 20},   # a DICTIONARY in one cell
    },
    {
        "species": "Robin",
        "observed_colors": ["red", "brown"],
        "measurements": {"weight_g": 77, "wingspan_cm": 31},
    },
    {
        "species": "Blue Jay",
        "observed_colors": ["blue", "white", "black"],
        "measurements": {"weight_g": 100, "wingspan_cm": 43},
    },
])

print("DataFrame with lists and dictionaries in cells:")
print(df)


In [None]:
# save as a pickle file and a csv file
from pathlib import Path

# to_pickle / to_csv do not create missing folders and would fail with an
# OSError, so make sure the output directory exists before writing.
Path('data/data_writing_path').mkdir(parents=True, exist_ok=True)

df.to_pickle('data/data_writing_path/birds_df.pkl')
df.to_csv('data/data_writing_path/birds_df.csv', index=False)  # index=False: do not write the row index as an extra column
print("DataFrame saved to pickle and csv.\n")


In [None]:
# Load both saved copies back so they can be compared side by side

birds_df_csv = pd.read_csv(
    filepath_or_buffer='data/data_writing_path/birds_df.csv',
)
print("DataFrame loaded from csv:")
print(birds_df_csv)

# Only unpickle files you created yourself or otherwise trust.
birds_df_pickled = pd.read_pickle(
    filepath_or_buffer='data/data_writing_path/birds_df.pkl',
)
print("\nDataFrame loaded from pickle:")
print(birds_df_pickled)


In [None]:
# Look up the Sparrow's "observed_colors" cell in each of the three frames and
# report the value together with its Python type.

# 1) The in-memory DataFrame we built ourselves
colors = df.loc[df["species"] == "Sparrow", "observed_colors"].iloc[0]
print("The original dataframe contains list and dictionary columns, for example,\n"
      f"sparrow's observed colors is: {colors}, and the type is {type(colors)}.")

# 2) The copy that went through a CSV file
new_colors = birds_df_csv.loc[birds_df_csv['species'] == 'Sparrow', 'observed_colors'].iloc[0]
print("\nHowever, the dataframe we read back from the CSV file, sparrow's observed color is "
      f"{new_colors}, and the type is {type(new_colors)}.")

# 3) The copy that went through a pickle file
pickle_colors = birds_df_pickled.loc[birds_df_pickled['species'] == 'Sparrow', 'observed_colors'].iloc[0]
print("\nOn the other hand, in the dataframe we read back from the pickle file, it preserves the types. "
      f"The sparrow's colors include {pickle_colors}, and the type is {type(pickle_colors)}.")

In [None]:
# A CSV file stores every cell as text. When a column is supposed to hold Python
# values other than integers, floats or strings (lists, dicts, ...), the text
# can be parsed back with ast.literal_eval. ast stands for Abstract Syntax Tree.
import ast

literal_evaled_df = pd.read_csv(filepath_or_buffer='data/data_writing_path/birds_df.csv')

# Parse each cell's text back into the Python literal it spells, element by element
literal_evaled_df['observed_colors'] = literal_evaled_df['observed_colors'].map(ast.literal_eval)
literal_evaled_df['measurements'] = literal_evaled_df['measurements'].map(ast.literal_eval)


In [None]:
# Show the Python type of the first converted cell
print(type(literal_evaled_df['observed_colors'].iat[0]))

## Read only part of the file
If your data file is large, you can use `nrows` to control how many rows you read as a dataframe

In [None]:
# Read only the first two data rows of the file
chocolate_df = pd.read_csv(
    filepath_or_buffer='data/data_reading_path/chocolate_sales.csv',
    nrows=2,
)

print(chocolate_df)

## Chunking
To read a large file in chunks with pandas, you use the `chunksize` parameter in a file reading function like `pd.read_csv()`. This returns an iterator that yields smaller DataFrame objects (chunks) one by one, allowing you to process the data with limited memory.

For example, you can read the file chunk by chunk and run your operations on one chunk at a time, which uses much less memory.

In [None]:
import pandas as pd

chunk_size = 500

chunks = []

# With `chunksize`, read_csv returns an iterator (a TextFileReader) instead of
# a DataFrame. Using it as a context manager (pandas >= 1.2) guarantees the
# underlying file is closed, even if processing a chunk raises.
with pd.read_csv('data/data_reading_path/chocolate_sales.csv',
                 chunksize=chunk_size) as reader:
    for chunk_number, chunk in enumerate(reader):
        print(f"Chunk {chunk_number} index: {chunk.index.min()} ~ {chunk.index.max()}, shape {chunk.shape}")
        # You can conduct some operations by chunk
        chunks.append(chunk)

# Total number of chunks that were read
counter = len(chunks)

# Stitch the chunks back together. Each chunk keeps its original row labels,
# so the combined index matches the one from reading the file in a single pass.
chocolate_df = pd.concat(chunks)

print("\nFinal dataframe:")
print(chocolate_df.head())

# Inspect Data

In [None]:
import pandas as pd

df = pd.read_csv('data/data_reading_path/chocolate_sales.csv')

print("Preview first 5 rows:")
print(df.head())

print("\nPreview last 5 rows:")
print(df.tail())

print("\nBasic information about the DataFrame:")
# info() writes its summary to stdout itself and returns None, so call it
# directly: wrapping it in print() would add a stray "None" line.
df.info()

print("\nDataFrame shape (rows, columns):")
print(df.shape)

print("\nEstimated memory usage:")
# deep=True also counts the memory held by the objects in object-dtype columns
print(df.memory_usage(deep=True).sum(), "bytes")


# Writing data to files


In [None]:
import pandas as pd

import os

save_path = "data/data_writing_path"

# The DataFrame writers do not create missing folders and fail with an OSError
# if the folder is absent, so make sure the output directory exists up front.
os.makedirs(save_path, exist_ok=True)

# Create a simple DataFrame
df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 32, 29],
    "city": ["New York", "Los Angeles", "Chicago"],
    "salary": [70000, 85000, 78000]
})

print(df)

In [None]:
# Write to CSV (index=False leaves the row index out of the file)
df.to_csv(
    path_or_buf=f"{save_path}/employees.csv",
    index=False,
)


In [None]:
# Write to Excel
# Like reading, writing .xlsx files needs an Excel engine such as openpyxl to
# be installed; it is an optional dependency of pandas.

df.to_excel(
    excel_writer=f"{save_path}/employees.xlsx",
    index=False,
)


In [None]:
# Write to JSON
# orient="records" writes one JSON object per row, i.e. a list of dictionaries
# (a layout that is very common in APIs)
# indent=2 pretty-prints the output so it is human-readable

df.to_json(
    path_or_buf=f"{save_path}/employees.json",
    indent=2,
    orient="records",
)


In [None]:
# Write to Pickle (keeps the Python objects and dtypes exactly as they are)
df.to_pickle(path=f"{save_path}/employees.pkl")


In [None]:
import os

# The same employees table was saved once per format; build the four paths
# from their file extensions.
files = [
    f"{save_path}/employees.{extension}"
    for extension in ("csv", "xlsx", "json", "pkl")
]

# Report how much disk space each format takes
for f in files:
    print(f"{f}: {os.path.getsize(f)} bytes")


# [Optional] Polars

[Polars](https://pola.rs/) is an open-source library for data manipulation, known for being one of the fastest data processing solutions on a single machine.


In [None]:

import pandas as pd
import polars as pl
import numpy as np
import time

# Number of synthetic rows to generate
n = 5_000_000

# Fix the seed so every run produces the same data
np.random.seed(42)

# One NumPy array per column. The arrays are drawn in the order listed, which
# matters for reproducibility because they all share the seeded global generator.
data = dict(
    user_id=np.random.randint(low=1, high=100_000, size=n),
    category=np.random.choice(["A", "B", "C", "D", "E"], size=n),
    price=np.random.gamma(shape=2, scale=50, size=n),
    quantity=np.random.randint(low=1, high=5, size=n),
    discount=np.random.rand(n),
)

# Build the same table in both libraries so the pipelines start from equal inputs
df_pd = pd.DataFrame(data)
df_pl = pl.DataFrame(data)

print("Pandas shape:", df_pd.shape)
print("Polars shape:", df_pl.shape)


In [None]:
# let's do a series of computation:
# derive two columns, aggregate per (user_id, category), then sort by revenue.

# perf_counter() is the clock meant for measuring elapsed time: it is
# monotonic and high-resolution, whereas time.time() follows the wall clock
# and can jump if the system time is adjusted.
start = time.perf_counter()

# Note: these two assignments add the columns to df_pd in place
df_pd["total"] = df_pd["price"] * df_pd["quantity"]
df_pd["revenue"] = df_pd["total"] * (1 - df_pd["discount"])

result_pd = (
    df_pd
    .groupby(["user_id", "category"])
    .agg(
        total_revenue=("revenue", "sum"),
        avg_price=("price", "mean"),
        total_quantity=("quantity", "sum")
    )
    .reset_index()
    .sort_values("total_revenue", ascending=False)
)

end = time.perf_counter()
print("Pandas pipeline time:", end - start)


In [None]:
# Same pipeline as the pandas version, run eagerly in Polars: every step is
# executed immediately on the in-memory DataFrame.

# perf_counter() is the clock meant for measuring elapsed time: it is
# monotonic and high-resolution, whereas time.time() follows the wall clock
# and can jump if the system time is adjusted.
start = time.perf_counter()

result_pl = (
    df_pl
    .with_columns([
        (pl.col("price") * pl.col("quantity")).alias("total"),
        # "revenue" repeats the price * quantity product because a column
        # created in this with_columns call cannot be referenced inside it
        (pl.col("price") * pl.col("quantity") * (1 - pl.col("discount"))).alias("revenue")
    ])
    .group_by(["user_id", "category"])
    .agg([
        pl.col("revenue").sum().alias("total_revenue"),
        pl.col("price").mean().alias("avg_price"),
        pl.col("quantity").sum().alias("total_quantity")
    ])
    .sort("total_revenue", descending=True)
)

end = time.perf_counter()
print("Polars eager pipeline time:", end - start)


In [None]:
# Same pipeline again, but built lazily: .lazy() only records the steps, and
# nothing is computed until .collect() is called at the end of the chain.

# perf_counter() is the clock meant for measuring elapsed time: it is
# monotonic and high-resolution, whereas time.time() follows the wall clock
# and can jump if the system time is adjusted.
start = time.perf_counter()

result_lazy = (
    df_pl.lazy()
    .with_columns([
        (pl.col("price") * pl.col("quantity")).alias("total"),
        # "revenue" repeats the price * quantity product because a column
        # created in this with_columns call cannot be referenced inside it
        (pl.col("price") * pl.col("quantity") * (1 - pl.col("discount"))).alias("revenue")
    ])
    .group_by(["user_id", "category"])
    .agg([
        pl.col("revenue").sum().alias("total_revenue"),
        pl.col("price").mean().alias("avg_price"),
        pl.col("quantity").sum().alias("total_quantity")
    ])
    .sort("total_revenue", descending=True)
    .collect()
)

end = time.perf_counter()
print("Polars lazy pipeline time:", end - start)


# [optional] tqdm - progress bar

In [None]:
import time

from tqdm import tqdm

# Wrapping any iterable in tqdm() shows a progress bar while the loop runs.
# The short sleep stands in for a slow task.
for i in tqdm(iterable=range(20)):
    time.sleep(0.15)

In [None]:
import time

from tqdm.notebook import tqdm

# Notebook flavour of tqdm with a customised bar: a label in front (desc),
# the expected number of iterations (total), the bar width (ncols), the unit
# name shown in the rate (unit) and the bar colour (colour).
for i in tqdm(range(20), total=20, desc="Downloading", unit="file", colour="blue", ncols=500):
    time.sleep(0.15)


In [None]:

import pandas as pd
import time
import numpy as np

# Build a 100-row DataFrame with one column of random integers. randint's
# upper bound is exclusive, so the values fall between 1 and 99.
df = pd.DataFrame(
    data={"numbers": np.random.randint(low=1, high=100, size=100)},
)

# Last expression of the cell, so the notebook renders the first five rows.
df.head(n=5)


In [None]:
# Apply Function WITHOUT tqdm

def slow_square(x):
    """Return ``x`` squared after a short artificial delay.

    The 0.05-second pause stands in for an expensive per-element computation,
    so that applying this function to a whole column takes noticeable time.
    """
    time.sleep(0.05)
    squared = x ** 2
    return squared

# Call slow_square once per element; nothing is shown while this runs.
df["squared"] = df["numbers"].map(slow_square)


In [None]:
# Apply Function WITH tqdm

# Re-import the plain tqdm: an earlier cell bound the name `tqdm` to
# tqdm.notebook.tqdm.
from tqdm import tqdm

# Register tqdm's progress-aware methods on pandas objects; `desc` is the
# label displayed in front of the progress bar.
tqdm.pandas(desc="Processing Dataframe for EST 389: ")  # Enable progress_apply

# Same computation as the plain .apply(slow_square), but with a progress bar
df["squared"] = df["numbers"].progress_apply(slow_square)
