# Pandas core data structure: `DataFrame`

## Creating a Dataframe
### Creating a Dataframe with Python built-in types


In [None]:
import pandas as pd

# A dict of equal-length lists: every key becomes a column label and every
# list supplies that column's values, top to bottom.
print("Create dataframe from a dictionary of lists.")

data = {
    "campus": [
        "Stony Brook University",
        "University at Albany",
        "Binghamton University",
    ],
    "city": ["Stony Brook", "Albany", "Binghamton"],
    "region": ["Long Island", "Capital Region", "Southern Tier"],
}

# Equivalent spelling: df = pd.DataFrame.from_dict(data)
df = pd.DataFrame(data)

print("SUNY campuses:", df, sep="\n")


In [None]:
print(
    "Create a DataFrame from a list of dictionaries.\n"
    "This format is very common when reading JSON or API responses.\n"
)

# One dict per row; the keys shared by the dicts become the column labels.
courses = [
    {"course": "EST 371", "title": "Data Science", "credits": 3},
    {"course": "CSE 114", "title": "Introduction to Programming", "credits": 4},
    {"course": "AMS 315", "title": "Data Analysis", "credits": 3},
]
df = pd.DataFrame(data=courses)

print("Course offerings:", df, sep="\n")


In [None]:
print("Create DataFrame from a List of Lists")

# Each inner list is one row, in the order given by `columns` below.
columns = ["city", "population", "region"]
data = [
    ["New York City", 8804190, "NYC Metro"],
    ["Buffalo", 278349, "Western NY"],
    ["Rochester", 211328, "Finger Lakes"],
]

# Without an explicit `columns` argument the labels would default to 0, 1, 2.
df = pd.DataFrame(data=data, columns=columns)

print("New York State cities:", df, sep="\n")


In [None]:
print(
    "Create an empty dataframe.\n"
    "Useful for:collecting survey responses, and appending rows later.\n"
)

# Only column labels are supplied, so the frame has a schema but zero rows.
survey_columns = ["student_id", "major", "year", "campus"]
df = pd.DataFrame(columns=survey_columns)

print("Empty student survey DataFrame:", df, sep="\n")

# The `=` specifier makes the f-string print the expression and its value.
print(f"Check whether the dataframe is empty: {df.empty=}")


In [None]:
print("DataFrame with a Custom Index\n")

# Row labels: one campus name per row, replacing the default 0, 1, 2 index.
index = ["Stony Brook University", "University at Albany", "Binghamton University"]

# Column data, listed in the same order as the labels in `index`.
data = {
    "tuition_in_state": [10757, 10560, 10410],
    "tuition_out_state": [28717, 27930, 27230],
}

df = pd.DataFrame(data=data, index=index)

print("SUNY tuition data:", df, sep="\n")


### Creating a Dataframe with pandas Series


In [None]:
print(
    "Create a DataFrame from a single Series. \n"
    "Series index becomes the DataFrame index, and Series name becomes the column name.\n"
)

# Keys of the dict become the Series index; values become the data.
student_counts = pd.Series(
    {
        "Stony Brook University": 26000,
        "University at Albany": 17000,
        "Binghamton University": 18000,
    },
    name="student_count",
)
print(student_counts)

In [None]:
# The Series index becomes the row index and the Series name becomes the
# label of the single column.
df = pd.DataFrame(data=student_counts)

print("DataFrame from a single Series:", df, sep="\n")


In [None]:
print(
    "Create a DataFrame from Multiple Series (Same Index).\n"
    "Multiple Series with the same index align automatically by index.\n"
)

# Both Series carry the same campus labels, so their rows line up one-to-one.
campus_index = ["Stony Brook University", "University at Albany", "Binghamton University"]

enrollment = pd.Series([26000, 17000, 18000], index=campus_index, name="enrollment")
faculty = pd.Series([2800, 1700, 1900], index=campus_index, name="faculty")

# Each Series is stored under its own name as the column label.
df = pd.DataFrame({series.name: series for series in (enrollment, faculty)})

print("DataFrame from multiple Series:", df, sep="\n")


In [None]:
print(
    "Series with Different Indexes (Automatic Alignment).\n"
    "Pandas aligns data by index and fills missing values with NaN.\n"
)

# This Series has no entry for "University at Albany", so that row gets NaN
# in the graduation_rate column once the Series are combined.
graduation_rate = pd.Series(
    {"Stony Brook University": 0.75, "Binghamton University": 0.72},
    name="graduation_rate",
)

columns_by_name = {
    "enrollment": enrollment,
    "faculty": faculty,
    "graduation_rate": graduation_rate,
}
df = pd.DataFrame(columns_by_name)

print("DataFrame with misaligned Series:", df, sep="\n")


In [None]:
print("Create a DataFrame by Adding Series One by One.")

# Start from an empty frame and attach one column at a time; each Series is
# stored under its own name ("enrollment", then "faculty").
df = pd.DataFrame()
for column in (enrollment, faculty):
    df[column.name] = column

print("DataFrame built incrementally:", df, sep="\n")


## Dataframe's attributes
### Common attributes

In [None]:
import pandas as pd

# One row per NYC borough; the lists are aligned position by position.
borough_data = {
    "borough": ["Manhattan", "Brooklyn", "Queens", "Bronx", "Staten Island"],
    "population_millions": [1.63, 2.58, 2.27, 1.44, 0.49],
    "area_sq_miles": [22.8, 69.4, 108.1, 42.1, 58.4],
}
df = pd.DataFrame(borough_data)

print("Original DataFrame:", df, sep="\n")


In [None]:
# .T transposes the DataFrame: row labels become column labels and vice versa.
print("Transposed DataFrame (df.T):", df.T, sep="\n")


In [None]:
# .columns holds the column labels as a pandas Index object,
# which is not the same thing as a plain Python list.
print("Column labels:", df.columns, sep="\n")


In [None]:
# .dtypes lists the data type of every column; checking it early helps
# explain why an operation behaves the way it does.
print("Data types of each column:", df.dtypes, sep="\n")


In [None]:
# .empty is True when the DataFrame has no rows or no columns.
print("Is df empty?", df.empty, sep="\n")


In [None]:
# A DataFrame constructed with no data, for comparison with the populated df.
empty_df = pd.DataFrame()

# .empty reports True here because there is nothing in the frame.
print("Is empty_df empty?", empty_df.empty, sep="\n")


In [None]:
# .index holds the row labels; when none are given at construction time,
# pandas uses a RangeIndex that starts at 0.
print("Row index:", df.index, sep="\n")


In [None]:
# .shape is a (number_of_rows, number_of_columns) tuple and is usually the
# first thing worth checking when debugging.
print("Shape of the DataFrame (rows, columns):", df.shape, sep="\n")


In [None]:
# .ndim is the number of dimensions; for a DataFrame it is always 2.
print("Number of dimensions (ndim):", df.ndim, sep="\n")


In [None]:
# .values exposes the data as a NumPy array; because the columns here mix
# strings and floats, the array's dtype is object.
print("NumPy representation using df.values:", df.values, sep="\n")


In [None]:
# .to_numpy() is the recommended modern replacement for .values.
print("NumPy representation using df.to_numpy():", df.to_numpy(), sep="\n")


### Quick structural sanity checks

In [None]:
# A quick structural overview of the data: size, labels, and column types.
print("Sanity check summary:")

# These three fit on one line each, so print them as "label value" pairs.
one_line_checks = (
    ("Shape:", df.shape),
    ("Columns:", df.columns),
    ("Index:", df.index),
)
for label, value in one_line_checks:
    print(label, value)

# dtypes spans several lines, so it goes below its heading.
print("Data types:", df.dtypes, sep="\n")


### Two ways of selecting a specific column as a Series

In [None]:
# The first way: square-bracket indexing with the exact column label.
# The label must match the column name character for character (the column is
# "population_millions", with an underscore); a label that does not exist
# raises KeyError.
df['population_millions']


In [None]:
# The second way: attribute (dot) access, using the column label as the
# attribute name.
# NOTE: this shortcut only works when the label is a valid Python identifier
# (as population_millions is); otherwise use square-bracket indexing.
df.population_millions

### Dataframe's indexing attributes

In [1]:
import pandas as pd

# Bird species data: one row per species, one column per characteristic.
bird_data = {
    "species": ["Sparrow", "Robin", "Blue Jay", "Cardinal", "Pigeon"],
    "avg_weight_g": [24, 77, 100, 45, 238],
    "wingspan_cm": [20, 31, 43, 30, 64],
}

# Use the species name as the row index so the following cells can
# demonstrate label-based indexing.
df = pd.DataFrame(bird_data).set_index("species")

print("Bird species DataFrame:", df, sep="\n")


Bird species DataFrame:
          avg_weight_g  wingspan_cm
species                            
Sparrow             24           20
Robin               77           31
Blue Jay           100           43
Cardinal            45           30
Pigeon             238           64


In [None]:
# .at reads ONE scalar value by label.
# It is the fast path for a single label-based lookup.
# Both the row label and the column label must match exactly;
# a label that does not exist raises KeyError.
row_label, column_label = "Robin", "avg_weight_g"
value = df.at[row_label, column_label]

print("Average weight of Robin (grams) using .at:", value, sep="\n")


In [None]:
# .iat reads ONE scalar value by integer position.
# Labels play no part: only the (row, column) positions matter, and both
# are counted from 0. A position that does not exist raises IndexError.
row_pos, col_pos = 0, 0
value = df.iat[row_pos, col_pos]

print("Value at row 0, column 0 using .iat:", value, sep="\n")


In [None]:
# .loc selects rows and columns by label.
# Passing lists for both axes picks two species and a single column.
row_labels = ["Blue Jay", "Cardinal"]
column_labels = ["wingspan_cm"]
subset = df.loc[row_labels, column_labels]

print("Wingspan of Blue Jay and Cardinal using .loc:", subset, sep="\n")

In [None]:
# When only columns are being selected, .loc can be left out.
# NOTE: indexing with a LIST of labels (double square brackets) returns a
# DataFrame, whereas a single label (single square brackets) returns a Series.
selected_columns = ['avg_weight_g']
weight_df = df[selected_columns]

print("Average weight of the birds without using .loc", weight_df, sep="\n")


In [2]:

# .loc can also be used with boolean conditions.
# The comparison produces a True/False Series (a boolean mask); .loc keeps
# only the rows where the mask is True.
# Here we select birds with average weight greater than 80 grams.
# This pattern is extremely common in data science.

heavy_birds = df.loc[df["avg_weight_g"] > 80]

print("Birds with average weight > 80 grams:")
print(heavy_birds)


Birds with average weight > 80 grams:
          avg_weight_g  wingspan_cm
species                            
Blue Jay           100           43
Pigeon             238           64


In [None]:
# .iloc selects purely by integer position.
# It follows Python slicing rules, so the end of each slice is exclusive:
# [:3] is rows 0, 1, 2 and [:2] is columns 0, 1.
subset = df.iloc[:3, :2]

print("First three birds and first two columns using .iloc:", subset, sep="\n")


### More commonly, we omit `.loc` when doing Boolean indexing

In [3]:
# Boolean indexing WITHOUT .loc: given a boolean mask inside plain square
# brackets, pandas treats it as a row filter.
bird_data = {
    "species": ["Sparrow", "Robin", "Blue Jay", "Cardinal", "Pigeon"],
    "avg_weight_g": [24, 77, 100, 45, 238],
    "wingspan_cm": [20, 31, 43, 30, 64],
}
df = pd.DataFrame(bird_data)

# True for each row whose average weight exceeds 80 grams.
is_heavy = df["avg_weight_g"] > 80

# Equivalent to df.loc[df["avg_weight_g"] > 80]
heavy_birds_no_loc = df[is_heavy]

print("Heavy birds WITHOUT using .loc:", heavy_birds_no_loc, sep="\n")


Heavy birds WITHOUT using .loc:
    species  avg_weight_g  wingspan_cm
2  Blue Jay           100           43
4    Pigeon           238           64


In [5]:
# Boolean indexing with multiple conditions.
# Parentheses are REQUIRED around each condition: & and | bind more tightly
# than comparison operators such as >.
# Note: & means AND, and | means OR

# Both conditions must hold for a row to be kept, because they are joined with &.
selected_birds = df[
    (df["avg_weight_g"] > 50) &
    (df["wingspan_cm"] > 30)
]

print("Birds with weight > 50g AND wingspan > 30cm:")
print(selected_birds)


Birds with weight > 50g OR wingspan > 30cm:
    species  avg_weight_g  wingspan_cm
1     Robin            77           31
2  Blue Jay           100           43
4    Pigeon           238           64


In [7]:
# Negating a condition with ~ (logical NOT).
# Handy when the condition that is easy to write is the one that returns True
# (for example a regex or some other complex match): rather than rewriting
# the logic, flip the resulting mask.

# True where the average weight is greater than 50 grams ...
over_50g = df["avg_weight_g"] > 50

# ... so ~over_50g keeps the birds that are NOT over 50 grams (weight <= 50).
selected_birds = df[~over_50g]

print("Birds with average weight NOT greater than 50 grams:", selected_birds, sep="\n")

Birds with average weight NOT greater than 50 grams:
    species  avg_weight_g  wingspan_cm
0   Sparrow            24           20
3  Cardinal            45           30


In [9]:
# Keep the birds whose species name does NOT contain the word "Jay".
# .str.contains builds the "contains Jay" mask; ~ inverts it.
contains_jay = df['species'].str.contains("Jay")
selected_birds_no_jay = df[~contains_jay]

print("Birds whose species name does NOT contain 'Jay':", selected_birds_no_jay, sep="\n")


Birds whose species name does NOT contain 'Jay':
    species  avg_weight_g  wingspan_cm
0   Sparrow            24           20
1     Robin            77           31
3  Cardinal            45           30
4    Pigeon           238           64


In [10]:
# another example of using ~ with regex we wrote last week

import re

# Compiled once at definition time instead of on every call.
# Shape: three digits, an optional separator (space or hyphen), three digits,
# the SAME separator again (backreference \1), then four digits.
_PHONE_PATTERN = re.compile(r"\d{3}([ -]?)\d{3}\1\d{4}")


def is_valid_phone_regex(phone: str) -> bool:
    """Return True if *phone* is a 10-digit number with consistent separators.

    Accepted forms are ``1234567890``, ``123-456-7890`` and ``123 456 7890``.
    Mixed separators such as ``123-456 7890`` are rejected. Leading and
    trailing whitespace is ignored.

    Args:
        phone: The candidate phone number.

    Returns:
        True when the stripped value matches the pattern; False otherwise,
        including for non-string values (e.g. ``None`` or ``NaN`` coming from
        a column passed through ``Series.apply``).
    """
    # Missing values reach this function as None/float NaN when it is applied
    # to a column; they are not valid phone numbers, so return False
    # instead of failing on .strip().
    if not isinstance(phone, str):
        return False
    # fullmatch anchors at both ends, so no ^...$ is needed in the pattern.
    return _PHONE_PATTERN.fullmatch(phone.strip()) is not None


In [11]:
# create a dataframe of phone numbers
import pandas as pd

# Sample contact data: two well-formed phone numbers (Alice, David) and two
# malformed ones (Bob, Charlie).
phone_records = {
    "name": ["Alice", "Bob", "Charlie", "David"],
    "major": ['EST', 'CSE', 'APM', "EST"],
    "phone_number": ["123-456-7890", "12A-NBDG-eru", "302-567-75490", "789 345 2486"],
}
phone_df = pd.DataFrame(phone_records)
print(phone_df)

      name major   phone_number
0    Alice   EST   123-456-7890
1      Bob   CSE   12A-NBDG-eru
2  Charlie   APM  302-567-75490
3    David   EST   789 345 2486


In [12]:
# Goal: the rows with an INVALID phone number.
# The only helper available recognizes VALID numbers, so build the "valid"
# mask with it and invert that mask with ~.
is_valid = phone_df["phone_number"].apply(is_valid_phone_regex)
invalid_phone_numbers = phone_df[~is_valid]
print(invalid_phone_numbers)

      name major   phone_number
1      Bob   CSE   12A-NBDG-eru
2  Charlie   APM  302-567-75490


### [Optional] Alternative way to do Boolean filtering: `query`
`query` is optional; I generally prefer plain Boolean indexing, e.g. `df[df["column"] > value]`.

In [None]:
# .query takes the filter as a string; column names are written bare.
weight_filter = "avg_weight_g > 80"
heavy_birds = df.query(weight_filter)

print("Birds with average weight > 80 grams:", heavy_birds, sep="\n")

In [None]:
# Multiple conditions with df.query: each condition is parenthesized and
# the two are joined with & (AND).
combined_filter = "(avg_weight_g > 50) & (wingspan_cm > 30)"
selected_birds = df.query(combined_filter)

print("Birds with weight > 50g AND wingspan > 30cm:", selected_birds, sep="\n")

In [None]:
# Referencing a Python variable inside a df.query expression: prefix it with @.
threshold_weight = 80

# An f-string such as f"avg_weight_g > {threshold_weight}" would also work,
# but pasting values into the expression text is fragile and can corrupt the
# expression, so prefer the @ form.
heavy_birds = df.query("avg_weight_g > @threshold_weight")

print(f"Birds with average weight > {threshold_weight} grams:", heavy_birds, sep="\n")
