# Intro to Dataframes

In [2]:
import pandas as pd 

df = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]], columns=["A", "B", "C"], index=["x", "y", "z"])
# You can specify the columns and indexes as shown above

In [3]:
# To see the first 5 rows
df.head()

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6
z,7,8,9


In [4]:
# To see the last 5 rows
df.tail()

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6
z,7,8,9


In [5]:
# To acsess columns
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [6]:
# To acsess indexes
df.index.to_list()

['x', 'y', 'z']

In [7]:
# To get information about the dataframe
df.info()
"""
The output describes a pandas DataFrame with 3 rows (indexed x to z) and 3 columns (A, B, and C). 
All columns contain integer (int64) values with no missing data, and the entire DataFrame uses 
about 96 bytes of memory.
"""

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, x to z
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
dtypes: int64(3)
memory usage: 96.0+ bytes


'\nThe output describes a pandas DataFrame with 3 rows (indexed x to z) and 3 columns (A, B, and C). \nAll columns contain integer (int64) values with no missing data, and the entire DataFrame uses \nabout 96 bytes of memory.\n'

In [8]:
# To get summary statistical information about the data
df.describe()

Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


In [9]:
# To find how many unique values are in each column
df.nunique()
# For specific column you can use df["A"].nunique()

A    3
B    3
C    3
dtype: int64

In [10]:
# To know the shape of the dataframe (row x column)
df.shape

(3, 3)

In [11]:
# To know the number of items in the Dataframe
df.size

9

# Loading in Dataframes from Files

In [12]:
weather = pd.read_csv('data/weather.csv')

weather.head()

Unnamed: 0.1,Unnamed: 0,data,date,param,siteid
0,1,0.0,2003-01-01,Precipitation,ACRE
1,2,0.0,2003-01-02,Precipitation,AlbertLea
2,3,11.32,2003-01-03,Precipitation,Ames
3,4,0.0,2003-01-04,Precipitation,Antigo
4,5,3.04,2003-01-05,Precipitation,Appleton


# Accessing Data with Pandas

In [13]:
# To get full summary of the data
# You can also access the first 10 (x) rows by weather.head(x)
weather

Unnamed: 0.1,Unnamed: 0,data,date,param,siteid
0,1,0.00,2003-01-01,Precipitation,ACRE
1,2,0.00,2003-01-02,Precipitation,AlbertLea
2,3,11.32,2003-01-03,Precipitation,Ames
3,4,0.00,2003-01-04,Precipitation,Antigo
4,5,3.04,2003-01-05,Precipitation,Appleton
...,...,...,...,...,...
655357,655358,-1.96,2015-12-28,Minimum temperature,SEPAC
655358,655359,-3.86,2015-12-29,Minimum temperature,Seymour
655359,655360,-4.50,2015-12-30,Minimum temperature,Sutherland
655360,655361,-5.22,2015-12-31,Minimum temperature,Urbana


In [14]:
# To access random data
weather.sample(10)

Unnamed: 0.1,Unnamed: 0,data,date,param,siteid
435303,435304,25.309999,2011-08-11,Maximum temperature,Arlington
373708,373709,4.74,2011-12-31,Precipitation,Appleton
532703,532704,21.620001,2005-03-26,Maximum temperature,Manhattan
397837,397838,-7.22,2013-01-18,Minimum temperature,MorrisMN
432888,432889,3.69,2004-12-30,Maximum temperature,MorrisIL
349416,349417,27.58,2010-07-02,Maximum temperature,ACRE
652052,652053,-6.3,2006-12-10,Minimum temperature,Ames
563176,563177,22.18,2010-08-26,Maximum temperature,Urbana
132973,132974,3.71,2003-01-02,Maximum temperature,Perry
588611,588612,0.0,2015-04-12,Precipitation,SEPAC


In [None]:
# To acess specific values using loc & iloc
# loc allows you to filter by rows and columns ie. weather.loc[[rows], [columns]]
weather.loc[[1,2,10], ["Unnamed: 0", "param"]]

Unnamed: 0.1,Unnamed: 0,param
1,2,Precipitation
2,3,Precipitation
10,11,Precipitation


In [None]:
# Using iloc (Integer Location)
"""
Definition:
.iloc (short for integer location) is used to access rows and columns in a DataFrame or Series 
by their integer position — similar to how you use list indexing in Python.

Key Points

Position-based indexer (uses numerical positions, not labels).
Can access single cells, entire rows, columns, or slices.
0-based indexing → first row/column is position 0.
End index in a slice is exclusive (like standard Python slicing).
Can handle lists, ranges, or boolean arrays for complex selections.

Can be used for both reading and writing data.

Syntax
df.iloc[row_index, column_index]

Element	Description
row_index	Row position(s) — int, list, slice, or boolean mask
column_index	Column position(s) — int, list, slice, or boolean mask

"""
# Getting a range of values 
weather.iloc[0:3,0]

# Getting a single row df.iloc[1]          

# Getting multiple rows df.iloc[0:2]        

# Getting specific rows and columns df.iloc[0:2, 0:2]  

# Getting non continious rows and columns df.iloc[[0, 2], [1, 2]]   

# Modify a value df.iloc[2, 1] = 23   

0    1
1    2
2    3
Name: Unnamed: 0, dtype: int64

In [None]:
# A specific way to get more optimized values
"""
Definition:
.at is used to access or modify a single value in a DataFrame or Series using row and column labels.

Key Points:
Label-based scalar accessor (uses index labels and column names)
Optimized for fast single-value access — faster than .loc
Works only for one cell at a time (not slices or multiple rows/columns)
Commonly used for getting or setting a specific cell value

Syntax:
df.at[row_label, column_label]
"""
weather.at[1, "Unnamed: 0"]

np.int64(2)

In [None]:
# A specific way to get more optimized values
"""
Definition:
.iat is used to access or modify a single value in a DataFrame or Series using integer positions.

Key Points:
Position-based scalar accessor (uses row and column indices)
Optimized for fast single-value access — faster than .iloc
Works only for one cell at a time
Commonly used when working with numerical row and column positions

Syntax:
df.iat[row_index, column_index]
"""
weather.iat[4,0]

np.int64(5)