# Intro to Dataframes

In [1]:
import pandas as pd 

df = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]], columns=["A", "B", "C"], index=["x", "y", "z"])
# You can specify the columns and indexes as shown above

In [2]:
# To see the first 5 rows
df.head()

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6
z,7,8,9


In [3]:
# To see the last 5 rows
df.tail()

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6
z,7,8,9


In [4]:
# To acsess columns
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [5]:
# To acsess indexes
df.index.to_list()

['x', 'y', 'z']

In [6]:
# To get information about the dataframe
df.info()
"""
The output describes a pandas DataFrame with 3 rows (indexed x to z) and 3 columns (A, B, and C). 
All columns contain integer (int64) values with no missing data, and the entire DataFrame uses 
about 96 bytes of memory.
"""

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, x to z
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
dtypes: int64(3)
memory usage: 96.0+ bytes


'\nThe output describes a pandas DataFrame with 3 rows (indexed x to z) and 3 columns (A, B, and C). \nAll columns contain integer (int64) values with no missing data, and the entire DataFrame uses \nabout 96 bytes of memory.\n'

In [7]:
# To get summary statistical information about the data
df.describe()

Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


In [8]:
# To find how many unique values are in each column
df.nunique()
# For specific column you can use df["A"].nunique()

A    3
B    3
C    3
dtype: int64

In [9]:
# To know the shape of the dataframe (row x column)
df.shape

(3, 3)

In [10]:
# To know the number of items in the Dataframe
df.size

9

# Loading in Dataframes from Files

In [11]:
weather = pd.read_csv('data/weather.csv')

weather.head()

Unnamed: 0.1,Unnamed: 0,data,date,param,siteid
0,1,0.0,2003-01-01,Precipitation,ACRE
1,2,0.0,2003-01-02,Precipitation,AlbertLea
2,3,11.32,2003-01-03,Precipitation,Ames
3,4,0.0,2003-01-04,Precipitation,Antigo
4,5,3.04,2003-01-05,Precipitation,Appleton


# Accessing Data with Pandas

In [12]:
# To get full summary of the data
# You can also access the first 10 (x) rows by weather.head(x)
weather

Unnamed: 0.1,Unnamed: 0,data,date,param,siteid
0,1,0.00,2003-01-01,Precipitation,ACRE
1,2,0.00,2003-01-02,Precipitation,AlbertLea
2,3,11.32,2003-01-03,Precipitation,Ames
3,4,0.00,2003-01-04,Precipitation,Antigo
4,5,3.04,2003-01-05,Precipitation,Appleton
...,...,...,...,...,...
655357,655358,-1.96,2015-12-28,Minimum temperature,SEPAC
655358,655359,-3.86,2015-12-29,Minimum temperature,Seymour
655359,655360,-4.50,2015-12-30,Minimum temperature,Sutherland
655360,655361,-5.22,2015-12-31,Minimum temperature,Urbana


In [13]:
# To access random data
weather.sample(10)

Unnamed: 0.1,Unnamed: 0,data,date,param,siteid
342149,342150,0.0,2003-08-10,Precipitation,AlbertLea
433134,433135,28.52,2005-09-02,Maximum temperature,Urbana
200448,200449,5.32,2005-09-17,Precipitation,Monmouth
85027,85028,,2014-10-04,Minimum temperature,Kellogg
58071,58072,0.0,2005-12-19,Precipitation,Kellogg
480676,480677,3.09,2005-10-24,Minimum temperature,Lexington
4723,4724,0.0,2015-12-07,Precipitation,NEPAC
187457,187458,0.0,2009-02-24,Precipitation,Brookings
242053,242054,17.209999,2015-08-09,Minimum temperature,AlbertLea
599539,599540,0.0,2006-03-11,Precipitation,Lancaster


In [14]:
# To acess specific values using loc & iloc
# loc allows you to filter by rows and columns ie. weather.loc[[rows], [columns]]
weather.loc[[1,2,10], ["Unnamed: 0", "param"]]

Unnamed: 0.1,Unnamed: 0,param
1,2,Precipitation
2,3,Precipitation
10,11,Precipitation


In [15]:
# Using iloc (Integer Location)
"""
Definition:
.iloc (short for integer location) is used to access rows and columns in a DataFrame or Series 
by their integer position — similar to how you use list indexing in Python.

Key Points

Position-based indexer (uses numerical positions, not labels).
Can access single cells, entire rows, columns, or slices.
0-based indexing → first row/column is position 0.
End index in a slice is exclusive (like standard Python slicing).
Can handle lists, ranges, or boolean arrays for complex selections.

Can be used for both reading and writing data.

Syntax
df.iloc[row_index, column_index]

Element	Description
row_index	Row position(s) — int, list, slice, or boolean mask
column_index	Column position(s) — int, list, slice, or boolean mask

"""
# Getting a range of values 
weather.iloc[0:3,0]

# Getting a single row df.iloc[1]          

# Getting multiple rows df.iloc[0:2]        

# Getting specific rows and columns df.iloc[0:2, 0:2]  

# Getting non continious rows and columns df.iloc[[0, 2], [1, 2]]   

# Modify a value df.iloc[2, 1] = 23   

0    1
1    2
2    3
Name: Unnamed: 0, dtype: int64

In [16]:
# A specific way to get more optimized values
"""
Definition:
.at is used to access or modify a single value in a DataFrame or Series using row and column labels.

Key Points:
Label-based scalar accessor (uses index labels and column names)
Optimized for fast single-value access — faster than .loc
Works only for one cell at a time (not slices or multiple rows/columns)
Commonly used for getting or setting a specific cell value

Syntax:
df.at[row_label, column_label]
"""
weather.at[1, "Unnamed: 0"]

np.int64(2)

In [17]:
# A specific way to get more optimized values
"""
Definition:
.iat is used to access or modify a single value in a DataFrame or Series using integer positions.

Key Points:
Position-based scalar accessor (uses row and column indices)
Optimized for fast single-value access — faster than .iloc
Works only for one cell at a time
Commonly used when working with numerical row and column positions

Syntax:
df.iat[row_index, column_index]
"""
weather.iat[4,0]

np.int64(5)

In [18]:
# Sortign values with pandas
# It is ascending by default
weather.sort_values(["data", "date"], ascending=[0,0])

Unnamed: 0.1,Unnamed: 0,data,date,param,siteid
14870,14871,195.070007,2004-09-15,Precipitation,DixonSprings
500006,500007,180.020004,2006-09-23,Precipitation,Oceana
230712,230713,167.940002,2010-07-23,Precipitation,Lexington
231082,231083,155.330002,2011-07-28,Precipitation,McNay
189462,189463,155.070007,2014-08-22,Precipitation,PIT
...,...,...,...,...,...
75985,75986,,2003-01-02,Maximum temperature,Rosemount
80734,80735,,2003-01-02,Minimum temperature,Appleton
71235,71236,,2003-01-01,Precipitation,Monroe
75984,75985,,2003-01-01,Maximum temperature,Rhinelander


In [19]:
# You can iterate b/n values in a dataframe using ofr loop
# This is not recomended as it consumes memory (Used 1m 57.8s)
#for index, rows in weather.iterrows():
#   print(index)
#   print(rows)

# Filtering data

In [21]:
cancer = pd.read_csv('data/breast-cancer.csv')
cancer.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [22]:
# Checking general information of the data
cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [None]:
# Starting to filter data
cancer.loc[cancer['radius_mean'] > 15, ['id', 'diagnosis', 'radius_mean', 'texture_mean']]
""" 
This line uses loc to filter and select data from the DataFrame. The expression cancer['radius_mean'] > 15 
creates a Boolean mask that keeps only the rows where the radius_mean value is greater than 15, and the list 
['id', 'diagnosis', 'radius_mean', 'texture_mean'] specifies the columns to display. Overall, it returns a 
filtered subset of the dataset showing only those entries with a large radius_mean along with their corresponding 
ID, diagnosis, and texture_mean.
"""
# Alternative syntax
# cancer[cancer['radius_mean'] > 15]['id', 'diagnosis', 'radius_mean', 'texture_mean']

Unnamed: 0,id,diagnosis,radius_mean,texture_mean
0,842302,M,17.99,10.38
1,842517,M,20.57,17.77
2,84300903,M,19.69,21.25
4,84358402,M,20.29,14.34
6,844359,M,18.25,19.98
...,...,...,...,...
563,926125,M,20.92,25.09
564,926424,M,21.56,22.39
565,926682,M,20.13,28.25
566,926954,M,16.60,28.08
