# Pandas
A powerful library for data manipulation and analysis in Python. It provides data structures like Series and DataFrame to handle structured data efficiently. Data cleaning, pre-processing, and analysis tasks are simplified using Pandas.

## Data Frames & Series

In [1]:
import pandas as pd

df = pd.read_csv("student_data.csv")
print(df)

print(type(df))  # <class 'pandas.core.frame.DataFrame'>

# Dataframe(Tabular Data) is a 2-dimensional labeled data structure with columns of potentially different types. It is similar to a spreadsheet or SQL table, or a dictionary of Series objects. It's the core data structure in Pandas for data manipulation and analysis.
# Row: sample entry/record
# Column: attribute/feature

student_ids = df['StudentID']
print(student_ids)

print(type(student_ids))  # <class 'pandas.core.series.Series'>
# Series(Single row or column) is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.). It is similar to a column in a spreadsheet or a SQL table. Each element in a Series has an associated label (index) that can be used to access the data.

   StudentID          FullName  Data Structure Marks  Algorithm Marks  \
0     PH1001       Alif Rahman                  85.0             85.0   
1     PH1002     Fatima Akhter                  92.0             92.0   
2     PH1003     Imran Hossain                  88.0             88.0   
3     PH1004  Jannatul Ferdous                  78.0             78.0   
4     PH1005       Kamal Uddin                   NaN              NaN   
5     PH1006       Laila Begum                  75.0             75.0   
6     PH1007    Mahmudul Hasan                  80.0             80.0   
7     PH1008       Nadia Islam                  81.0             81.0   
8     PH1009        Omar Faruq                  72.0             72.0   
9     PH1010      Priya Sharma                  89.0             89.0   
10    PH1011      Rahim Sheikh                   NaN              NaN   
11    PH1012   Sadia Chowdhury                  85.0             85.0   
12    PH1013      Tanvir Ahmed                  75.

## Different File Loading Methods

In [3]:
# File can be found in txt, csv, excel, json, fast(parquet), sql, html etc. formats

# CSV File Loading
df_csv = pd.read_csv("student_data.csv")

# Excel File Loading
df_excel = pd.read_excel("student_marks.xlsx", sheet_name="Sheet1")

# Parquet File Loading
df_parquet = pd.read_parquet("students.parquet")

# JSON File Loading
df_json = pd.read_json("data.json")


## Basic Operations & Functions

In [None]:
df.head()  # First 5 rows of the dataframe or df.head(10) for first 10 rows

Unnamed: 0,StudentID,FullName,Data Structure Marks,Algorithm Marks,Python Marks,CompletionStatus,EnrollmentDate,Instructor,Location
0,PH1001,Alif Rahman,85.0,85.0,88.0,Completed,2024-01-15,Mr. Karim,Dhaka
1,PH1002,Fatima Akhter,92.0,92.0,,In Progress,2024-01-20,Ms. Salma,Chattogram
2,PH1003,Imran Hossain,88.0,88.0,85.0,Completed,2024-02-10,Mr. Karim,Dhaka
3,PH1004,Jannatul Ferdous,78.0,78.0,82.0,Completed,2024-02-12,Ms. Salma,Sylhet
4,PH1005,Kamal Uddin,,,95.0,In Progress,2024-03-05,Mr. Karim,Chattogram


In [None]:
df.tail()  # Last 5 rows of the dataframe or df.tail(10) for last 10 rows

Unnamed: 0,StudentID,FullName,Data Structure Marks,Algorithm Marks,Python Marks,CompletionStatus,EnrollmentDate,Instructor,Location
15,PH1016,Ziaur Rahman,94.0,94.0,,In Progress,2024-08-21,Ms. Salma,Chattogram
16,PH1017,Afsana Mimi,90.0,90.0,93.0,Completed,2025-09-01,Mr. Karim,Dhaka
17,PH1018,Babul Ahmed,88.0,88.0,85.0,Completed,2025-09-05,Ms. Salma,Sylhet
18,PH1019,Faria Rahman,,,,Not Started,2025-09-15,Mr. David,Chattogram
19,PH1020,Nasir Khan,86.0,86.0,89.0,Completed,2025-10-02,Ms. Salma,Dhaka


In [38]:
df_excel.sample(3)    # Random 3 rows

Unnamed: 0,StudentID,FullName,Data Structure Marks,Algorithm Marks,Python Marks,CompletionStatus,EnrollmentDate,Instructor,Location
12,PH1013,Tanvir Ahmed,75.0,75.0,79.0,Completed,2024-07-02,Mr. David,Dhaka
15,PH1016,Ziaur Rahman,94.0,94.0,,In Progress,2024-08-21,Ms. Salma,Chattogram
7,PH1008,Nadia Islam,81.0,81.0,85.0,Completed,2024-04-22,Ms. Salma,Chattogram


In [40]:
df.columns  # List of column names

Index(['StudentID', 'FullName', 'Data Structure Marks', 'Algorithm Marks',
       'Python Marks', 'CompletionStatus', 'EnrollmentDate', 'Instructor',
       'Location'],
      dtype='object')

In [41]:
df.index    # Row index labels

RangeIndex(start=0, stop=20, step=1)

In [46]:
df.info()
df.describe()   # Statistical summary of numerical columns. Only picks numeric columns by default

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   StudentID             20 non-null     object 
 1   FullName              20 non-null     object 
 2   Data Structure Marks  16 non-null     float64
 3   Algorithm Marks       16 non-null     float64
 4   Python Marks          15 non-null     float64
 5   CompletionStatus      20 non-null     object 
 6   EnrollmentDate        20 non-null     object 
 7   Instructor            20 non-null     object 
 8   Location              20 non-null     object 
dtypes: float64(3), object(6)
memory usage: 1.5+ KB


Unnamed: 0,Data Structure Marks,Algorithm Marks,Python Marks
count,16.0,16.0,15.0
mean,84.0,84.0,85.666667
std,6.501282,6.501282,5.394
min,72.0,72.0,76.0
25%,79.5,79.5,83.0
50%,85.5,85.5,85.0
75%,88.25,88.25,88.5
max,94.0,94.0,95.0


##  Dataframe from Inbuilt Data with Column and Indexes

In [5]:
# Dataframe Creation 

lst = [['Alice', 24], ['Bob', 22], ['Charlie', 23]]
lst_df = pd.DataFrame(lst, columns=['Name', 'Age'],index=[1,2,3])
print(lst_df)
print("----------------------")

tpl= [('David', 25), ('Eva', 21), ('Frank', 24)]
tpl_df = pd.DataFrame(tpl, columns=['Name', 'Age'], index=[4,5,6])
print(tpl_df)
print("----------------------")

dic = {'Name': ['Grace', 'Hannah', 'Ian'], 'Age': [22, 23, 24]}
dic_df = pd.DataFrame(dic, index=[7,8,9])       #Here keys become column names
print(dic_df)
print("----------------------")

lst_dic = [{'Name': 'Jack', 'Age': 26,"city": "New York"}, {'Name': 'Kathy', 'Age': 27,"city": "Los Angeles"}, {'Name': 'Liam', 'Age': 28}]
lst_dic_df = pd.DataFrame(lst_dic, index=[10,11,12])
print(lst_dic_df)


      Name  Age
1    Alice   24
2      Bob   22
3  Charlie   23
----------------------
    Name  Age
4  David   25
5    Eva   21
6  Frank   24
----------------------
     Name  Age
7   Grace   22
8  Hannah   23
9     Ian   24
----------------------
     Name  Age         city
10   Jack   26     New York
11  Kathy   27  Los Angeles
12   Liam   28          NaN


In [None]:
# Accessing Data with Index and Columns
import pandas as pd
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [24, 22, 23, 25],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']
}

df = pd.DataFrame(data)

# Accessing a column
ages = df['Age'] 
print(ages)

# loc - Accessing rows by label/index   loc(index_start:index_end, column_start:column_end)
# iloc - Accessing rows by integer position  iloc(position_start:position_end, position_start:position_end)

# loc vs iloc: loc includes the end index, while iloc excludes the end index. loc uses labels, iloc uses integer positions. loc can accept boolean arrays, iloc cannot. iloc is generally faster than loc for large datasets. iloc is not index label based.

# Using loc to access rows
first_row = df.loc[0]
print(first_row)
print("----------------------")

multiple_rows = df.loc[[1, 2]]
print(multiple_rows)
print("----------------------")

multiple_rows_range = df.loc[1:3]
print(multiple_rows_range)
print("----------------------")

# Using loc to access columns
age_city = df.loc[:, ['Age', 'City']]
print(age_city)
print("----------------------")

# Using loc to access rows with specific columns
specific_data = df.loc[1:3, ['Name', 'City']]
print(specific_data)
print("----------------------")

# Changing index and columns
new_df = df.set_index('Name')
print(new_df)
print("----------------------")

# Now 'Name' column is the index. So to access row of column, use iloc instead of loc as index is no longer integer based
multiple_rows_iloc = new_df.iloc[0:2]
print(multiple_rows_iloc)
print("----------------------")

specific_data_iloc = new_df.iloc[:, 0: 1]
print(specific_data_iloc)
print("----------------------")

# Rename Columns
renamed_df = df.rename(columns={'Name': 'Full Name', 'Age': 'Years'})
print(renamed_df)

0    24
1    22
2    23
3    25
Name: Age, dtype: int64
Name       Alice
Age           24
City    New York
Name: 0, dtype: object
----------------------
      Name  Age         City
1      Bob   22  Los Angeles
2  Charlie   23      Chicago
----------------------
      Name  Age         City
1      Bob   22  Los Angeles
2  Charlie   23      Chicago
3    David   25      Houston
----------------------
   Age         City
0   24     New York
1   22  Los Angeles
2   23      Chicago
3   25      Houston
----------------------
      Name         City
1      Bob  Los Angeles
2  Charlie      Chicago
3    David      Houston
----------------------
         Age         City
Name                     
Alice     24     New York
Bob       22  Los Angeles
Charlie   23      Chicago
David     25      Houston
----------------------
       Age         City
Name                   
Alice   24     New York
Bob     22  Los Angeles
----------------------
         Age
Name        
Alice     24
Bob       22
Charli

## Dropping rows and columns,Assigning values and Iteration

In [34]:
# Changing values
import numpy as np

df = pd.read_csv("student_data.csv")

# 1. Drop
df_dropped_row = df.drop(index=2)  # Drop row with index 2
print(df)
print("----------------------")

# 2. Drop Column
df_dropped_column = df.drop('EnrollmentDate', axis=1, inplace=True)  # Drop 'City' column, inplace=True modifies the original dataframe
print(df)
print("----------------------")

# 3. Assigning Values
df.loc[0, 'FullName'] = "Alice Johnson"  # loc[row_index, column_name] = new_value

# Aggregate Assigning Values
df.loc[1:3, 'Data Structure Marks'] += 5  # Increase 'Data Structure Marks' by 5 for rows with index 1 to 3

df


   StudentID          FullName  Data Structure Marks  Algorithm Marks  \
0     PH1001       Alif Rahman                  85.0             85.0   
1     PH1002     Fatima Akhter                  92.0             92.0   
2     PH1003     Imran Hossain                  88.0             88.0   
3     PH1004  Jannatul Ferdous                  78.0             78.0   
4     PH1005       Kamal Uddin                   NaN              NaN   
5     PH1006       Laila Begum                  75.0             75.0   
6     PH1007    Mahmudul Hasan                  80.0             80.0   
7     PH1008       Nadia Islam                  81.0             81.0   
8     PH1009        Omar Faruq                  72.0             72.0   
9     PH1010      Priya Sharma                  89.0             89.0   
10    PH1011      Rahim Sheikh                   NaN              NaN   
11    PH1012   Sadia Chowdhury                  85.0             85.0   
12    PH1013      Tanvir Ahmed                  75.

Unnamed: 0,StudentID,FullName,Data Structure Marks,Algorithm Marks,Python Marks,CompletionStatus,Instructor,Location
0,PH1001,Alice Johnson,85.0,85.0,88.0,Completed,Mr. Karim,Dhaka
1,PH1002,Fatima Akhter,97.0,92.0,,In Progress,Ms. Salma,Chattogram
2,PH1003,Imran Hossain,93.0,88.0,85.0,Completed,Mr. Karim,Dhaka
3,PH1004,Jannatul Ferdous,83.0,78.0,82.0,Completed,Ms. Salma,Sylhet
4,PH1005,Kamal Uddin,,,95.0,In Progress,Mr. Karim,Chattogram
5,PH1006,Laila Begum,75.0,75.0,78.0,Completed,Ms. Salma,Rajshahi
6,PH1007,Mahmudul Hasan,80.0,80.0,,In Progress,Mr. Karim,Dhaka
7,PH1008,Nadia Islam,81.0,81.0,85.0,Completed,Ms. Salma,Chattogram
8,PH1009,Omar Faruq,72.0,72.0,76.0,Completed,Mr. David,Dhaka
9,PH1010,Priya Sharma,89.0,89.0,88.0,Completed,Ms. Salma,Sylhet


In [None]:
# 4. Iteration (Actually Brodcasted Operation)

for i,series in df.iterrows():
    print(f"Index: {i}")
    print(series)
    print("-----")

for i in df.itertuples():
    print(i)    # Gives each row as a namedtuple

# itertuples is a faster way to iterate through rows as namedtuples

Index: 0
StudentID                      PH1001
FullName                Alice Johnson
Data Structure Marks             85.0
Algorithm Marks                  85.0
Python Marks                     88.0
CompletionStatus            Completed
Instructor                  Mr. Karim
Location                        Dhaka
Name: 0, dtype: object
-----
Index: 1
StudentID                      PH1002
FullName                Fatima Akhter
Data Structure Marks             97.0
Algorithm Marks                  92.0
Python Marks                      NaN
CompletionStatus          In Progress
Instructor                  Ms. Salma
Location                   Chattogram
Name: 1, dtype: object
-----
Index: 2
StudentID                      PH1003
FullName                Imran Hossain
Data Structure Marks             93.0
Algorithm Marks                  88.0
Python Marks                     85.0
CompletionStatus            Completed
Instructor                  Mr. Karim
Location                        Dhaka
Nam

## Sorting a DF

In [None]:
copy_df = df.sort_values(by='Algorithm Marks', ascending=False)  # Sort by 'Algorithm Marks' in descending order
copy_df

# multiple sorts
copy_df_multi = df.sort_values(by=['Algorithm Marks', 'Data Structure Marks'], ascending=[False, True])  # Sort by 'Algorithm Marks' descending, then 'Data Structure Marks' ascending if tie

copy_df_multi = df.sort_values(by=['Algorithm Marks', 'Data Structure Marks'], ascending=[0,1])  # Sort by 'Algorithm Marks' descending, then 'Data Structure Marks' ascending if tie

Unnamed: 0,StudentID,FullName,Data Structure Marks,Algorithm Marks,Python Marks,CompletionStatus,Instructor,Location
15,PH1016,Ziaur Rahman,94.0,94.0,,In Progress,Ms. Salma,Chattogram
1,PH1002,Fatima Akhter,97.0,92.0,,In Progress,Ms. Salma,Chattogram
16,PH1017,Afsana Mimi,90.0,90.0,93.0,Completed,Mr. Karim,Dhaka
9,PH1010,Priya Sharma,89.0,89.0,88.0,Completed,Ms. Salma,Sylhet
17,PH1018,Babul Ahmed,88.0,88.0,85.0,Completed,Ms. Salma,Sylhet
2,PH1003,Imran Hossain,93.0,88.0,85.0,Completed,Mr. Karim,Dhaka
19,PH1020,Nasir Khan,86.0,86.0,89.0,Completed,Ms. Salma,Dhaka
14,PH1015,Wahiduzzaman,86.0,86.0,84.0,Completed,Mr. Karim,Dhaka
11,PH1012,Sadia Chowdhury,85.0,85.0,87.0,Completed,Ms. Salma,Chattogram
0,PH1001,Alice Johnson,85.0,85.0,88.0,Completed,Mr. Karim,Dhaka


## Filtering Data based on condition


In [None]:

# Filter DataFrame for students with 'Not Started' status in 'Course CompletionStatus' column & located in 'Chattogram' in 'Location' column, only get 'StudentID' and 'FullName' columns
not_started_students = df.loc[(df['CompletionStatus'] == 'Not Started') & (df['Location'] == 'Chattogram'), ['StudentID', 'FullName']]
not_started_students

# Core filtering syntax
# df.loc[df['Column_Name'] == condition]. Use &, | for multiple conditions

Unnamed: 0,StudentID,FullName
18,PH1019,Faria Rahman
