In [1]:
import pandas as pd

# Small, deliberately messy student roster used by every cell below.
# It contains a repeated 'Katrín' entry, a missing Name, a missing Age,
# a missing Grade, and both a None and an empty-string Email.
data = dict(
    Name=['Katrín', 'Armando', 'Luna', 'Katrín', 'Carlos', None],
    Age=[19, 21, 20, None, 23, 22],
    Email=['katrin@ie.edu', 'armando@gmail.com', None, 'katrin@ie.edu', 'carlos@ie.edu', ''],
    Major=['Math', 'CS', 'Physics', 'Math', 'CS', 'Physics'],
    Grade=[95, 88, 91, 95, None, 87],
)

# One column per key, in the order the keys were written above.
df = pd.DataFrame.from_dict(data)
print(df)

      Name   Age              Email    Major  Grade
0   Katrín  19.0      katrin@ie.edu     Math   95.0
1  Armando  21.0  armando@gmail.com       CS   88.0
2     Luna  20.0               None  Physics   91.0
3   Katrín   NaN      katrin@ie.edu     Math   95.0
4   Carlos  23.0      carlos@ie.edu       CS    NaN
5     None  22.0                     Physics   87.0


In [4]:
# 🧼 1. Data Cleaning
# 🎯 Task:
# Drop duplicate rows
# Replace empty strings and None in Email with "missing@email.com"
# Remove fully identical rows, then swap empty-string / None e-mails for a
# placeholder address, all in one chain that yields a new frame.
df = (
    df.drop_duplicates()
      .assign(Email=lambda frame: frame['Email'].replace(['', None], 'missing@email.com'))
)
print(df)

      Name   Age              Email    Major  Grade
0   Katrín  19.0      katrin@ie.edu     Math   95.0
1  Armando  21.0  armando@gmail.com       CS   88.0
2     Luna  20.0  missing@email.com  Physics   91.0
3   Katrín   NaN      katrin@ie.edu     Math   95.0
4   Carlos  23.0      carlos@ie.edu       CS    NaN
5     None  22.0  missing@email.com  Physics   87.0


In [5]:
# 🎯 2. Use .loc[] to filter and update
# Task:
# Find all rows where Name == 'Katrín'
# Set their Grade to 100
# Boolean mask of the rows whose Name is exactly 'Katrín' (a missing Name compares False).
is_katrin = df['Name'].eq('Katrín')
# Label-based assignment through .loc writes into df itself, not into a copy.
df.loc[is_katrin, 'Grade'] = 100
print(df)

      Name   Age              Email    Major  Grade
0   Katrín  19.0      katrin@ie.edu     Math  100.0
1  Armando  21.0  armando@gmail.com       CS   88.0
2     Luna  20.0  missing@email.com  Physics   91.0
3   Katrín   NaN      katrin@ie.edu     Math  100.0
4   Carlos  23.0      carlos@ie.edu       CS    NaN
5     None  22.0  missing@email.com  Physics   87.0


In [7]:
# 🔢 3. Use .iloc[] to access rows and columns by position
# Task:
# Get the first three rows and only the Name and Email columns
# Resolve the column positions by name instead of hard-coding [0, 2]:
# the literal positions only mean "Name, Email" while the columns keep their
# original order, and would silently select other columns if the frame's
# layout changes (get_loc raises KeyError if a column is missing instead).
name_email_pos = [df.columns.get_loc(col) for col in ('Name', 'Email')]
# First three rows by position, restricted to those two columns.
df.iloc[0:3, name_email_pos]

Unnamed: 0,Name,Email
0,Katrín,katrin@ie.edu
1,Armando,armando@gmail.com
2,Luna,missing@email.com


In [11]:
# ❓ 4. Find and Fill Missing Data
# Task:
# Find which columns have missing values
# Fill missing Age values with the average age
# Number of missing entries in each column.
print(df.isna().sum())
# Impute missing ages with the mean of the ages that are present
# (Series.mean skips NaN by default).
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)
print(df)

Name     1
Age      1
Email    0
Major    0
Grade    1
dtype: int64
      Name   Age              Email    Major  Grade
0   Katrín  19.0      katrin@ie.edu     Math  100.0
1  Armando  21.0  armando@gmail.com       CS   88.0
2     Luna  20.0  missing@email.com  Physics   91.0
3   Katrín  21.0      katrin@ie.edu     Math  100.0
4   Carlos  23.0      carlos@ie.edu       CS    NaN
5     None  22.0  missing@email.com  Physics   87.0


In [15]:
# 🎯 5. Boolean Filtering
# Task:
# Find students with a Grade >= 90
# Return only Name and Grade
# Row filter and column selection in a single .loc call;
# rows with a missing Grade compare False and are left out.
print(df.loc[df['Grade'].ge(90), ['Name', 'Grade']])

     Name  Grade
0  Katrín  100.0
2    Luna   91.0
3  Katrín  100.0


In [16]:
# 🔢 6. Sorting
# Task:
# Sort the DataFrame by Grade descending
# Highest grade first; rows with a missing Grade go to the bottom
# (na_position='last' is the default, spelled out here for the reader).
df_sorted = df.sort_values('Grade', ascending=False, na_position='last')
print(df_sorted)

      Name   Age              Email    Major  Grade
0   Katrín  19.0      katrin@ie.edu     Math  100.0
3   Katrín  21.0      katrin@ie.edu     Math  100.0
2     Luna  20.0  missing@email.com  Physics   91.0
1  Armando  21.0  armando@gmail.com       CS   88.0
5     None  22.0  missing@email.com  Physics   87.0
4   Carlos  23.0      carlos@ie.edu       CS    NaN


In [17]:
# 👯 7. GroupBy + Aggregation
# Task:
# Calculate the average Grade for each Major
# Mean Grade per Major; missing grades are ignored by the mean.
grouped = df['Grade'].groupby(df['Major']).mean()
print(grouped)

Major
CS          88.0
Math       100.0
Physics     89.0
Name: Grade, dtype: float64


In [18]:
# 📈 8. Create a New Column
# Task:
# Add a column Passed that says True if Grade >= 90, else False
# True where Grade is at least 90; a missing Grade compares False.
df['Passed'] = df['Grade'].ge(90)
print(df)

      Name   Age              Email    Major  Grade  Passed
0   Katrín  19.0      katrin@ie.edu     Math  100.0    True
1  Armando  21.0  armando@gmail.com       CS   88.0   False
2     Luna  20.0  missing@email.com  Physics   91.0    True
3   Katrín  21.0      katrin@ie.edu     Math  100.0    True
4   Carlos  23.0      carlos@ie.edu       CS    NaN   False
5     None  22.0  missing@email.com  Physics   87.0   False


In [19]:
# 🧪 9. Apply a Custom Function
# Task:
# Create a new column Grade_Level:

# "High" if grade ≥ 90

# "Mid" if 80–89

# "Low" otherwise

def grade_level(g):
    """Classify a numeric grade into a performance band.

    Args:
        g: The grade as a number (int or float).

    Returns:
        'High' if g >= 90, 'Mid' if 80 <= g < 90, otherwise 'Low'.
        A NaN grade fails both comparisons and therefore yields 'Low'.
    """
    if g >= 90:
        return 'High'
    # Only the lower bound is checked here, so fractional grades such as 89.5
    # land in 'Mid' instead of falling through a gap between 89 and 90.
    if g >= 80:
        return 'Mid'
    return 'Low'
    
# Label every grade element-wise with grade_level and store the result as a new column.
df['Grade_Level'] = df['Grade'].map(grade_level)
print(df)

      Name   Age              Email    Major  Grade  Passed Grade_Level
0   Katrín  19.0      katrin@ie.edu     Math  100.0    True        High
1  Armando  21.0  armando@gmail.com       CS   88.0   False         Mid
2     Luna  20.0  missing@email.com  Physics   91.0    True        High
3   Katrín  21.0      katrin@ie.edu     Math  100.0    True        High
4   Carlos  23.0      carlos@ie.edu       CS    NaN   False         Low
5     None  22.0  missing@email.com  Physics   87.0   False         Mid


In [20]:
# 🔄 10. Replace Values
# Task:
# Replace "CS" in the Major column with "Computer Science"

# Mapping form of replace: each key is an exact cell value to swap for its value.
df['Major'] = df['Major'].replace({'CS': 'Computer Science'})
print(df)

      Name   Age              Email             Major  Grade  Passed  \
0   Katrín  19.0      katrin@ie.edu              Math  100.0    True   
1  Armando  21.0  armando@gmail.com  Computer Science   88.0   False   
2     Luna  20.0  missing@email.com           Physics   91.0    True   
3   Katrín  21.0      katrin@ie.edu              Math  100.0    True   
4   Carlos  23.0      carlos@ie.edu  Computer Science    NaN   False   
5     None  22.0  missing@email.com           Physics   87.0   False   

  Grade_Level  
0        High  
1         Mid  
2        High  
3        High  
4         Low  
5         Mid  


In [None]:
# 🧪 11. Drop Rows Based on a Condition
# 🎯 Task:
# Drop all rows where Grade is below 60 (you only want to keep passing students)

# Index labels of the rows whose Grade is strictly below 60.
# Rows with a missing Grade are NOT selected, because NaN < 60 is False.
to_drop = df.index[df['Grade'] < 60]
df = df.drop(index=to_drop)
# Alternative below is close but not identical: it would also discard rows
# whose Grade is missing, since NaN >= 60 is False as well.
#df = df[df['Grade'] >= 60]
print(df)

In [30]:
# 🧪 14. Chained Filtering
# 🎯 Task:
# Find all students who are in "Math" and have a Grade above 90

# Both conditions must hold: Major is 'Math' and Grade is strictly above 90.
df.loc[df['Major'].eq('Math') & df['Grade'].gt(90)]

Unnamed: 0,Name,Age,Email,Major,Grade,Passed,Grade_Level
0,Katrín,19.0,katrin@ie.edu,Math,100.0,True,High
3,Katrín,21.0,katrin@ie.edu,Math,100.0,True,High


In [34]:
# 🧪 15. Rename Columns
# 🎯 Task:
# Change "Grade_Level" to "Performance"
# Rename the column on the existing frame (in place); all other columns are untouched.
df.rename({'Grade_Level': 'Performance'}, axis='columns', inplace=True)
print(df)

      Name   Age              Email             Major  Grade  Passed  \
0   Katrín  19.0      katrin@ie.edu              Math  100.0    True   
1  Armando  21.0  armando@gmail.com  Computer Science   88.0   False   
2     Luna  20.0  missing@email.com           Physics   91.0    True   
3   Katrín  21.0      katrin@ie.edu              Math  100.0    True   
4   Carlos  23.0      carlos@ie.edu  Computer Science    NaN   False   
5     None  22.0  missing@email.com           Physics   87.0   False   

  Performance  
0        High  
1         Mid  
2        High  
3        High  
4         Low  
5         Mid  


In [36]:
# 🧪 16. Set Index
# 🎯 Task:
# Make the "Email" column the new index
# Move 'Email' out of the columns and use it as the row index, modifying df in place.
# drop=True is the default: the column does not remain among the data columns.
df.set_index(keys='Email', drop=True, inplace=True)
print(df)

                      Name   Age             Major  Grade  Passed Performance
Email                                                                        
katrin@ie.edu       Katrín  19.0              Math  100.0    True        High
armando@gmail.com  Armando  21.0  Computer Science   88.0   False         Mid
missing@email.com     Luna  20.0           Physics   91.0    True        High
katrin@ie.edu       Katrín  21.0              Math  100.0    True        High
carlos@ie.edu       Carlos  23.0  Computer Science    NaN   False         Low
missing@email.com     None  22.0           Physics   87.0   False         Mid


In [38]:
# Turn the index back into an ordinary column (drop=False keeps it, which is the
# default) and restore a 0..n-1 integer index.
df = df.reset_index(drop=False)
print(df)

               Email     Name   Age             Major  Grade  Passed  \
0      katrin@ie.edu   Katrín  19.0              Math  100.0    True   
1  armando@gmail.com  Armando  21.0  Computer Science   88.0   False   
2  missing@email.com     Luna  20.0           Physics   91.0    True   
3      katrin@ie.edu   Katrín  21.0              Math  100.0    True   
4      carlos@ie.edu   Carlos  23.0  Computer Science    NaN   False   
5  missing@email.com     None  22.0           Physics   87.0   False   

  Performance  
0        High  
1         Mid  
2        High  
3        High  
4         Low  
5         Mid  


In [40]:
# Manually set the Age in the row labelled 3 to 19.0 (scalar write by label).
df.at[3, 'Age'] = 19.0
print(df)

               Email     Name   Age             Major  Grade  Passed  \
0      katrin@ie.edu   Katrín  19.0              Math  100.0    True   
1  armando@gmail.com  Armando  21.0  Computer Science   88.0   False   
2  missing@email.com     Luna  20.0           Physics   91.0    True   
3      katrin@ie.edu   Katrín  19.0              Math  100.0    True   
4      carlos@ie.edu   Carlos  23.0  Computer Science    NaN   False   
5  missing@email.com     None  22.0           Physics   87.0   False   

  Performance  
0        High  
1         Mid  
2        High  
3        High  
4         Low  
5         Mid  


In [44]:
# 🧪 17. Find Duplicates
# 🎯 Task:
# Check if any rows are duplicates (again), and drop them

# Flag every row that repeats an earlier row across all columns
# (keep='first' is the default: the first occurrence is not flagged).
duplicates = df.duplicated(keep='first')
# NOTE: re-running this cell prints 0, because df has already been
# de-duplicated by the assignment below.
print(duplicates.sum())

# Keep the first occurrence of each row and discard the repeats.
df = df.drop_duplicates(keep='first')
print(df)

0
               Email     Name   Age             Major  Grade  Passed  \
0      katrin@ie.edu   Katrín  19.0              Math  100.0    True   
1  armando@gmail.com  Armando  21.0  Computer Science   88.0   False   
2  missing@email.com     Luna  20.0           Physics   91.0    True   
4      carlos@ie.edu   Carlos  23.0  Computer Science    NaN   False   
5  missing@email.com     None  22.0           Physics   87.0   False   

  Performance  
0        High  
1         Mid  
2        High  
4         Low  
5         Mid  


In [46]:
# 🧪 18. Binning (Creating Ranges)
# 🎯 Task:
# Create a column Age_Group that classifies students as:

# "Teen" if age < 20

# "Young Adult" if 20–22

# "Adult" if 23+

def age_bin(a):
    """Classify a numeric age into an age group.

    Args:
        a: The age as a number (int or float).

    Returns:
        'Teen' if a < 20, 'Young Adult' if 20 <= a < 23, otherwise 'Adult'.
        A NaN age fails both comparisons and therefore yields 'Adult'.
    """
    if a < 20:
        return 'Teen'
    # Upper bound is "< 23" rather than "<= 22" so that fractional ages
    # (e.g. a mean-imputed 22.5) stay in 'Young Adult' instead of being
    # pushed into 'Adult' by a gap between 22 and 23.
    if a < 23:
        return 'Young Adult'
    return 'Adult'
    
# Label every age element-wise with age_bin and store the result as a new column.
df['Age_Group'] = df['Age'].map(age_bin)

print(df)

               Email     Name   Age             Major  Grade  Passed  \
0      katrin@ie.edu   Katrín  19.0              Math  100.0    True   
1  armando@gmail.com  Armando  21.0  Computer Science   88.0   False   
2  missing@email.com     Luna  20.0           Physics   91.0    True   
4      carlos@ie.edu   Carlos  23.0  Computer Science    NaN   False   
5  missing@email.com     None  22.0           Physics   87.0   False   

  Performance    Age_Group  
0        High         Teen  
1         Mid  Young Adult  
2        High  Young Adult  
4         Low        Adult  
5         Mid  Young Adult  


In [52]:
# 🧪 19. Pivot Table
# 🎯 Task:
# Make a pivot table that shows average grade by Age Group and Major

# Long form: one mean Grade per (Age_Group, Major) pair that occurs in df.
by_age_and_major = ['Age_Group', 'Major']
grouped = df.groupby(by_age_and_major)['Grade'].mean()
print(grouped)
print('\n')
# Wide form of the same aggregation: Age_Group down the rows, Major across the columns.
pivot = df.pivot_table(values='Grade', index='Age_Group', columns='Major', aggfunc='mean')
print(pivot)


Age_Group    Major           
Adult        Computer Science      NaN
Teen         Math                100.0
Young Adult  Computer Science     88.0
             Physics              89.0
Name: Grade, dtype: float64


Major        Computer Science   Math  Physics
Age_Group                                    
Teen                      NaN  100.0      NaN
Young Adult              88.0    NaN     89.0


In [None]:
# 🧪 20. Export to CSV
# 🎯 Task:
# Save your cleaned dataset to a CSV

# Write the frame to a CSV file in the working directory;
# index=False leaves the row labels out of the file.
df.to_csv(path_or_buf='pandasdata.csv', index=False)