**Drop irelevant columns**

In [54]:
import pandas as pd

In [55]:
df = pd.read_csv("students_large_dataset.csv")
print(df.head())

   student_id     name department  semester  cgpa  attendance_percent
0        1001   Karima        ICE         5  2.49                  68
1        1002   Karima         CE         2   NaN                  62
2        1003   Arafat        ICE         4   NaN                  98
3        1004   Arafat         CE         4   NaN                  94
4        1005  Sumaiya        CSE         8  3.18                  60


In [36]:
# drop attendance_percent column
df = df.drop(columns=["attendance_percent"])
print(df.head())

   student_id     name department  semester  cgpa
0        1001   Karima        ICE         5  2.49
1        1002   Karima         CE         2   NaN
2        1003   Arafat        ICE         4   NaN
3        1004   Arafat         CE         4   NaN
4        1005  Sumaiya        CSE         8  3.18


**Handle Missing Values**

In [None]:
"""
op1 : delete rows with missing values
op2 : fill missing values with a specific value
"""

In [38]:
# op1
df = df.dropna(subset=["cgpa"])
print(df.head())

   student_id     name department  semester  cgpa
0        1001   Karima        ICE         5  2.49
4        1005  Sumaiya        CSE         8  3.18
5        1006    Tanim         ME         6  2.56
6        1007    Sadia        ICE         2  2.76
7        1008   Nayeem         CE         5  3.61


In [None]:
# op2
df["cgpa"] = df["cgpa"].fillna("None")
print(df.head())

   student_id     name department  semester  cgpa
0        1001   Karima        ICE         5  2.49
1        1002   Karima         CE         2  None
2        1003   Arafat        ICE         4  None
3        1004   Arafat         CE         4  None
4        1005  Sumaiya        CSE         8  3.18


**Fix inconsistent data**

In [57]:
df["name"] = df["name"].replace({"Karima" : "abcs"})
print(df.head())

   student_id     name department  semester  cgpa  attendance_percent
0        1001     abcs        ICE         5  2.49                  68
1        1002     abcs         CE         2   NaN                  62
2        1003   Arafat        ICE         4   NaN                  98
3        1004   Arafat         CE         4   NaN                  94
4        1005  Sumaiya        CSE         8  3.18                  60


**Standardize text**

In [None]:
"""
Standardize text means to convert all the text to the same case (lowercase or uppercase).
"""

In [None]:

df["name"] = df["name"].str.lower()
print(df.head())

   student_id     name department  semester  cgpa  attendance_percent
0        1001     abcs        ICE         5  2.49                  68
1        1002     abcs         CE         2   NaN                  62
2        1003   arafat        ICE         4   NaN                  98
3        1004   arafat         CE         4   NaN                  94
4        1005  sumaiya        CSE         8  3.18                  60


**Fix data types**

In [None]:
"""
Fix data types: somtimes the data types of the columns are not correct.
For example, the column "age" should be of type int, but it is of type object.
also sometimes pandas retuen for string "True" and "False" instead of boolean values.
"""

In [80]:
df = pd.read_csv("small_dataset.csv")
print(df.head())

   student_id     name department  semester  cgpa  attendance_percent  hostel
0        1001   Karima        ICE         5  2.49                  68   False
1        1002   Karima         CE         2  3.18                  62    True
2        1003   Arafat        ICE         4  2.47                  98   False
3        1004   Arafat         CE         4  3.43                  94    True
4        1005  Sumaiya        CSE         8  3.18                  60   False


In [74]:
df.dtypes

student_id              int64
name                   object
department             object
semester                int64
cgpa                  float64
attendance_percent      int64
hostel                   bool
dtype: object

In [None]:
"""
pandas may auto-detect boolean columns, but this is unreliable for real-world CSVs.
To ensure correctness, fix boolean-like data explicitly using logic-based mapping.
"""

In [75]:
# fix data type ...
df["hostel"] = df["hostel"].map({"True" : True, "False" : False})

**Remove duplicate**

In [87]:
df = pd.read_csv("small_dataset.csv")
print("Before removing duplicates:")
print(df.head())
print()
df = df.drop_duplicates(["name"])
print("After removing duplicates:")
print(df.head())

Before removing duplicates:
   student_id     name department  semester  cgpa  attendance_percent  hostel
0        1001   Karima        ICE         5  2.49                  68   False
1        1002   Karima         CE         2  3.18                  62    True
2        1003   Arafat        ICE         4  2.47                  98   False
3        1004   Arafat         CE         4  3.43                  94    True
4        1005  Sumaiya        CSE         8  3.18                  60   False

After removing duplicates:
   student_id     name department  semester  cgpa  attendance_percent  hostel
0        1001   Karima        ICE         5  2.49                  68   False
2        1003   Arafat        ICE         4  2.47                  98   False
4        1005  Sumaiya        CSE         8  3.18                  60   False
5        1006     Rafi        ICE         1  3.65                  85    True
6        1007   Nabila        EEE         3  3.02                  72   False
