#Day 6: Missing Values (NaN)
Pandas Course

In [1]:
import pandas as pd
import numpy as np

data = {
    "name": ["Ahmed", "Sara", "Omar", "Lina"],
    "math": [90, np.nan, 85, 70],
    "english": [88, 92, np.nan, 75],
}
df = pd.DataFrame(data)
df

Unnamed: 0,name,math,english
0,Ahmed,90.0,88.0
1,Sara,,92.0
2,Omar,85.0,
3,Lina,70.0,75.0


In [3]:
df.isna().sum()

name       0
math       1
english    1
dtype: int64

In [14]:
data = {
    "name": ["  ahmed  ", "Sara", "Omar", "lina"],
    "city": ["cairo", "Giza", "alex", "Cairo"]
}
df = pd.DataFrame(data)
df

Unnamed: 0,name,city
0,ahmed,cairo
1,Sara,Giza
2,Omar,alex
3,lina,Cairo


In [12]:
df["name"].str.strip()

0    ahmed
1     Sara
2     Omar
3     lina
Name: name, dtype: str

In [15]:
df["name"].str.lower()

0      ahmed  
1         sara
2         omar
3         lina
Name: name, dtype: str

In [16]:
df["name"].str.upper()

0      AHMED  
1         SARA
2         OMAR
3         LINA
Name: name, dtype: str

In [17]:
df["name"].str.contains("a")

0    True
1    True
2    True
3    True
Name: name, dtype: bool

In [18]:
df["city"] = df["city"].str.lower()
df

Unnamed: 0,name,city
0,ahmed,cairo
1,Sara,giza
2,Omar,alex
3,lina,cairo


In [19]:
df["name"] = df["name"].str.strip().str.lower()
df

Unnamed: 0,name,city
0,ahmed,cairo
1,sara,giza
2,omar,alex
3,lina,cairo


#Day 8: Filtering & Conditions in Pandas

In [1]:
import pandas as pd
data = {
    "name": ["Ahmed", "Sara", "Omar", "Lina"],
    "math": [90, 85, 85, 70],
    "english": [88, 92, 85, 75],
}
df = pd.DataFrame(data)
df

Unnamed: 0,name,math,english
0,Ahmed,90,88
1,Sara,85,92
2,Omar,85,85
3,Lina,70,75


In [2]:
df[df["math"] > 80]

Unnamed: 0,name,math,english
0,Ahmed,90,88
1,Sara,85,92
2,Omar,85,85


In [3]:
df[df["name"] =="Ahmed"]

Unnamed: 0,name,math,english
0,Ahmed,90,88


In [4]:
df[(df["math"] > 80) & (df["english"] > 85)]

Unnamed: 0,name,math,english
0,Ahmed,90,88
1,Sara,85,92


In [5]:
df[(df["math"] > 70) | (df["english"] > 80)]

Unnamed: 0,name,math,english
0,Ahmed,90,88
1,Sara,85,92
2,Omar,85,85


In [6]:
df[df["math"].isin(["Ahmed", "Lina"])]

Unnamed: 0,name,math,english


In [7]:
df[df["math"] > 70].sort_values(by="math",ascending=False)

Unnamed: 0,name,math,english
0,Ahmed,90,88
1,Sara,85,92
2,Omar,85,85


In [8]:
df[df["english"] >=80]

Unnamed: 0,name,math,english
0,Ahmed,90,88
1,Sara,85,92
2,Omar,85,85


In [13]:

df[df["english"] >=80].sort_values(by="english",ascending=False)

Unnamed: 0,name,math,english
1,Sara,85,92
0,Ahmed,90,88
2,Omar,85,85


In [14]:
df.sort_values(by="english",ascending=False)

Unnamed: 0,name,math,english
1,Sara,85,92
0,Ahmed,90,88
2,Omar,85,85
3,Lina,70,75


#Day 9: GroupBy

In [17]:
import pandas as pd
data = {
    "student": ["Ahmed", "Sara", "Omar", "Lina", "Mona"],
    "department": ["CS", "IT", "CS", "IT", "CS"],
    "score": [90, 85, 88, 92, 95]
}
df = pd.DataFrame(data)
df

Unnamed: 0,student,department,score
0,Ahmed,CS,90
1,Sara,IT,85
2,Omar,CS,88
3,Lina,IT,92
4,Mona,CS,95


In [18]:
df.groupby("department")["score"].mean()

department
CS    91.0
IT    88.5
Name: score, dtype: float64

In [19]:
df.groupby("department")["score"].agg(['mean', 'max', 'min','count'])

Unnamed: 0_level_0,mean,max,min,count
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CS,91.0,95,88,3
IT,88.5,92,85,2


In [20]:
df.groupby(["department", "student"])["score"].mean()

department  student
CS          Ahmed      90.0
            Mona       95.0
            Omar       88.0
IT          Lina       92.0
            Sara       85.0
Name: score, dtype: float64

In [22]:
result = df.groupby("department")["score"].mean().reset_index()
result

Unnamed: 0,department,score
0,CS,91.0
1,IT,88.5


Day 10: Handling Missing Data(NaN) in pandas

In [4]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    "name": ["Ahmed", "Sara", "Omar"],
    "math": [90, np.nan, 85],
    "english": [88, 92, np.nan],
})
df

Unnamed: 0,name,math,english
0,Ahmed,90.0,88.0
1,Sara,,92.0
2,Omar,85.0,


In [2]:
df.isna().sum()

name       0
math       1
english    1
dtype: int64

In [17]:
df

Unnamed: 0,name,math,english
0,Ahmed,90.0,88.0
1,Sara,,92.0
2,Omar,85.0,


remove rows that have NaN = keep the columns that have not NaN

In [18]:
df.dropna(axis=0)

Unnamed: 0,name,math,english
0,Ahmed,90.0,88.0


In [16]:
df

Unnamed: 0,name,math,english
0,Ahmed,90.0,88.0
1,Sara,,92.0
2,Omar,85.0,


In [5]:
df.dropna(axis=1)

Unnamed: 0,name
0,Ahmed
1,Sara
2,Omar


In [6]:
df.fillna(0) 

Unnamed: 0,name,math,english
0,Ahmed,90.0,88.0
1,Sara,0.0,92.0
2,Omar,85.0,0.0


In [8]:
df

Unnamed: 0,name,math,english
0,Ahmed,90.0,88.0
1,Sara,,92.0
2,Omar,85.0,


In [7]:
df["math"].fillna(df["math"].mean())

0    90.0
1    87.5
2    85.0
Name: math, dtype: float64

In [10]:
df.fillna(df.mean(numeric_only=True))

Unnamed: 0,name,math,english
0,Ahmed,90.0,88.0
1,Sara,87.5,92.0
2,Omar,85.0,90.0


calculate the mean of numerical columns only

In [20]:
clean_df = df.fillna(df.mean(numeric_only=True))
clean_df.mean(numeric_only=True)

math       87.5
english    90.0
dtype: float64