In [1]:
import pandas as pd


# Toy employee records, one list per column; every list has five entries.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[25, 30, 35, 40, 22],
    City=['NY', 'LA', 'NY', 'Chicago', 'LA'],
    Salary=[50000, 60000, 75000, 80000, 45000],
)

# Each dict key becomes a column; rows get the default 0..4 integer index.
df = pd.DataFrame(data)

print(df)

      Name  Age     City  Salary
0    Alice   25       NY   50000
1      Bob   30       LA   60000
2  Charlie   35       NY   75000
3    David   40  Chicago   80000
4      Eva   22       LA   45000


In [3]:
# Quick overview of the frame: first rows, schema, then summary statistics.
print(df.head())
print(" ")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()
print(" ")
# describe() summarises the numeric columns only (Age and Salary here).
print(df.describe())

      Name  Age     City  Salary
0    Alice   25       NY   50000
1      Bob   30       LA   60000
2  Charlie   35       NY   75000
3    David   40  Chicago   80000
4      Eva   22       LA   45000
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   City    5 non-null      object
 3   Salary  5 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 292.0+ bytes
None
 
             Age        Salary
count   5.000000      5.000000
mean   30.400000  62000.000000
std     7.300685  15247.950682
min    22.000000  45000.000000
25%    25.000000  50000.000000
50%    30.000000  60000.000000
75%    35.000000  75000.000000
max    40.000000  80000.000000


In [7]:
# Column selection: a single label yields a Series, a list of labels a DataFrame.
# sep="\n" puts each heading on its own line. With the default sep=" " a space
# is inserted after the heading's newline, which indents the first line of the
# table and breaks its alignment with the rows below it.
print("Names:", df['Name'], sep="\n")
print("Name & Salary:", df[['Name', 'Salary']], sep="\n")

print(" ")
# Boolean-mask filtering: keep only the rows whose Age is greater than 30.
print("Age > 30:", df[df['Age'] > 30], sep="\n")


Names:
 0      Alice
1        Bob
2    Charlie
3      David
4        Eva
Name: Name, dtype: object
Name & Salary:
       Name  Salary
0    Alice   50000
1      Bob   60000
2  Charlie   75000
3    David   80000
4      Eva   45000
 
Age > 30:
       Name  Age     City  Salary
2  Charlie   35       NY   75000
3    David   40  Chicago   80000


In [10]:
# Mean salary for each city. sep="\n" starts the Series on its own line; the
# default sep=" " would indent its first line (the "City" index label) by one space.
print("Average Salary per City:", df.groupby('City')['Salary'].mean(), sep="\n")

Average Salary per City:
 City
Chicago    80000.0
LA         52500.0
NY         62500.0
Name: Salary, dtype: float64


In [11]:
# sort_values returns a new frame ordered by ascending Age; df itself is unchanged.
# sep="\n" keeps the column header aligned with the rows (the default sep=" "
# would shift the header line right by one space).
print("Sorted by Age:", df.sort_values(by='Age'), sep="\n")

Sorted by Age:
       Name  Age     City  Salary
4      Eva   22       LA   45000
0    Alice   25       NY   50000
1      Bob   30       LA   60000
2  Charlie   35       NY   75000
3    David   40  Chicago   80000


In [12]:
# Flat rate applied to every salary; named so the assumption is visible and
# can be changed in one place instead of being a bare 0.2 in the formula.
TAX_RATE = 0.2

# Adds (or overwrites) the 'Tax' column on df in place; re-running the cell
# recomputes the same values from 'Salary'.
df['Tax'] = df['Salary'] * TAX_RATE
print(df)

      Name  Age     City  Salary      Tax
0    Alice   25       NY   50000  10000.0
1      Bob   30       LA   60000  12000.0
2  Charlie   35       NY   75000  15000.0
3    David   40  Chicago   80000  16000.0
4      Eva   22       LA   45000   9000.0


In [14]:
# 8. Merging (tiny example)
# Bonus lookup table keyed by Name; only three of the five employees appear in it.
bonus = pd.DataFrame({
    'Name': ['Alice','Bob','Eva'],
    'Bonus': [5000,4000,3000]
})

# Left join keeps every row of df; employees without a bonus entry get NaN.
# validate='m:1' makes pandas raise a MergeError if a Name is ever duplicated
# in bonus, instead of silently duplicating that employee's row in the result.
merged = pd.merge(df, bonus, on='Name', how='left', validate='m:1')
# sep="\n" keeps the header aligned with the rows (the default sep=" " would
# shift the first line of the table right by one space).
print("Merged Data:", merged, sep="\n")

Merged Data:
       Name  Age     City  Salary      Tax   Bonus
0    Alice   25       NY   50000  10000.0  5000.0
1      Bob   30       LA   60000  12000.0  4000.0
2  Charlie   35       NY   75000  15000.0     NaN
3    David   40  Chicago   80000  16000.0     NaN
4      Eva   22       LA   45000   9000.0  3000.0


In [16]:
# Task 1: Create a DataFrame of students and add an 'Average' column.
data = dict(
    Name=['Aman', 'Bhavna', 'Chetan', 'Diya'],
    Marks1=[85, 92, 78, 88],
    Marks2=[90, 85, 80, 95],
    Marks3=[75, 90, 85, 82],
)

df_students = pd.DataFrame(data)

# Row-wise mean across the three marks columns gives each student's average.
mark_columns = ['Marks1', 'Marks2', 'Marks3']
df_students['Average'] = df_students[mark_columns].mean(axis='columns')

print("Student DataFrame with Average:", df_students, "-" * 30, sep="\n")

Student DataFrame with Average:
     Name  Marks1  Marks2  Marks3    Average
0    Aman      85      90      75  83.333333
1  Bhavna      92      85      90  89.000000
2  Chetan      78      80      85  81.000000
3    Diya      88      95      82  88.333333
------------------------------


In [17]:
# Titanic sample dataset hosted in the seaborn-data repository (needs network access).
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
df_titanic = pd.read_csv(url)

# Preview of the first rows, followed by a divider line.
print(
    "First 5 rows of the Titanic DataFrame:",
    df_titanic.head(),
    "-" * 30,
    sep="\n",
)

# (rows, columns) of the loaded frame, followed by a divider line.
print(
    "Shape of the Titanic DataFrame:",
    df_titanic.shape,
    "-" * 30,
    sep="\n",
)

First 5 rows of the Titanic DataFrame:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
------------------------------
Shape of the Titanic DataFrame:
(891, 15)
------------------------------


In [18]:
# Mean passenger age within each ticket class (pclass); the result is a
# Series named 'age' indexed by 'pclass'.
average_age_by_class = df_titanic['age'].groupby(df_titanic['pclass']).mean()

print(
    "Average age per passenger class:",
    average_age_by_class,
    "-" * 30,
    sep="\n",
)


Average age per passenger class:
pclass
1    38.233441
2    29.877630
3    25.140620
Name: age, dtype: float64
------------------------------


In [19]:
# Task 4: Count how many survived (survived=1) per gender.
# 'survived' is treated as a 0/1 flag, so summing it within each group counts
# the survivors. Filtering to survived == 1 before grouping would drop a gender
# with no survivors from the result entirely; summing reports it as 0 instead.
survived_by_gender = df_titanic.groupby('sex')['survived'].sum()

print("Number of survivors per gender:")
print(survived_by_gender)
print("-" * 30)


Number of survivors per gender:
sex
female    233
male      109
Name: survived, dtype: int64
------------------------------


In [None]:
# Median of the 'age' column, used as the replacement for missing ages.
median_age = df_titanic['age'].median()

# Assign the filled Series back to the column instead of filling in place on
# the selected column; this avoids the pandas FutureWarning.
df_titanic['age'] = df_titanic['age'].fillna(value=median_age)

# Remaining missing ages after the fill, followed by a divider line.
print(
    "Missing values in 'age' column after filling with median:",
    df_titanic['age'].isna().sum(),
    "-" * 30,
    sep="\n",
)

Missing values in 'age' column after filling with median:
0
------------------------------
