<a href="https://colab.research.google.com/github/hannahph4m/bus4-118/blob/main/Data_Manipulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# Sample raw dataset: each column contains exactly one missing entry, and
# 'Score' is deliberately stored as text so it can be converted further down.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', None, 'Ethan'],
    Age=[23, np.nan, 22, 25, 24],
    Score=['85', '90', None, '88', '92'],
    City=['New York', 'Los Angeles', None, 'Chicago', 'Houston'],
)

df = pd.DataFrame(data=data)

print(" Raw Data:", df, sep="\n")

# ---- 1. Handling Missing Values ----

# Strategy A: discard every row that has at least one missing cell.
df_dropped = df.dropna(axis=0, how='any')
print("\n Dropped Rows with Any Missing Values:", df_dropped, sep="\n")

# Strategy B: keep every row and substitute a per-column default instead.
# 'Age' falls back to the mean of the known ages; 'Score' gets the *string*
# '0' because the column still holds text at this point.
fill_defaults = {
    'Name': 'Unknown',
    'Age': df['Age'].mean(),
    'Score': '0',
    'City': 'Unknown',
}
df_filled = df.fillna(value=fill_defaults)
print("\n Filled Missing Values:", df_filled, sep="\n")

# ---- 2. Data Type Conversion ----

# Cast the textual 'Score' column to integers; other columns are untouched.
df_filled = df_filled.astype({'Score': int})

print("\n After Converting 'Score' to Integer:", df_filled.dtypes, sep="\n")

# ---- 3. Data Filtering ----

# Keep only the rows whose Age is strictly greater than 23.
older_than_23 = df_filled['Age'].gt(23)
filtered_df = df_filled.loc[older_than_23]
print("\n Filtered Rows (Age > 23):", filtered_df, sep="\n")

# ---- 4. Data Transformation ----

# Append two derived columns (in this order):
#   City_Upper  - the city name in upper case
#   Performance - the product Age * Score
df_filled = df_filled.assign(
    City_Upper=lambda frame: frame['City'].str.upper(),
    Performance=lambda frame: frame['Age'] * frame['Score'],
)

print("\n Transformed Data:", df_filled, sep="\n")

 Raw Data:
      Name   Age Score         City
0    Alice  23.0    85     New York
1      Bob   NaN    90  Los Angeles
2  Charlie  22.0  None         None
3     None  25.0    88      Chicago
4    Ethan  24.0    92      Houston

 Dropped Rows with Any Missing Values:
    Name   Age Score      City
0  Alice  23.0    85  New York
4  Ethan  24.0    92   Houston

 Filled Missing Values:
      Name   Age Score         City
0    Alice  23.0    85     New York
1      Bob  23.5    90  Los Angeles
2  Charlie  22.0     0      Unknown
3  Unknown  25.0    88      Chicago
4    Ethan  24.0    92      Houston

 After Converting 'Score' to Integer:
Name      object
Age      float64
Score      int64
City      object
dtype: object

 Filtered Rows (Age > 23):
      Name   Age  Score         City
1      Bob  23.5     90  Los Angeles
3  Unknown  25.0     88      Chicago
4    Ethan  24.0     92      Houston

 Transformed Data:
      Name   Age  Score         City   City_Upper  Performance
0    Alice  23.0   