In [17]:
import numpy as np
import pandas as pd


np.random.seed(42)
n_rows = 100

data = {
    'age': np.random.randint(15, 50, size=n_rows),
    'salary': np.random.randint(13000, 46000, size=n_rows),
    'increment': np.random.choice([6, 7, 8, 9], size=n_rows)
}

df = pd.DataFrame(data)
print(df.head())


   age  salary  increment
0   43   19910          8
1   29   38446          8
2   22   13206          6
3   35   34518          6
4   33   35361          8


In [18]:
df.loc[np.random.choice(df.index, 4, replace=False), "salary"] = np.nan
df.loc[np.random.choice(df.index,7, replace=False), "Age"] = np.nan
print("Synthetic Dataset with NaN values:\n")
print(df.head(12))

Synthetic Dataset with NaN values:

    age   salary  increment  Age
0    43  19910.0          8  NaN
1    29  38446.0          8  NaN
2    22  13206.0          6  NaN
3    35  34518.0          6  NaN
4    33  35361.0          8  NaN
5    37  36419.0          8  NaN
6    25  35403.0          9  NaN
7    25  31141.0          9  NaN
8    38  27820.0          8  NaN
9    38  19374.0          9  NaN
10   17  14678.0          9  NaN
11   36  29198.0          7  NaN


In [19]:
mean_salary = df["salary"].mean(skipna=True)   # default skipna=True
median_salary = df["salary"].median(skipna=True)
mask = df["salary"].notna() & df["Age"].notna()
age_weighted_mean = (
    (df.loc[mask, "salary"] * df.loc[mask, "Age"]).sum()
    / df.loc[mask, "Age"].sum()
)

  (df.loc[mask, "salary"] * df.loc[mask, "Age"]).sum()


In [None]:
print("\nResults:")
print(f"Mean salary = {mean_salary:.2f}")
print(f"Median Income = {median_salary:.2f}")
print(f"Age-Weighted Mean salary = {age_weighted_mean:.2f}")


Results:
Mean salary = 28869.01
Median Income = 30581.00
Age-Weighted Mean salary = nan


:Q Explain when weighted mean is preferable.
Ans:Use weighted mean when some data should matter more than others.


Problem 2: Standardize income (z-score). Report how many incomes are outliers using rule |z|
> 3. Handle NaNs correctly (do not drop entire rows unnecessarily).


In [24]:
# Z-score standardization
mean_salary = df["salary"].mean(skipna=True)
std_salary = df["salary"].std(skipna=True)
# Compute z-scores for Income
df["salary_z"] = (df["salary"] - mean_salary) / std_salary
# Identify outliers using |z| > 3
outliers = df[(df["salary_z"].abs() > 3)]

In [25]:
print("\nStandardized Income (z-scores):\n")
print(df[[ "salary", "salary_z"]].head(12))

print(f"\nNumber of outliers = {outliers.shape[0]}")
print("\nOutlier Rows:\n")
print(outliers)


Standardized Income (z-scores):

     salary  salary_z
0   19910.0 -1.036510
1   38446.0  0.812744
2   13206.0 -1.705338
3   34518.0  0.420865
4   35361.0  0.504967
5   36419.0  0.610519
6   35403.0  0.509158
7   31141.0  0.083957
8   27820.0 -0.247365
9   19374.0 -1.089984
10  14678.0 -1.558483
11  29198.0 -0.109888

Number of outliers = 0

Outlier Rows:

Empty DataFrame
Columns: [age, salary, increment, Age, Age_Bin, salary_z]
Index: []


Problem 3: Create age bins: [18-25), [25-35), [35-45), [45-60) and compute for each bin:
● count of observations,
● mean income,
● median score.
Show result as a tidy DataFrame sorted by age bin.


In [26]:
bins = [13, 25, 34, 45, 55]
labels = ["15-25", "25-34", "34-45", "45-55"]

df["Age_Bin"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)
result = df.groupby("Age_Bin").agg(
    Count=("salary", "count"),
    Mean_Income=("salary", "mean"),
    Median_Income=("salary", "median")
).reset_index()
result = result.sort_values("Age_Bin").reset_index(drop=True)

  result = df.groupby("Age_Bin").agg(


In [27]:
print("\nResult by Age Bin:\n")
print(result)


Result by Age Bin:

  Age_Bin  Count  Mean_Income  Median_Income
0   15-25      0          NaN            NaN
1   25-34      0          NaN            NaN
2   34-45      0          NaN            NaN
3   45-55      0          NaN            NaN


Problem 4: Create an array it cannot be of 1 Dimension. And then showcase the operation for
the following:
● Shape and Resize → shape, size, Transpose, Flatten
● Showcasing negative indexing and display error while doing slicing
● Arithmetic Operations → Broadcasting, Dot Product
● Linear Algebra → Determinant, Inverse
                                                                                       

In [28]:
arr = np.array([[4, 12, 6],
                [6, 8, 2],
                [5, 9, 1]], dtype=float)
print("Original Array:\n", arr)

Original Array:
 [[ 4. 12.  6.]
 [ 6.  8.  2.]
 [ 5.  9.  1.]]


In [29]:
# Shape and Resize
print("\nShape of array:", arr.shape)
print("Size of array:", arr.size)
print("Transpose of array:\n", arr.T)
print("Flattened array:\n", arr.flatten())


Shape of array: (3, 3)
Size of array: 9
Transpose of array:
 [[ 4.  6.  5.]
 [12.  8.  9.]
 [ 6.  2.  1.]]
Flattened array:
 [ 4. 12.  6.  6.  8.  2.  5.  9.  1.]


In [30]:
# Negative Indexing
print("\nLast row using negative indexing:", arr[-1])
print("Last element using negative indexing:", arr[-1, -1])


Last row using negative indexing: [5. 9. 1.]
Last element using negative indexing: 1.0


In [31]:
# Error in slicing
try:
    print(arr[-7])
except IndexError as e:
    print("\nIndexError:", e)


IndexError: index -7 is out of bounds for axis 0 with size 3


In [32]:
# Arithmetic Operations
# Broadcasting (adding scalar)
print("\nBroadcasting (arr + 5):\n", arr + 5)

# Dot product (matrix multiplication)
dot_product = np.dot(arr, arr)
print("\nDot Product (arr x arr):\n", dot_product)

# Linear Algebra Operations
det = np.linalg.det(arr)
print("\nDeterminant:", det)

# Inverse (only if determinant != 0)
if det != 0:
    inv = np.linalg.inv(arr)
    print("Inverse:\n", inv)
else:
    print("Matrix is singular, inverse does not exist")


Broadcasting (arr + 5):
 [[ 9. 17. 11.]
 [11. 13.  7.]
 [10. 14.  6.]]

Dot Product (arr x arr):
 [[118. 198.  54.]
 [ 82. 154.  54.]
 [ 79. 141.  49.]]

Determinant: 92.00000000000001
Inverse:
 [[-0.10869565  0.45652174 -0.26086957]
 [ 0.04347826 -0.2826087   0.30434783]
 [ 0.15217391  0.26086957 -0.43478261]]
