In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

### **WEEK 4 :**

LAB 16 :

In [28]:
path = "/content/day16_encoding.csv"
df = pd.read_csv(path)

le = LabelEncoder()
df["city_label"] = le.fit_transform(df["city"])
one_hot = pd.get_dummies(df, columns=["city"], prefix="city", drop_first=True)
print("Label classes:", list(le.classes_))
print(df[["city", "city_label"]].head())
print(one_hot.head())

Label classes: ['CHI', 'LA', 'NY', 'SF']
  city  city_label
0   NY           2
1   SF           3
2   LA           1
3  CHI           0
  membership  purchases  city_label  city_LA  city_NY  city_SF
0     bronze          3           2    False     True    False
1     silver          5           3    False    False     True
2       gold          1           1     True    False    False
3     bronze          2           0    False    False    False


LAB 17 :

In [29]:
path = "/content/day17_scaling.csv"
df = pd.read_csv(path)

num_cols = ["CRIM", "RM"]

mm = MinMaxScaler()
std = StandardScaler()
rob = RobustScaler()

mm_vals = mm.fit_transform(df[num_cols])
std_vals = std.fit_transform(df[num_cols])
rob_vals = rob.fit_transform(df[num_cols])

print("MinMax mean:", mm_vals.mean(axis=0))
print("Standard mean/std:", std_vals.mean(axis=0), std_vals.std(axis=0))
print("Robust median approx:", pd.DataFrame(rob_vals).median().tolist())


MinMax mean: [0.23979975 0.504     ]
Standard mean/std: [4.44089210e-17 2.22044605e-16] [1. 1.]
Robust median approx: [0.0, 0.0]


LAB 18 :

In [30]:
path = "/content/day18_binning.csv"
df = pd.read_csv(path)

width_bins = pd.cut(df["age"], bins=4)

quant_bins = pd.qcut(df["age"], q=4)

edges = [0, 13, 18, 65, 120]
labels = ["Child", "Teen", "Adult", "Senior"]
domain_bins = pd.cut(df["age"], bins=edges, labels=labels, right=False)

print(width_bins.value_counts())
print(quant_bins.value_counts())
print(domain_bins.value_counts())

age
(4.92, 25.0]    4
(45.0, 65.0]    2
(65.0, 85.0]    2
(25.0, 45.0]    1
Name: count, dtype: int64
age
(4.999, 16.0]    3
(16.0, 35.0]     2
(35.0, 60.0]     2
(60.0, 85.0]     2
Name: count, dtype: int64
age
Adult     4
Child     2
Senior    2
Teen      1
Name: count, dtype: int64


LAB 19 :

In [31]:
path = "/content/day19_transform.csv"
df = pd.read_csv(path)

raw = df["spend"]
log1p = np.log1p(raw)
sqrt = np.sqrt(raw)
pt = PowerTransformer(method="yeo-johnson", standardize=False)
yeo = pt.fit_transform(raw.to_frame())[:, 0]

print("Raw mean/std:", raw.mean(), raw.std())
print("Log1p mean/std:", log1p.mean(), log1p.std())
print("Sqrt mean/std:", sqrt.mean(), sqrt.std())
print("Yeo-Johnson mean/std:", yeo.mean(), yeo.std())

Raw mean/std: 3424.13525 2304.8601892454953
Log1p mean/std: 7.974718563826439 0.5565566404124079
Sqrt mean/std: 56.08832287716261 16.722244290095944
Yeo-Johnson mean/std: 4.059833955625494 0.11782510880007192


LAB 20 :

In [32]:
path = "/content/day20_integration.csv"
df = pd.read_csv(path)

baseline = df[["pages_viewed", "session_minutes", "basket_value"]].copy()

eng = df.copy()
eng["pages_per_min"] = eng["pages_viewed"] / (eng["session_minutes"] + 1e-6)
eng["basket_log1p"] = np.log1p(eng["basket_value"])

eng = pd.get_dummies(eng, columns=["city", "device_type"], drop_first=True)

num_cols = ["pages_viewed", "session_minutes", "basket_value", "pages_per_min", "basket_log1p"]
scaler = StandardScaler()
eng[num_cols] = scaler.fit_transform(eng[num_cols])

print("Baseline cols:", baseline.columns.tolist())
print("Engineered cols:", eng.columns.tolist())
print(eng.head())

Baseline cols: ['pages_viewed', 'session_minutes', 'basket_value']
Engineered cols: ['pages_viewed', 'session_minutes', 'basket_value', 'pages_per_min', 'basket_log1p', 'city_NY', 'city_SF', 'device_type_mobile']
   pages_viewed  session_minutes  basket_value  pages_per_min  basket_log1p  \
0     -0.412393        -0.115470     -0.606092      -0.980194     -0.362138   
1      1.237179         1.270171      0.404061       0.700141      0.599897   
2     -1.402136        -1.501111     -1.212183      -0.980198     -1.438105   
3      0.577350         0.346410      1.414214       1.260251      1.200347   

   city_NY  city_SF  device_type_mobile  
0     True    False                True  
1    False     True               False  
2    False    False                True  
3     True    False               False  


### **WEEK 5 :**

LAB 21 :

In [33]:
path= "/content/day21_housing.csv"
df = pd.read_csv(path)

df["price_per_sqft"] = df["price"] / df["sqft"].replace({0: np.nan})
df["price_per_sqft"] = df["price_per_sqft"].fillna(df["price_per_sqft"].median())

df["bedrooms_per_sqft"] = df["bedrooms"] / df["sqft"].replace({0: np.nan})
df["bedrooms_per_sqft"] = df["bedrooms_per_sqft"].fillna(0.0)

df["bathrooms_per_bedroom"] = df["bathrooms"] / df["bedrooms"].replace({0: np.nan})
df["bathrooms_per_bedroom"] = df["bathrooms_per_bedroom"].fillna(df["bathrooms_per_bedroom"].median())

print(df[["price", "sqft", "price_per_sqft", "bedrooms_per_sqft", "bathrooms_per_bedroom"]].head())

    price  sqft  price_per_sqft  bedrooms_per_sqft  bathrooms_per_bedroom
0  300000  1500      200.000000           0.002000               0.666667
1  450000  2000      225.000000           0.002000               0.750000
2  275000  1200      229.166667           0.002500               0.666667
3  600000  2400      250.000000           0.002083               0.800000


LAB 22 :

In [34]:
path = "/content/day22_interactions.csv"
df = pd.read_csv(path)

(df["feature1_x_feature2"]) = df["feature1"] * df["feature2"]
(df["feature1_plus_feature2"]) = df["feature1"] + df["feature2"]
(df["high_f1_and_flag"]) = ((df["feature1"] > df["feature1"].median()) & (df["flag"] == 1)).astype(int)

corrs = df[["feature1", "feature2", "feature1_x_feature2", "feature1_plus_feature2", "high_f1_and_flag", "target"]].corr()["target"]
print(corrs.sort_values(ascending=False))

target                    1.000000
feature2                  0.800000
high_f1_and_flag          0.774597
feature1_plus_feature2    0.698430
feature1_x_feature2       0.661055
feature1                  0.600000
Name: target, dtype: float64


LAB 23 :

In [35]:
path = "/content/day23_poly.csv"
df = pd.read_csv(path)

X = df[["x"]].values
y = df["y"].values

for degree in [1, 2, 5]:
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly.fit_transform(X)
    model = LinearRegression().fit(X_poly, y)
    score = model.score(X_poly, y)
    print(f"Degree {degree} R^2:", score)

Degree 1 R^2: 0.5825014204273337
Degree 2 R^2: 0.9946427095161742
Degree 5 R^2: 0.9963962344548183


LAB 24 :

In [36]:
path = "/content/day24_selection.csv"
df = pd.read_csv(path)

y = df["target"]
X = df.drop(columns=["target"])

selector = VarianceThreshold(threshold=0.01)
X_var = selector.fit_transform(X)
kept_cols = X.columns[selector.get_support()]
X_var_df = pd.DataFrame(X_var, columns=kept_cols)
corr = X_var_df.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
removed = [c for c in upper.columns if any(upper[c] > 0.9)]
X_reduced = X_var_df.drop(columns=removed)

print("Kept after variance:", list(kept_cols))
print("Removed due to correlation:", removed)
print("Final columns:", X_reduced.columns.tolist())

Kept after variance: ['x1', 'x2']
Removed due to correlation: ['x2']
Final columns: ['x1']


LAB 25 :

In [37]:
path = "/content/day25_project.csv"
df = pd.read_csv(path)

eng = df.copy()
eng["price_per_sqft"] = eng["price"] / eng["sqft"].replace({0: np.nan})
eng["price_per_sqft"] = eng["price_per_sqft"].fillna(eng["price_per_sqft"].median())
eng["rooms_per_sqft"] = eng["rooms"] / eng["sqft"].replace({0: np.nan})
eng["rooms_per_sqft"] = eng["rooms_per_sqft"].fillna(0.0)
eng["price_x_rooms"] = eng["price"] * eng["rooms"]
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_vals = poly.fit_transform(eng[["price_per_sqft"]])
poly_cols = poly.get_feature_names_out(["price_per_sqft"])
poly_df = pd.DataFrame(poly_vals, columns=poly_cols)
eng = pd.concat([eng, poly_df], axis=1)

eng.to_csv("day25_engineered.csv", index=False)
print(eng.head())

    price  sqft  rooms  price_per_sqft  rooms_per_sqft  price_x_rooms  \
0  300000  1500      5      200.000000        0.003333        1500000   
1  450000  2000      6      225.000000        0.003000        2700000   
2  275000  1200      4      229.166667        0.003333        1100000   
3  600000  2400      7      250.000000        0.002917        4200000   

   price_per_sqft  price_per_sqft^2  
0      200.000000      40000.000000  
1      225.000000      50625.000000  
2      229.166667      52517.361111  
3      250.000000      62500.000000  
