In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [None]:
path = '/Users/thananpornsethjinda/Desktop/internship/ml/regression/50_startups/50_Startups.csv'

df = pd.read_csv(path)

array(['New York', 'California', 'Florida'], dtype=object)

#### Familiarising with the data set 

In [None]:
df.State.unique() # 3 categorical variables here that we have to encode using hot-one encoding

array(['New York', 'California', 'Florida'], dtype=object)

In [None]:
df.describe() # numerical variables are not standardised 

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [None]:
df.info() # so we have 4 numerical variables and 1 categorical variable

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


#### data cleaning pipeline

In [None]:
## data cleaning frame work 

def clean_dataframe(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    df = df.copy()

    # log helper
    def log(msg):
        if verbose:
            print(f"[INFO] {msg}")

    # 1. standardize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
    log("Standardized column names.")

    # 2. remove exact duplicates
    dup_count = df.duplicated().sum()
    if dup_count > 0:
        df.drop_duplicates(inplace=True)
        log(f"Removed {dup_count} duplicate rows.")

    # 3. trim and lowercase all string (object) values
    for col in df.select_dtypes(include='object'):
        df[col] = df[col].astype(str).str.strip().str.lower()
    log("Standardized string columns (lowercase + trimmed).")

    # 4. detect missing values (including blanks and placeholders)
    placeholder_values = ['n/a', 'na', '--', '-', 'none', 'null', '', 'nan']
    df.replace(placeholder_values, np.nan, inplace=True)
    null_report = df.isnull().sum()
    null_report = null_report[null_report > 0]
    if not null_report.empty:
        log(f"Missing values found in columns:\n{null_report}")

    # 5. flag constant columns
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    if constant_cols:
        log(f"Constant columns (consider removing): {constant_cols}")

    # 6. flag high cardinality categorical columns
    high_card_cols = [col for col in df.select_dtypes(include='object') if df[col].nunique() > 100]
    if high_card_cols:
        log(f"High-cardinality columns (consider encoding strategies): {high_card_cols}")

    # 7. detect numeric outliers using IQR
    num_cols = df.select_dtypes(include=np.number).columns
    outlier_report = {}
    for col in num_cols:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        outliers = df[(df[col] < lower) | (df[col] > upper)][col].count()
        if outliers > 0:
            outlier_report[col] = outliers
    if outlier_report:
        log(f"Potential numeric outliers detected:\n{outlier_report}")
    
    # 8. convert applicable columns to category
    for col in df.select_dtypes(include='object'):
        n_unique = df[col].nunique()
        if n_unique < len(df) * 0.05:
            df[col] = df[col].astype('category')
    log("Converted suitable object columns to category dtype.")

    log("Data cleaning complete.")
    return df

## study this function later 

In [14]:
cleaned_df = clean_dataframe(df)

[INFO] Standardized column names.
[INFO] Standardized string columns (lowercase + trimmed).
[INFO] Potential numeric outliers detected:
{'profit': np.int64(1)}
[INFO] Converted suitable object columns to category dtype.
[INFO] Data cleaning complete.


#### standardisation of numerical values (feature scaling)

In [67]:
from sklearn.preprocessing import StandardScaler

numeric_columns = ['r&d_spend', 'administration', 'marketing_spend']

def featureStandardiser(numeric_columns: pd.DataFrame) -> np.ndarray:
    """function to standardise values in numeric columns"""

    scaler = StandardScaler()

    scaler.fit(cleaned_df[numeric_columns]) # we do this so that the scalar knows how much to scale by

    scaled_inputs = scaler.transform(cleaned_df[numeric_columns])

    return scaled_inputs


In [69]:
scaled_inputs = featureStandardiser(numeric_columns)

#### one-hot encoding

In [63]:
from sklearn.preprocessing import OneHotEncoder

def oneHotEncoder(column: pd.DataFrame) -> np.ndarray: 
    """Function that encodes the input columns """
    enc = OneHotEncoder()
    enc.fit(column)
    print(f"Categories to be encoded are {enc.categories_}")
    one_hot = enc.transform(column).toarray()
    return one_hot


In [65]:
column_to_be_encoded = cleaned_df[['state']]

encoded = oneHotEncoder(column_to_be_encoded)

cleaned_df[['california', 'florida', 'new_york']] = encoded # why 2 brackets here 

Categories to be encoded are [array(['california', 'florida', 'new york'], dtype=object)]


In [66]:
cleaned_df

Unnamed: 0,r&d_spend,administration,marketing_spend,state,profit,california,florida,new_york
0,165349.2,136897.8,471784.1,new york,192261.83,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,california,191792.06,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,florida,191050.39,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,new york,182901.99,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,florida,166187.94,0.0,1.0,0.0
5,131876.9,99814.71,362861.36,new york,156991.12,0.0,0.0,1.0
6,134615.46,147198.87,127716.82,california,156122.51,1.0,0.0,0.0
7,130298.13,145530.06,323876.68,florida,155752.6,0.0,1.0,0.0
8,120542.52,148718.95,311613.29,new york,152211.77,0.0,0.0,1.0
9,123334.88,108679.17,304981.62,california,149759.96,1.0,0.0,0.0


#### multiple linear regression implementation using sklearn

In [None]:
## combine the numerical columns and categorical columns 

cat_cols = ['california', 'florida', 'new_york']
categorical_data = cleaned_df[cat_cols].values

In [None]:
inputs = np.concat((scaled_inputs, categorical_data), axis=1)
targets = cleaned_df.profit

array([[ 2.01641149e+00,  5.60752915e-01,  2.15394309e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 1.95586034e+00,  1.08280658e+00,  1.92360040e+00,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.75436374e+00, -7.28257028e-01,  1.62652767e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 1.55478369e+00, -9.63646307e-02,  1.42221024e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 1.50493720e+00, -1.07991935e+00,  1.28152771e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 1.27980001e+00, -7.76239071e-01,  1.25421046e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 1.34006641e+00,  9.32147208e-01, -6.88149930e-01,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.24505666e+00,  8.71980011e-01,  9.32185978e-01,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 1.03036886e+00,  9.86952101e-01,  8.308

In [42]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(inputs, targets)

In [None]:
# get model parameters

print(f"Model co-efficients are {model.coef_}")

print(f"Model intercepts are {model.intercept_}")

# understand how the order is obtained 

Model co-efficients are [36626.42825194  -748.99746924  3266.21519397   -52.30059124
   146.48820162   -94.18761038]
Model intercepts are 112015.56896403241


In [44]:
weights_df = pd.DataFrame({
    'feature': np.append(numeric_columns + cat_cols, 1),
    'weight': np.append(model.coef_, model.intercept_)
})
weights_df.sort_values('weight', ascending=False)

Unnamed: 0,feature,weight
6,1,112015.568964
0,r&d_spend,36626.428252
2,marketing_spend,3266.215194
4,florida,146.488202
3,california,-52.300591
5,new_york,-94.18761
1,administration,-748.997469


In [45]:
# model evaluation and metric

from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

predicted_y = model.predict(inputs)

actual_y = targets

print(f"The MSE is {mean_squared_error(actual_y, predicted_y)}")

print(f"The RMSE is {root_mean_squared_error(actual_y, predicted_y)}")

print(f"The R^2 is {r2_score(actual_y, predicted_y)}")

The MSE is 78406792.88803767
The RMSE is 8854.761029414496
The R^2 is 0.9507524843355148


#### multiple linear regression using the normal equations

$\beta = (X^T X)^{-1} X^T y$

where $X$ is the design matrix and $y$ is the column vector of all observations

In [None]:
column_of_ones = np.ones((50, 1))

feature_matrix = np.concatenate((column_of_ones, inputs), axis=1)

targets

XTXinverse = np.linalg.inv(np.matmul(feature_matrix.T, feature_matrix))

XTy = np.matmul(feature_matrix.T, targets)

params = np.matmul(XTXinverse, XTy)

# why dont they agree with each other

[ 1.31072000e+05  1.99777677e+04 -4.50999054e+02 -1.20366986e+04
 -5.24288000e+05 -1.31072000e+05  0.00000000e+00]


In [52]:

print(f"Model co-efficients are {model.coef_}")

print(f"Model intercepts are {model.intercept_}")

Model co-efficients are [36626.42825194  -748.99746924  3266.21519397   -52.30059124
   146.48820162   -94.18761038]
Model intercepts are 112015.56896403241
