In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from google.colab import files
import io

In [None]:
# Upload the CSV file
print(" Please upload your file:")
uploaded = files.upload()
filename = list(uploaded.keys())[0]
print(f" Loading dataset: {filename}")
datasets = pd.read_csv(io.BytesIO(uploaded[filename]))
print(f"Shape: {datasets.shape}")
print("First 5 rows:")
print(datasets.head())

 Please upload your file:


Saving 50_Startups.csv to 50_Startups.csv
 Loading dataset: 50_Startups.csv
Shape: (50, 5)
First 5 rows:
   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94


In [None]:
# Prepare the features and target
print(" Preparing features and target variable...")
X = datasets.iloc[:, :-1].values
Y = datasets.iloc[:, -1].values  # Last column is the target
print(f"Features shape: {X.shape}")
print(f"Target shape: {Y.shape}")

 Preparing features and target variable...
Features shape: (50, 4)
Target shape: (50,)


In [None]:
# Encode the categorical data (State column)
print("Encoding categorical column (State)...")
# Select the 'State' column from the original DataFrame
X_state = datasets.iloc[:, 3].values

# Apply Label Encoding
labelencoder_X = LabelEncoder()
X_state_encoded = labelencoder_X.fit_transform(X_state.ravel())

# Apply One-Hot Encoding
onehotencoder = OneHotEncoder(sparse_output=False)
X_encoded = onehotencoder.fit_transform(X_state_encoded.reshape(-1, 1))

# Prepare the numerical features
X_numerical = datasets.iloc[:, :3].values

# Concatenate the encoded and numerical features
X = np.concatenate((X_encoded, X_numerical), axis=1)
print("Categorical feature encoded and merged. New features shape:", X.shape)

# Step 5: Avoid the Dummy Variable Trap
# Remove first dummy variable column
X = X[:, 1:]
print("Dummy variable trap avoided. Final features shape:", X.shape)

Encoding categorical column (State)...
Categorical feature encoded and merged. New features shape: (50, 6)
Dummy variable trap avoided. Final features shape: (50, 5)


In [None]:
# Avoid the Dummy Variable Trap
# Remove first dummy variable column
X = X[:, 1:]
print("Dummy variable trap avoided. New features shape:", X.shape)

Dummy variable trap avoided. New features shape: (50, 4)


In [None]:
# Split train and test sets
print("Splitting dataset into training and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0)
print((f"Training set size: {len(X_train)} samples\n"
       f"Test set size: {len(X_test)} samples"))

Splitting dataset into training and test sets...
Training set size: 40 samples
Test set size: 10 samples


In [None]:
# Fit Multiple Linear Regression
print("Fitting Multiple Linear Regression model...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
print("Model trained!")
print("Coefficients:", regressor.coef_)
print("Intercept:", regressor.intercept_)

Fitting Multiple Linear Regression model...
Model trained!
Coefficients: [1.05221988e+03 7.74682581e-01 3.18348683e-02 3.57342438e-02]
Intercept: 42410.30908342115


In [None]:
# Predict Test Set results
print(" Predicting test set results...")
Y_pred = regressor.predict(X_test)
print("Actual vs Predicted (first 10 samples):")
for i in range(min(10, len(Y_test))):
    print(f"Actual: {Y_test[i]:.0f}, Predicted: {Y_pred[i]:.0f}, Diff: {abs(Y_test[i] - Y_pred[i]):.0f}")

 Predicting test set results...
Actual vs Predicted (first 10 samples):
Actual: 103282, Predicted: 103616, Diff: 333
Actual: 144259, Predicted: 132246, Diff: 12014
Actual: 146122, Predicted: 133070, Diff: 13052
Actual: 77799, Predicted: 72592, Diff: 5206
Actual: 191050, Predicted: 179076, Diff: 11974
Actual: 105008, Predicted: 116014, Diff: 11006
Actual: 81229, Predicted: 67854, Diff: 13375
Actual: 97484, Predicted: 98837, Diff: 1354
Actual: 110352, Predicted: 114480, Diff: 4128
Actual: 166188, Predicted: 168493, Diff: 2305


In [None]:
# Backward Elimination for optimal features
print(" Starting Backward Elimination!")
X_with_const = np.append(np.ones((X.shape[0],1)), X, axis=1)  # Add column of ones for intercept
selected_cols = list(range(X_with_const.shape[1]))
Y_for_ols = Y

def backward_elimination(X, Y, significance_level=0.05):
    cols = list(range(X.shape[1]))
    while True:
        regressor_OLS = sm.OLS(Y, X[:, cols]).fit()
        p_values = regressor_OLS.pvalues
        if p_values.max() > significance_level:
            idx_to_remove = p_values.argmax()
            print(f"Dropping column {cols[idx_to_remove]} with p-value {p_values.max():.4f}")
            cols.pop(idx_to_remove)
        else:
            break
    print("Optimal columns:", cols)
    print(regressor_OLS.summary())
    return cols

optimal_cols = backward_elimination(X_with_const, Y_for_ols)

X_optimal = X_with_const[:, optimal_cols]

 Starting Backward Elimination!
Dropping column 1 with p-value 0.9614
Dropping column 3 with p-value 0.6018
Dropping column 4 with p-value 0.0600
Optimal columns: [0, 2]
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.947
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     849.8
Date:                Mon, 25 Aug 2025   Prob (F-statistic):           3.50e-32
Time:                        07:55:42   Log-Likelihood:                -527.44
No. Observations:                  50   AIC:                             1059.
Df Residuals:                      48   BIC:                             1063.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err        

In [None]:
# Train/test split for optimal features
print(" Splitting optimal features for train/test...")
X_optimal_train, X_optimal_test, Y_optimal_train, Y_optimal_test = train_test_split(
    X_optimal, Y, test_size=0.2, random_state=0)

# Fit and predict with optimal features
regressor_opt = LinearRegression()
regressor_opt.fit(X_optimal_train, Y_optimal_train)
Y_optimal_pred = regressor_opt.predict(X_optimal_test)
print("First 10 predictions (optimal model):")
for i in range(min(10, len(Y_optimal_test))):
    print(f"Actual: {Y_optimal_test[i]:.0f}, Predicted: {Y_optimal_pred[i]:.0f}, Diff: {abs(Y_optimal_test[i] - Y_optimal_pred[i]):.0f}")

 Splitting optimal features for train/test...
First 10 predictions (optimal model):
Actual: 103282, Predicted: 104667, Diff: 1385
Actual: 144259, Predicted: 134151, Diff: 10109
Actual: 146122, Predicted: 135208, Diff: 10914
Actual: 77799, Predicted: 72171, Diff: 5628
Actual: 191050, Predicted: 179091, Diff: 11960
Actual: 105008, Predicted: 109825, Diff: 4816
Actual: 81229, Predicted: 65644, Diff: 15585
Actual: 97484, Predicted: 100481, Diff: 2998
Actual: 110352, Predicted: 111432, Diff: 1080
Actual: 166188, Predicted: 169438, Diff: 3250
