# Advertising

In [None]:
# Upload step: asks the user to pick a local file and stores it in the runtime's
# working directory (the recorded output shows advertising.csv being saved there),
# which is where the later pd.read_csv('advertising.csv') call looks for it.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime,
# and this call is interactive — confirm before running the notebook elsewhere or unattended.
from google.colab import files
uploaded = files.upload()

Saving advertising.csv to advertising.csv


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the uploaded advertising dataset from the working directory and
# preview its first rows as the cell's rich output.
csv_path = 'advertising.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9


In [None]:
# Quick structural overview of the raw frame: size, dtypes, cardinality and
# missing-value counts, printed in that order.

# Shape of the dataset(rows, columns)
print("Shape:", df.shape)

# Data types and null values(to spot missing data and datatype issue)
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

# Unique values per column
print("\nUnique values:")
print(df.nunique())

print("\nMissing Values per column: \n", df.isnull().sum())

Shape: (200, 4)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB
None

Unique values:
TV           190
Radio        167
Newspaper    172
Sales        121
dtype: int64

Missing Values per column: 
 TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64


## Handling Missing Values

In [None]:
# Build three independently imputed copies of `df`; `df` itself is never modified.
# Each fill is done at frame level instead of calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place form acts on a
# temporary column object, emits a FutureWarning on recent pandas and silently
# leaves the frame unchanged once Copy-on-Write is active.

# Replace missing values with column mean
# (fillna with a Series fills each column using the entry that shares its name;
#  numeric_only=True leaves non-numeric columns untouched instead of failing.)
df_mean = df.fillna(df.mean(numeric_only=True))

# Replace missing values with column median
df_median = df.fillna(df.median(numeric_only=True))

# Replace missing values with 0
df_zero = df.fillna(0)

print("\nMissing Values After Filling with Mean:\n", df_mean.isnull().sum())



Missing Values After Filling with Mean:
 TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64


## Normalization


In [None]:
# Min-Max Scaling
def min_max_scaling(data):
    """Rescale every numeric column of ``data`` linearly onto the range [0, 1].

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is not modified.
        Non-numeric columns are returned unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which each numeric column ``x`` is replaced by
        ``(x - min) / (max - min)``. A column whose values are all identical
        has no spread to rescale and is mapped to 0.0 (missing entries stay
        missing) instead of becoming NaN through a 0/0 division.
    """
    scaled = data.copy()
    for col in scaled.select_dtypes(include=[np.number]).columns:
        min_val, max_val = scaled[col].min(), scaled[col].max()
        value_range = max_val - min_val
        if value_range == 0:
            # Constant column: multiplying the (all-zero) offsets by 0.0 yields
            # float zeros and keeps NaN entries as NaN.
            scaled[col] = (scaled[col] - min_val) * 0.0
        else:
            scaled[col] = (scaled[col] - min_val) / value_range
    return scaled


# Standard Scaling
def standard_scaling(data):
    """Standardise every numeric column of ``data`` to zero mean and unit spread.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is not modified.
        Non-numeric columns are returned unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which each numeric column ``x`` is replaced by
        ``(x - mean) / std``, with ``std`` being pandas' default sample
        standard deviation. When ``std`` is zero (constant column) or
        undefined (fewer than two values), the column is mapped to 0.0
        (missing entries stay missing) instead of turning into NaN.
    """
    scaled = data.copy()
    for col in scaled.select_dtypes(include=[np.number]).columns:
        mean_val, std_val = scaled[col].mean(), scaled[col].std()
        centered = scaled[col] - mean_val
        if not std_val > 0:
            # `not > 0` also catches a NaN std; there is no spread to divide
            # by, so report every present value as sitting exactly on the mean.
            scaled[col] = centered * 0.0
        else:
            scaled[col] = centered / std_val
    return scaled

# Run both scalers on the mean-imputed frame (Min-Max first, then standard)
# and keep each result under its own name.
df_minmax, df_standard = min_max_scaling(df_mean), standard_scaling(df_mean)

# Show the header line followed by the first rows of the Min-Max result.
print("\nFirst 5 rows after Min-Max Scaling:", df_minmax.head(), sep="\n")


First 5 rows after Min-Max Scaling:
         TV     Radio  Newspaper     Sales
0  0.775786  0.762097   0.605981  0.807087
1  0.148123  0.792339   0.394019  0.346457
2  0.055800  0.925403   0.606860  0.409449
3  0.509976  0.832661   0.511873  0.586614
4  0.609063  0.217742   0.510994  0.641732


## Outlier Detection (Z-score method)

In [None]:
def detect_outliers(data, threshold=3):
    """Find, per numeric column, the rows whose z-score magnitude exceeds ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; only its numeric columns are examined.
    threshold : float, default 3
        A row is flagged for a column when ``|z| > threshold``, where
        ``z = (value - column mean) / column std``.

    Returns
    -------
    dict
        Maps each numeric column name to the list of index labels flagged in
        that column (an empty list when nothing is flagged).
    """
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    flagged = {}
    for name in numeric_columns:
        series = data[name]
        centre, spread = series.mean(), series.std()
        is_extreme = np.abs((series - centre) / spread) > threshold
        flagged[name] = data[is_extreme].index.tolist()
    return flagged

# Flag outliers column by column in the mean-imputed frame (default threshold).
outliers = detect_outliers(df_mean)
print("\nOutlier indices per column:", outliers)

# A row is removed as soon as it is flagged in at least one column; collecting
# the labels in a set drops repeats of rows flagged in several columns.
outlier_indices = {label for flagged_rows in outliers.values() for label in flagged_rows}
df_no_outliers = df_mean.drop(index=outlier_indices)
print(
    f"\nData shape before removing outliers: {df_mean.shape}",
    f"Data shape after removing outliers: {df_no_outliers.shape}",
    sep="\n",
)



Outlier indices per column: {'TV': [], 'Radio': [], 'Newspaper': [16, 101], 'Sales': []}

Data shape before removing outliers: (200, 4)
Data shape after removing outliers: (198, 4)


## Linear Regression

In [None]:
# Ordinary least-squares fit of Sales on all remaining columns of the
# outlier-free frame, followed by an R² computed on the same rows used to fit.
from numpy.linalg import inv  # no longer used here; kept in case another cell still calls inv()

# Assuming 'Sales' is the target
X = df_no_outliers.drop(columns='Sales').values
y = df_no_outliers['Sales'].values

# Add bias column
X_b = np.c_[np.ones((X.shape[0], 1)), X]

# Least-squares solution of X_b θ ≈ y. This is the same θ the normal equation
# θ = (X^T X)^(-1) X^T y describes, but np.linalg.lstsq avoids forming and
# inverting X^T X explicitly, which loses precision and raises LinAlgError when
# columns are collinear. Index 0 of the result tuple holds the coefficients.
theta = np.linalg.lstsq(X_b, y, rcond=None)[0]
print("\nLinear Regression Coefficients:", theta)

# Predictions
y_pred = X_b.dot(theta)

# R² Score (in-sample: measured on the rows the model was fitted to)
SS_res = np.sum((y - y_pred) ** 2)
SS_tot = np.sum((y - np.mean(y)) ** 2)
r2_score = 1 - (SS_res / SS_tot)
print(f"R² Score: {r2_score:.4f}")


Linear Regression Coefficients: [4.60019484e+00 5.45445202e-02 1.06916097e-01 8.63138705e-04]
R² Score: 0.9013
