In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt

In [None]:
# Load the advertising dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv('advertising.csv')

# Feature matrix: every column except the target. Target: the 'Sales' column.
X = df.drop(['Sales'], axis=1)
y = df['Sales']

# Preview the first rows. This is kept as the final expression of the cell
# because Jupyter only renders the value of a cell's last expression; placed
# earlier in the cell, the preview would be silently discarded.
df.head(10)

In [None]:
# Raw relationship between TV advertising spend and sales.
plt.scatter(x=df['TV'], y=df['Sales'], color='gray', marker='+')
plt.title('Sales from spendings on TV advertisement')
plt.xlabel('TV')
plt.ylabel('Sales');

In [None]:
# Simple linear regression with TV spend as the only predictor.
# scikit-learn expects a 2-D feature array, hence the single-column selection.
model = LinearRegression()
model.fit(df[['TV']].to_numpy(), df['Sales'])

In [None]:
# Display the fitted coefficient array and intercept of the current `model`.
model.coef_, model.intercept_

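$$ \text{Sales} = 0.038324 \times \text{Newspaper} + 13.95954865 $$

> **Note:** the model fitted above uses `TV` as its only feature, so the equation as written (for `Newspaper`) does not describe it. It should read $\text{Sales} = \text{coef} \times \text{TV} + \text{intercept}$, with the `model.coef_` and `model.intercept_` values printed by the previous cell.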

In [None]:
# Evaluate the fitted line on an evenly spaced grid of TV values for drawing.
x_line = np.linspace(0, 300, num=100)
y_line = model.intercept_ + model.coef_ * x_line

# In-sample predictions; used only for the R-squared shown in the legend.
y_pred = model.predict(df.TV.values.reshape(-1, 1))

# Observations first, then the regression line on top of them.
plt.scatter(x=df['TV'], y=df['Sales'], color='gray', marker='+')
plt.plot(
    x_line,
    y_line,
    label=f'linear fit, R^squared={np.round(r2_score(y, y_pred), 3)}',
    color='red',
    alpha=0.3,
)
plt.title('Sales from spendings on TV advertisement')
plt.xlabel('TV')
plt.ylabel('Sales')
plt.legend();

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(25, 5))

# One panel per advertising channel, each plotted against Sales.
for ax, channel in zip(axs, ('TV', 'Radio', 'Newspaper')):
    ax.scatter(df[channel], df['Sales'], marker='+', color='gray')
    ax.set_xlabel(channel)
    ax.set_ylabel('Sales')

# Train model on all features

In [None]:
# Split into training and test sets: 80% of the rows go to training, and the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Fit ordinary least squares on all feature columns of the training split
# (`fit` returns the estimator itself, so it can be chained).
model = LinearRegression().fit(X_train, y_train)

# Score the fit on the same data it was trained on.
y_pred_train = model.predict(X_train)
print(f'Train set R-squared: {r2_score(y_train, y_pred_train)}')

In [None]:
# Predict the target for the test split with the current `model`.
y_pred_test = model.predict(X_test)

In [None]:
# Report R-squared of the test-split predictions against the true test targets.
print(f'Test set R-squared: {r2_score(y_test, y_pred_test)}')

In [None]:
# Display the fitted coefficient array and intercept of the current `model`.
model.coef_, model.intercept_

$$ \text{Sales} = 0.054511 \times \text{TV} + 0.100945 \times \text{Radio} + 0.004336 \times \text{Newspaper} +  4.714126$$

### Why does `TV` have a smaller weight than `Radio`?

In [None]:
# Per-column means of the dataset, to compare the typical magnitude of each column.
df.mean()

# Scale the data!

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so the test split does not influence the
# scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)

# Store the scaled arrays under new names instead of overwriting X_train /
# X_test: the unscaled splits stay available, and re-running an earlier cell
# that uses them still trains on the data it was written for.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the model on the standardised features
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# R-squared on the training split
y_pred_train = model.predict(X_train_scaled)
print(f'Train set R-squared: {r2_score(y_train, y_pred_train)}')

# R-squared on the test split
y_pred_test = model.predict(X_test_scaled)
print(f'Test set R-squared: {r2_score(y_test, y_pred_test)}')

In [None]:
# Display the coefficient array of the current `model`.
model.coef_

In [None]:
# Display the intercept of the current `model`.
model.intercept_

$$ \text{Sales} = 4.58720 \times \text{TV}_{\text{scaled}} + 1.48984 \times \text{Radio}_{\text{scaled}} + 0.087915 \times \text{Newspaper}_{\text{scaled}} + 15.3306$$