In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import statsmodels.formula.api as sm

In [None]:
import pathlib
import os

In [None]:
# Project root: the parent of the current working directory.
BASE_DIR = pathlib.Path(".").resolve().parent

# Folders for the input data and for the exported figures.
DATASET_DIR = BASE_DIR.joinpath("input")
IMAGE_DIR = BASE_DIR.joinpath("images")

# Create both folders (with any missing parents); existing folders are left as they are.
for _directory in (DATASET_DIR, IMAGE_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

In [None]:
# Full path of the advertising CSV inside the dataset folder, kept as a plain string.
INPUT_PATH = os.fspath(DATASET_DIR / "Advertising.csv")

In [None]:
# Read the advertising data, skipping the column named "Unnamed: 0".
df = pd.read_csv(INPUT_PATH, usecols=lambda column: column != "Unnamed: 0")

# sort_values returns a new frame instead of sorting in place. The result was
# previously discarded, so the sort had no effect on `df` or on the output.
# Chain it into the preview so the first rows are shown ordered by sales, TV,
# radio and newspaper (descending). `df` itself keeps its file order, so the
# cells that plot and split it see the same rows in the same order as before.
df.sort_values(by=["sales", "TV", "radio", "newspaper"], ascending=False).head()

In [None]:
# Line style per column; the columns are drawn in this order on one shared axes.
series_styles = {
    "sales": dict(marker='', markerfacecolor='blue', markersize=12,
                  color='skyblue', linewidth=1.5, linestyle="dashed"),
    "TV": dict(marker='', color='darkcyan', linewidth=2, linestyle="dashdot"),
    "newspaper": dict(marker='', color='coral', linewidth=1.5, linestyle='dotted'),
    "radio": dict(marker='', color='red', linewidth=1.5, linestyle='solid'),
}

fig, ax = plt.subplots(figsize=(15, 8))
for column, style in series_styles.items():
    df[column].plot(ax=ax, **style)

# Legend entries come from the column names attached to each plotted series.
ax.legend()
fig.savefig(os.path.join(IMAGE_DIR, "Trends.png"))

In [None]:
## Correlation plot
# Pairwise correlations between the columns of df, drawn as an annotated heatmap.
corr = df.corr()
diverging_cmap = sns.diverging_palette(220, 20, as_cmap=True)
heatmap_ax = sns.heatmap(
    corr,
    cmap=diverging_cmap,
    annot=True,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)

# Save the figure that the heatmap was drawn on.
heatmap_ax.figure.savefig(os.path.join(IMAGE_DIR, "correlation.png"))

In [None]:
# Splitting the dataframe

In [None]:
# Features are every column except the target; the target is the "sales" column.
X = df.drop(columns=["sales"])
y = df["sales"]

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a seeded random forest on the training split (fit returns the estimator itself),
# then predict the held-out rows.
model = RandomForestRegressor(random_state=1).fit(X_train, y_train)
pred = model.predict(X_test)

In [None]:
# Importance score of each feature column, as reported by the fitted model.
feat_importance = pd.Series(data=model.feature_importances_, index=X.columns)

# Horizontal bar chart of the scores, saved through the figure it was drawn on.
importance_ax = feat_importance.plot.barh(figsize=(15, 8), cmap='RdYlGn')
importance_ax.figure.savefig(os.path.join(IMAGE_DIR, "Feature_Importance.png"))