<center>
<h1>ADMN5015 Artificial Intelligence in Marketing</h1>
<h2>Assignment 1: Regression</h2>
<h3>Katrina Ong</h3>
</center>

---

### 1) Import Packages

In [9]:
#Importing Standard packages
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from datetime import datetime, date, timedelta
from time import strptime

In [2]:
#Web Scraping

#Library used for making HTTP Requests
import requests

#Library used to parse HTML 
from bs4 import BeautifulSoup as bs

In [3]:
#Data Preparation Packages
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
#Regressors
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lars
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.svm import LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.isotonic import IsotonicRegression
from sklearn.ensemble import RandomForestRegressor

In [5]:
# Model Selection
from sklearn.model_selection import (RepeatedStratifiedKFold, 
                                     GridSearchCV, 
                                     RandomizedSearchCV,
                                     cross_val_score)

In [6]:
# Model Evaluation
from sklearn.metrics import mean_squared_error

In [7]:
# Set Options for display
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100
pd.options.display.float_format = '{:.4f}'.format
sns.set_style("whitegrid")
sns.set_context("paper", font_scale = 2)

%matplotlib inline

---

### 2) Scraping Website Information

In [10]:
# Calendar of days to scrape: one entry per day, both endpoints included
start_date = date(year=2019, month=1, day=1)
end_date = date(year=2023, month=12, day=31)
date_range = pd.date_range(start=start_date, end=end_date, freq="D")

In [13]:
# Smoke test: request the first day's page before scraping the whole range
url = "https://admn5015-340805.uc.r.appspot.com/2019-01-01.html"
response = requests.get(url=url, timeout=15)

In [14]:
# Show the HTTP status of the test request (200 means the page was served).
# Named `status_code` because this value is the response status, not page HTML.
status_code = response.status_code
status_code

200

In [29]:
# Empty frame for the scraped values: one column per metric, index labelled 'date'
labels = ('price','likes','dislikes','followers')
df = pd.DataFrame(columns=labels).rename_axis('date')

In [30]:
# Preview the frame before scraping (header only at this point)
df.head(n=5)

Unnamed: 0_level_0,price,likes,dislikes,followers
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [46]:
# Scrape one page per day in `date_range` and collect the four metrics.
#
# Compared with writing each row into `df` via `.loc`:
#   * one requests.Session is reused for every request in the loop
#   * HTTP error responses and missing table cells raise immediately and name
#     the failing URL, instead of surfacing later as an AttributeError on None
#   * rows are gathered in plain lists and the DataFrame is built once at the end
fields = ("price", "likes", "dislikes", "followers")  # ids of the <td> cells to read
scraped_dates = []
scraped_rows = []

with requests.Session() as session:
    for single_date in date_range:

        # Generate webpage URL
        single_date_parsed = single_date.strftime("%Y-%m-%d")
        url = f"https://admn5015-340805.uc.r.appspot.com/{single_date_parsed}.html"

        # Open URL and stop on 4xx/5xx rather than parsing an error page
        response = session.get(url, timeout=15)
        response.raise_for_status()
        html = response.text

        # Parse HTML
        soup = bs(html, "html.parser")

        # Obtain needed data; soup.find returns None when the cell is absent
        row = []
        for field in fields:
            cell = soup.find("td", {"id": field})
            if cell is None:
                raise ValueError(f"No <td id='{field}'> found at {url}")
            row.append(cell.text)

        scraped_dates.append(single_date_parsed)
        scraped_rows.append(row)

# Build the frame in one step: date strings as the index, raw cell text as values
df = pd.DataFrame(
    scraped_rows,
    index=pd.Index(scraped_dates, name="date"),
    columns=list(fields),
)

In [47]:
# First three scraped rows
df.head(n=3)

Unnamed: 0_level_0,price,likes,dislikes,followers
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,$ 1000.47 CAD,9001,401,15002
2019-01-02,$ 999.94 CAD,9002,402,15004
2019-01-03,$ 999.41 CAD,9003,403,15006


In [48]:
# Last three scraped rows
df.tail(n=3)

Unnamed: 0_level_0,price,likes,dislikes,followers
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-12-29,$ 22477.74 CAD,10924,1496,17468
2023-12-30,$ 22477.74 CAD,10924,1496,17468
2023-12-31,$ 22477.74 CAD,10924,1496,17468


In [49]:
# Check dtypes and non-null counts of the freshly scraped frame
# (values were stored from the `.text` of each cell, so they are still strings here)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1826 entries, 2019-01-01 to 2023-12-31
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   price      1826 non-null   object
 1   likes      1826 non-null   object
 2   dislikes   1826 non-null   object
 3   followers  1826 non-null   object
dtypes: object(4)
memory usage: 151.3+ KB


In [56]:
# Remove the currency symbol from the price column.
# regex=False treats '$' as a literal character; as a regular expression it is
# the end-of-string anchor, and the default of `regex` differs across pandas versions.
df['price'] = df['price'].str.replace('$', '', regex=False)
df.head(3)

Unnamed: 0_level_0,price,likes,dislikes,followers
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,1000.47 CAD,9001,401,15002
2019-01-02,999.94 CAD,9002,402,15004
2019-01-03,999.41 CAD,9003,403,15006


In [57]:
# Drop the 'CAD' currency code from the price column
price_without_code = df['price'].str.replace('CAD', '')
df['price'] = price_without_code
df.head(n=3)

Unnamed: 0_level_0,price,likes,dislikes,followers
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,1000.47,9001,401,15002
2019-01-02,999.94,9002,402,15004
2019-01-03,999.41,9003,403,15006


In [58]:
# Cast each column from text to a numeric dtype (applied column by column)
df = df.apply(pd.to_numeric, axis=0)

In [59]:
# Re-check dtypes and non-null counts after the pd.to_numeric conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1826 entries, 2019-01-01 to 2023-12-31
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   price      1826 non-null   float64
 1   likes      1826 non-null   int64  
 2   dislikes   1826 non-null   int64  
 3   followers  1826 non-null   int64  
dtypes: float64(1), int64(3)
memory usage: 151.3+ KB


In [60]:
# Save to CSV File
# NOTE(review): the export below is commented out, so running this cell writes nothing.
# df.to_csv("My Retail Store Website Data.csv")

---

### 3) Model Building

In [None]:
# Split


---

### 4) Model Evaluation