In [17]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression

In [18]:
# Read the stock price CSV, using the "Company" ticker column as the row index
stock_df = pd.read_csv("Resources/data.csv", index_col="Company")

In [19]:
# Display the first ten rows to inspect the raw column formats
# (price columns carry a "$" prefix and dates use more than one separator)
stock_df.head(10)

Unnamed: 0_level_0,Date,Close/Last,Volume,Open,High,Low
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,07/17/2023,$193.99,50520160,$191.90,$194.32,$191.81
AAPL,07/14/2023,$190.69,41616240,$190.23,$191.1799,$189.63
AAPL,07/13/2023,$190.54,41342340,$190.50,$191.19,$189.78
AAPL,07-12-2023,$189.77,60750250,$189.68,$191.70,$188.47
AAPL,07-11-2023,$188.08,46638120,$189.16,$189.30,$186.60
AAPL,07-10-2023,$188.61,59922160,$189.26,$189.99,$187.035
AAPL,07-07-2023,$190.68,46815000,$191.41,$192.67,$190.24
AAPL,07-06-2023,$191.81,45156010,$189.84,$192.02,$189.20
AAPL,07-05-2023,$191.33,46920260,$191.565,$192.98,$190.62
AAPL,07-03-2023,$192.46,31346600,$193.78,$193.88,$191.76


In [20]:
# Remove the "$" sign from the price columns so they can later be parsed as numbers.
# The pattern is a raw string (a plain '\$' is an invalid escape sequence and
# triggers a warning on recent Python versions), and the replacement is the empty
# string so that no stray leading whitespace is left in front of the digits.
# Regex replacement only touches string values; the integer Volume column is unaffected.
stock_df = stock_df.replace({r'\$': ''}, regex=True)

# Review the changes
stock_df.head(5)

Unnamed: 0_level_0,Date,Close/Last,Volume,Open,High,Low
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,07/17/2023,193.99,50520160,191.9,194.32,191.81
AAPL,07/14/2023,190.69,41616240,190.23,191.1799,189.63
AAPL,07/13/2023,190.54,41342340,190.5,191.19,189.78
AAPL,07-12-2023,189.77,60750250,189.68,191.7,188.47
AAPL,07-11-2023,188.08,46638120,189.16,189.3,186.6


In [21]:
# Summarise column dtypes and non-null counts; the price columns are still
# stored as strings (object dtype) at this point
stock_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25160 entries, AAPL to NFLX
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Date        25160 non-null  object
 1   Close/Last  25160 non-null  object
 2   Volume      25160 non-null  int64 
 3   Open        25160 non-null  object
 4   High        25160 non-null  object
 5   Low         25160 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.3+ MB


In [22]:
# Count missing entries per column
null_counts = stock_df.isna().sum()
print(null_counts)

Date          0
Close/Last    0
Volume        0
Open          0
High          0
Low           0
dtype: int64


In [23]:
# Convert to datetime.
# The raw file writes the same month-day-year layout with two different
# separators ("07/17/2023" and "07-12-2023"). pandas >= 2.0 infers a single
# format from the first value and raises on values that do not match it, so
# normalise the separator and parse with an explicit format instead of relying
# on inference. The explicit format also makes the month-first reading unambiguous.
# The dtype guard keeps this cell safe to re-run once the column is converted.
if not pd.api.types.is_datetime64_any_dtype(stock_df['Date']):
    stock_df['Date'] = pd.to_datetime(
        stock_df['Date'].str.strip().str.replace('-', '/', regex=False),
        format='%m/%d/%Y',
    )

# Find the oldest and newest dates
oldest_date = stock_df['Date'].min()
newest_date = stock_df['Date'].max()

print(oldest_date)
print(newest_date)

2013-07-18 00:00:00
2023-07-17 00:00:00


In [24]:
# Order the rows chronologically within each ticker
# ("Company" is the index level, "Date" is a regular column)
stock_df = stock_df.sort_values(by=['Company', 'Date'])
stock_df.head(5)

Unnamed: 0_level_0,Date,Close/Last,Volume,Open,High,Low
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2013-07-18,15.4199,218632537,15.4779,15.5311,15.3789
AAPL,2013-07-19,15.1768,268548901,15.4679,15.4993,15.1554
AAPL,2013-07-22,15.2254,207648981,15.3379,15.3482,15.1953
AAPL,2013-07-23,14.9639,354477618,15.2143,15.2486,14.9539
AAPL,2013-07-24,15.7325,591624923,15.6761,15.8782,15.545


In [25]:
# Parse every price column as a float. Previously only Close/Last and Open were
# converted, leaving High and Low as strings. Entries that cannot be parsed
# become NaN because of errors='coerce'.
for price_column in ('Close/Last', 'Open', 'High', 'Low'):
    stock_df[price_column] = pd.to_numeric(stock_df[price_column], errors='coerce')

# "Change" is the previous row's close minus the current row's open, computed
# separately for each company. This relies on the rows already being sorted by
# Company and Date so that "previous row" means the previous trading day.
# The first row of each company has no previous close, so its Change is NaN.
stock_df['Change'] = stock_df.groupby('Company')['Close/Last'].shift(1) - stock_df['Open']
stock_df.head(5)

Unnamed: 0_level_0,Date,Close/Last,Volume,Open,High,Low,Change
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAPL,2013-07-18,15.4199,218632537,15.4779,15.5311,15.3789,
AAPL,2013-07-19,15.1768,268548901,15.4679,15.4993,15.1554,-0.048
AAPL,2013-07-22,15.2254,207648981,15.3379,15.3482,15.1953,-0.1611
AAPL,2013-07-23,14.9639,354477618,15.2143,15.2486,14.9539,0.0111
AAPL,2013-07-24,15.7325,591624923,15.6761,15.8782,15.545,-0.7122


In [26]:
# Discard the rows whose Change is missing (the first row of each company)
stock_df = stock_df.dropna(subset=['Change'])
stock_df.head(5)

Unnamed: 0_level_0,Date,Close/Last,Volume,Open,High,Low,Change
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAPL,2013-07-19,15.1768,268548901,15.4679,15.4993,15.1554,-0.048
AAPL,2013-07-22,15.2254,207648981,15.3379,15.3482,15.1953,-0.1611
AAPL,2013-07-23,14.9639,354477618,15.2143,15.2486,14.9539,0.0111
AAPL,2013-07-24,15.7325,591624923,15.6761,15.8782,15.545,-0.7122
AAPL,2013-07-25,15.6607,229432412,15.7393,15.7643,15.5646,-0.0068


In [27]:
# # Drop Date column
# stock_df.drop(columns=['Date'], inplace=True)
# stock_df.head(5)

In [42]:
# Keep only the rows whose index label (ticker) is AAPL
is_aapl = stock_df.index == 'AAPL'
appl_data = stock_df.loc[is_aapl]

appl_data.head()

Unnamed: 0_level_0,Date,Close/Last,Volume,Open,High,Low,Change
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAPL,2013-07-19,15.1768,268548901,15.4679,15.4993,15.1554,-0.048
AAPL,2013-07-22,15.2254,207648981,15.3379,15.3482,15.1953,-0.1611
AAPL,2013-07-23,14.9639,354477618,15.2143,15.2486,14.9539,0.0111
AAPL,2013-07-24,15.7325,591624923,15.6761,15.8782,15.545,-0.7122
AAPL,2013-07-25,15.6607,229432412,15.7393,15.7643,15.5646,-0.0068


In [49]:
# Scatter plot of opening price (x) against closing price (y) for the AAPL rows only
appl_plot = appl_data.hvplot.scatter(
    x="Open",
    y="Close/Last",
    title="AAPL Open vs Close prices"
)
appl_plot

In [43]:
# Create a scatter plot of opening price (x) against closing price (y)
# using every row of stock_df, i.e. all companies, not just AAPL
stock_plot = stock_df.hvplot.scatter(
    x="Open",
    y="Close/Last",
    title="Open vs Close prices"
)
stock_plot

In [44]:
# Build the feature matrix X: selecting with a list of column names yields a
# one-column frame, which converts directly to an (n_samples, 1) array
X = stock_df[["Open"]].to_numpy()

# Display sample data
X[:5]

array([[15.4679],
       [15.3379],
       [15.2143],
       [15.6761],
       [15.7393]])

In [45]:
# X is a 2-D array: one row per observation and a single feature (column)
X.shape

(25150, 1)

In [46]:
# Select the dependent variable y (the closing price) as a pandas Series
y = stock_df["Close/Last"]

In [47]:
# Create an unfitted linear regression model with scikit-learn's default settings
model = LinearRegression()

In [37]:
# Fit the model: estimate the slope and intercept that map the opening
# price (X) to the closing price (y)
model.fit(X, y)

In [38]:
# Display the slope (coef_ is an array with one entry per feature; X has one feature)
print(f"Model's slope: {model.coef_}")

Model's slope: [0.9995404]


In [39]:
# Display the y-intercept (the predicted closing price when the opening price is 0)
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 0.0723079461976397


In [40]:
# Display the model's best fit line formula, where X is the opening price
# and y is the predicted closing price
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 0.0723079461976397 + 0.9995403951996634X


In [None]:
# The model maps an opening price (X) to a closing price (y). Substituting 30
# therefore estimates the close of a stock that opens at $30 -- it is not a
# forecast 30 days into the future.
open_price = 30

# Display the formula with the opening price substituted in
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {open_price}")

# Predict the closing price for that opening price
y_30 = model.intercept_ + model.coef_[0] * open_price

# Display the prediction
print(f"Predicted closing price for an opening price of ${open_price:.2f}: ${y_30:.2f}")

In [None]:
# Make predictions using the X set (the same opening prices the model was fitted on)
predicted_y_values = model.predict(X)

In [None]:
# Build a new frame from the stock data with the model's predicted closing
# prices attached as an extra column; stock_df itself is left unchanged
df_close_predicted = stock_df.assign(close_predicted=predicted_y_values)

# Display sample data
df_close_predicted.head()

In [None]:
# Create a line plot of the predicted closing prices.
# The x axis must be the model's input ("Open") so that the line shares its
# axes with the Open-vs-Close scatter plot it is overlaid on; plotting against
# "Close/Last" would draw predicted-vs-actual close instead of the fitted line.
best_fit_line = df_close_predicted.hvplot.line(
    x = "Open",
    y = "close_predicted",
    color = "red"
)
best_fit_line

In [None]:
# Superpose the original data (scatter) and the best fit line in a single figure
stock_plot * best_fit_line

In [None]:
# Import the metrics used below (mean squared error and R^2) from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Evaluate the fitted model on the same data it was trained on
score = model.score(X, y)  # sample_weight is left at its default of None
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = y.std(ddof=0)  # population standard deviation, matching np.std's default

# Report each metric on its own line
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")