In [36]:
# Import required libraries
import json
from pathlib import Path

import hvplot.pandas
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# %%
# Read the stock price CSV into a DataFrame, using the Company column as the index
data_path = "../Resources/data.csv"
stock_df = pd.read_csv(data_path, index_col="Company")

# %%
# Preview the first ten rows
stock_df.head(10)

# %%
# Remove the $ sign from every text column.
# The pattern is a raw string so the regex escape `\$` reaches pandas intact
# (a plain '\$' is an invalid string escape and triggers a warning on recent
# Python versions). The replacement is an empty string so that columns which
# stay as text are not left with a stray leading space.
stock_df = stock_df.replace({r'\$': ''}, regex=True)

# Review the changes
stock_df.head(5)

# %%
# Summarize the column dtypes and non-null counts
stock_df.info()

# %%
# Count the missing values in each column
missing_mask = stock_df.isnull()
null_counts = missing_mask.sum()
print(null_counts)

# %%
# Parse the Date column into datetime values
date_column = pd.to_datetime(stock_df['Date'])
stock_df['Date'] = date_column

# Earliest and latest dates covered by the data set
oldest_date, newest_date = date_column.min(), date_column.max()

print(oldest_date, newest_date, sep="\n")

# %%
# Order the rows by company, then chronologically within each company
sort_keys = ['Company', 'Date']
stock_df = stock_df.sort_values(by=sort_keys)
stock_df.head(5)

# %%
# Convert the two price columns used below to numbers; values that cannot be
# parsed become NaN (errors='coerce')
for price_col in ('Close/Last', 'Open'):
    stock_df[price_col] = pd.to_numeric(stock_df[price_col], errors='coerce')

# Change = the previous row's close within the same company minus the current
# row's open. The first row of each company has no previous row, so its Change
# is NaN.
previous_close = stock_df.groupby('Company')['Close/Last'].shift(1)
stock_df['Change'] = previous_close - stock_df['Open']
stock_df.head(5)

# %%
# Discard the rows whose Change value is missing
stock_df = stock_df.dropna(subset=['Change'])
stock_df.head(5)



<class 'pandas.core.frame.DataFrame'>
Index: 25160 entries, AAPL to NFLX
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Date        25160 non-null  object
 1   Close/Last  25160 non-null  object
 2   Volume      25160 non-null  int64 
 3   Open        25160 non-null  object
 4   High        25160 non-null  object
 5   Low         25160 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.3+ MB
Date          0
Close/Last    0
Volume        0
Open          0
High          0
Low           0
dtype: int64
2013-07-18 00:00:00
2023-07-17 00:00:00


Unnamed: 0_level_0,Date,Close/Last,Volume,Open,High,Low,Change
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAPL,2013-07-19,15.1768,268548901,15.4679,15.4993,15.1554,-0.048
AAPL,2013-07-22,15.2254,207648981,15.3379,15.3482,15.1953,-0.1611
AAPL,2013-07-23,14.9639,354477618,15.2143,15.2486,14.9539,0.0111
AAPL,2013-07-24,15.7325,591624923,15.6761,15.8782,15.545,-0.7122
AAPL,2013-07-25,15.6607,229432412,15.7393,15.7643,15.5646,-0.0068


In [24]:

# %%
# Keep only the rows whose index label (Company) is AMD
is_amd = stock_df.index == 'AMD'
amd_data = stock_df.loc[is_amd]

amd_data.head()


Unnamed: 0_level_0,Date,Close/Last,Volume,Open,High,Low,Change
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AMD,2013-07-19,4.03,151446600,3.99,4.08,3.85,0.65
AMD,2013-07-22,3.9,49758410,4.005,4.01,3.885,0.025
AMD,2013-07-23,3.66,76717500,3.88,3.88,3.64,0.02
AMD,2013-07-24,3.63,50766260,3.69,3.75,3.58,-0.03
AMD,2013-07-25,3.7,29221360,3.64,3.74,3.6,-0.01


In [25]:

# %%
# Scatter plot of the AMD open price (x axis) against the close price (y axis)
scatter_options = {
    "x": "Open",
    "y": "Close/Last",
    "title": "AMD Open vs Close prices",
}
amd_plot = amd_data.hvplot.scatter(**scatter_options)
amd_plot


In [26]:

# %%
# Build the independent variable X: the AMD open prices arranged as a
# two-dimensional array with a single column
open_prices = amd_data["Open"].values
X = open_prices[:, np.newaxis]

# Display sample data
X[:5]

# %%
# X has one row per AMD observation and a single feature (column)
X.shape

# %%
# Dependent variable y: the AMD close prices
y = amd_data.loc[:, "Close/Last"]

# %%
# Instantiate a scikit-learn linear regression model
model = LinearRegression()

# %%
# Train the model on X (open prices) and y (close prices)
model.fit(X, y)


In [27]:

# %%
# Display the slope. coef_ holds one coefficient per feature, so with the
# single Open feature it prints as a one-element array.
print(f"Model's slope: {model.coef_}")


Model's slope: [0.99881402]


In [28]:

# %%
# Display the y-intercept of the fitted line (the predicted close price
# when the open price is 0)
print(f"Model's y-intercept: {model.intercept_}")


Model's y-intercept: 0.04652925152645082


In [29]:

# %%
# Display the model's best fit line formula as y = intercept + slope * X,
# where X is the open price and y is the predicted close price
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")


Model's formula: y = 0.04652925152645082 + 0.9988140216718926X


In [30]:

# %%
# Display the formula to predict the close price if the open price is $30.
# Only the expression is printed here; it is not evaluated by this statement.
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * 30")


Model's formula: y = 0.04652925152645082 + 0.9988140216718926 * 30


In [31]:

# Predict the close price for an open price of $30 using the fitted line
open_price = 30
y_30 = model.coef_[0] * open_price + model.intercept_

# Show the prediction rounded to cents
print(f"Predicted stock closing price if the open price is $30: ${y_30:.2f}")


Predicted stock closing price if the open price is $30: $30.01


In [32]:

# %%
# Run the fitted model over every row of X to get the predicted close prices
predicted_y_values = model.predict(X)

# %%
# Build a new DataFrame from the AMD data with the predictions attached as an
# extra "close_predicted" column; amd_data itself is left unchanged
df_close_predicted = amd_data.assign(close_predicted=predicted_y_values)

# Display sample data
df_close_predicted.head()


Unnamed: 0_level_0,Date,Close/Last,Volume,Open,High,Low,Change,close_predicted
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AMD,2013-07-19,4.03,151446600,3.99,4.08,3.85,0.65,4.031797
AMD,2013-07-22,3.9,49758410,4.005,4.01,3.885,0.025,4.046779
AMD,2013-07-23,3.66,76717500,3.88,3.88,3.64,0.02,3.921928
AMD,2013-07-24,3.63,50766260,3.69,3.75,3.58,-0.03,3.732153
AMD,2013-07-25,3.7,29221360,3.64,3.74,3.6,-0.01,3.682212


In [38]:

# %%
# Draw the predicted close prices against the open prices as a red line
line_options = {
    "x": "Open",
    "y": "close_predicted",
    "color": "red",
}
best_fit_line = df_close_predicted.hvplot.line(**line_options)
best_fit_line


In [39]:

# %%
# Overlay the scatter plot of the original AMD data and the red best fit line
# in a single figure (the * operator combines the two plot objects)
amd_plot * best_fit_line


In [35]:

# %%
# Compute the metrics for the linear regression model.
# mean_squared_error and r2_score are imported with the other libraries at the
# top of the notebook. The predictions were made on the same X the model was
# fitted on, so these are in-sample metrics.
score = model.score(X, y)  # R^2 reported by the model itself
r2 = r2_score(y, predicted_y_values)  # same quantity computed from the predictions
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)  # spread of the actual close prices, for comparison with rmse

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.9984057118693271.
The r2 is 0.9984057118693271.
The mean squared error is 2.549012614446235.
The root mean squared error is 1.5965627499244228.
The standard deviation is 39.9855050542605.


In [37]:
# Ticker symbol of the model being saved; used for both the dictionary key
# and the output file name
ticker = 'AMD'
models_info = {}
# Save model information.
# The values are cast to built-in float because NumPy scalar types are not
# all JSON serializable.
model_info = {
    "intercept": float(model.intercept_),
    "coef": float(model.coef_[0]),
}
# Store the model information in the dictionary, keyed by ticker symbol
models_info[ticker] = model_info
# Save the model information to a JSON file named after the ticker symbol
filename = f"{ticker}_model_info.json"
with open(filename, "w") as json_file:
    json.dump(model_info, json_file)