## Task 1: Predictive Modeling (Regression)

- Description: Build and evaluate a regression model to predict a continuous variable (e.g., house prices).


In [2]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

- Split the dataset into training and testing sets.


In [3]:
# Loading the  preprocessed dataset
prep_dataset = pd.read_csv(r"D:\codeveda_internship\290325\cleaned_scaled_stock_prices.csv")


In [4]:
prep_dataset.head()

Unnamed: 0,symbol,date,open,high,low,close,volume
0,1,2014-01-02,-1.019921,-1.010001,-1.015134,-1.014292,8998943
1,3,2014-01-02,0.071231,0.060759,0.075483,0.063546,58791957
2,2,2014-01-02,0.693568,0.704232,0.692347,0.680648,542711
3,4,2014-01-02,-0.476483,-0.481947,-0.478746,-0.479576,4569061
4,5,2014-01-02,-0.115061,-0.125396,-0.114667,-0.119818,1148391


In [5]:
prep_dataset['date'] = pd.to_datetime(prep_dataset['date'])

In [6]:
# extracting the  date features
prep_dataset["Year"] = prep_dataset["date"].dt.year
prep_dataset["Month"] = prep_dataset["date"].dt.month
prep_dataset["Day"] = prep_dataset["date"].dt.day
prep_dataset["DayOfWeek"] = prep_dataset["date"].dt.dayofweek  # Monday=0, Sunday=6
prep_dataset["IsWeekend"] = (prep_dataset["DayOfWeek"] >= 5).astype(int)  # 1 if weekend, else 0
prep_dataset["Quarter"] = prep_dataset["date"].dt.quarter  # Quarter of the year
prep_dataset["DayOfYear"] = prep_dataset["date"].dt.dayofyear  # Day number in the year

In [7]:
# Dropping the original date column as it's no longer needed
prep_dataset.drop(columns=["date"], inplace=True)

In [8]:
prep_dataset.head()

Unnamed: 0,symbol,open,high,low,close,volume,Year,Month,Day,DayOfWeek,IsWeekend,Quarter,DayOfYear
0,1,-1.019921,-1.010001,-1.015134,-1.014292,8998943,2014,1,2,3,0,1,2
1,3,0.071231,0.060759,0.075483,0.063546,58791957,2014,1,2,3,0,1,2
2,2,0.693568,0.704232,0.692347,0.680648,542711,2014,1,2,3,0,1,2
3,4,-0.476483,-0.481947,-0.478746,-0.479576,4569061,2014,1,2,3,0,1,2
4,5,-0.115061,-0.125396,-0.114667,-0.119818,1148391,2014,1,2,3,0,1,2


In [9]:
# spliting the dataset into features and target variable
x = prep_dataset.drop(columns=["volume"])
y = prep_dataset["volume"]

In [10]:
# Splitting the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


- Train a linear regression model using scikit-learn.

In [11]:
# model = sklearn.linear_model.Ridge(alpha = 1.0)
model  = sklearn.ensemble.RandomForestRegressor(n_estimators = 200)
# model = sklearn.ensemble.GradientBoostingRegressor()
model.fit(x_train, y_train)


In [28]:
# len(model.feature_importances_)
for i, f in enumerate(model.feature_importances_, start= 1):
    print(i, f)

1 0.5044983661114137
2 0.04581964817576289
3 0.08109533288757544
4 0.19069513971428564
5 0.04097840396617039
6 0.022955697600745582
7 0.006398709736751993
8 0.027630134970997013
9 0.013584617465063375
10 0.0
11 0.002411073295078814
12 0.06393287607615523


In [12]:
y_pred = model.predict(x_test)

- Evaluate the model using performance metrics like
mean squared error (MSE) and R-squared.

In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score

In [15]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

MAE: 1141670.92
MSE: 12757508192445.38
RMSE: 3571765.42
R² Score: 0.82


- Experiment with multiple models (e.g., Decision Trees,
Random Forest) and compare performance.

In [16]:
model_dt = sklearn.tree.DecisionTreeRegressor()
model_dt.fit(x_train, y_train)

In [17]:
y_pred = model_dt.predict(x_test)

In [18]:
mae_dt = mean_absolute_error(y_test, y_pred)
mse_dt = mean_squared_error(y_test, y_pred)
rmse_dt = root_mean_squared_error(y_test, y_pred)
r2_dt = r2_score(y_test, y_pred)


print(f"MAE: {mae_dt:.2f}")
print(f"MSE: {mse_dt:.2f}")
print(f"RMSE: {rmse_dt:.2f}")
print(f"R² Score: {r2_dt:.2f}")

MAE: 1515188.26
MSE: 20404959381952.53
RMSE: 4517184.90
R² Score: 0.71


In [20]:
# comparing the  both.
print(f"mse of randomforest {mse} and mse of decision tree {mse_dt}")
print(f"mae of randomforest {mae} and mae of decision tree {mae_dt}")
print(f"rmse of randomforest {rmse} and rmse of decision tree {rmse_dt}")
print(f"r2_score of randomforest {r2} and r2_score of decision tree {r2_dt}")

mse of randomforest 12757508192445.379 and mse of decision tree 20404959381952.527
mae of randomforest 1141670.9242640857 and mae of decision tree 1515188.264720339
rmse of randomforest 3571765.416771569 and rmse of decision tree 4517184.895701363
r2_score of randomforest 0.8212076668015809 and r2_score of decision tree 0.7140311225605442


- Tools: Python, scikit-learn, pandas, matplotlib.