In [1]:
# --- IMPORTS ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

In [2]:
# --- TASK 1: DATA LOADING ---
df = pd.read_csv('stock_data.csv')
print("--- Data Shape ---")
print(df.shape)
print("\n--- First 5 Rows ---")
print(df.head())

--- Data Shape ---
(365, 6)

--- First 5 Rows ---
   Unnamed: 0     Stock_1     Stock_2    Stock_3     Stock_4     Stock_5
0  2020-01-01  101.764052  100.160928  99.494642   99.909756  101.761266
1  2020-01-02  102.171269   99.969968  98.682973  100.640755  102.528643
2  2020-01-03  103.171258   99.575237  98.182139  100.574847  101.887811
3  2020-01-04  105.483215   99.308641  97.149381  100.925017  101.490049
4  2020-01-05  107.453175   98.188428  99.575396  101.594411  101.604283


In [3]:
# --- TASK 2: DATA PREPROCESSING ---
print("\n--- Preprocessing ---")

df.dropna(inplace=True)


df = df.rename(columns={'Unnamed: 0': 'Date'})
df['Date'] = pd.to_datetime(df['Date'])


df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['DayOfWeek'] = df['Date'].dt.dayofweek

df['Competitor_Mean'] = df[['Stock_2', 'Stock_3', 'Stock_4', 'Stock_5']].mean(axis=1)

Q1 = df['Stock_1'].quantile(0.25)
Q3 = df['Stock_1'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_clean = df[(df['Stock_1'] >= lower_bound) & (df['Stock_1'] <= upper_bound)].copy()
print(f"Original Data Size: {len(df)}, After Outlier Removal: {len(df_clean)}")


feature_cols = ['Stock_2', 'Stock_3', 'Stock_4', 'Stock_5', 'Month', 'Day', 'DayOfWeek', 'Competitor_Mean']
X = df_clean[feature_cols]
y = df_clean['Stock_1']


--- Preprocessing ---
Original Data Size: 365, After Outlier Removal: 365


In [4]:
# --- TASK 3: PIPELINE CREATION ---
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(random_state=42))
])

## --- TASK 4: PRIMARY MODEL SELECTION ---
### Selected Algorithm: Random Forest Regressor
### Justification: Random Forest is selected because it handles non-linear relationships between stock prices well,
### is robust to noise/outliers, and does not require strict assumptions about data distribution (like Normality) compared to Linear Regression.

In [5]:
# --- TASK 5: MODEL TRAINING ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
print("\nModel successfully trained.")


Model successfully trained.


In [6]:
# --- TASK 6: CROSS-VALIDATION ---
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
print(f"\nCross-Validation R2 Scores: {cv_scores}")
print(f"Average R2 Score: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")


Cross-Validation R2 Scores: [0.9346795  0.95288927 0.92824246 0.96185399 0.879541  ]
Average R2 Score: 0.9314 (+/- 0.0286)


In [7]:
# --- TASK 7: HYPERPARAMETER TUNING ---
print("\n--- Hyperparameter Tuning ---")
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters found:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


--- Hyperparameter Tuning ---
Best Parameters found: {'model__max_depth': 10, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Best CV Score: 0.9330192769002567


In [8]:
# --- TASK 8: BEST MODEL SELECTION ---
best_model = grid_search.best_estimator_

In [9]:
# --- TASK 9: MODEL PERFORMANCE EVALUATION ---
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n--- Test Set Evaluation ---")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R2 Score: {r2:.4f}")


--- Test Set Evaluation ---
Mean Squared Error (MSE): 1.8556
Mean Absolute Error (MAE): 1.0209
R2 Score: 0.9614


In [10]:

joblib.dump(best_model, 'stock_model.pkl')
print("\nModel saved as 'stock_model.pkl'")


Model saved as 'stock_model.pkl'
