# Stock Price Prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# 1. Synthetic stand-in data (no real input files are available).
# The global NumPy seed is pinned so every run draws the same two series.
np.random.seed(42)
dates = pd.date_range('2020-01-01', periods=200)

# Draw order matters for reproducibility: the price shocks are sampled
# first, the fundamental values second.
price_path = 100 + np.cumsum(np.random.randn(200))  # random walk around 100
fundamental_values = 50 + 10 * np.random.randn(200)  # noise centred on 50, std 10

stock_df = pd.DataFrame({'Date': dates, 'Price': price_path})
data_df = pd.DataFrame({'Date': dates, 'Fundamental_Value': fundamental_values})

# NOTE: nothing is read from disk here; the message below refers to the
# generated frames.
print("Datasets loaded successfully!")

## Data Preprocessing

In [None]:
# Join prices and fundamentals on the calendar date. `validate` makes pandas
# raise a MergeError when either frame contains duplicate dates instead of
# silently multiplying rows. Sorting guarantees chronological order, which
# the diff/shift steps below rely on.
df = pd.merge(stock_df, data_df, on='Date', validate='one_to_one')
df = df.sort_values('Date').reset_index(drop=True)

# Feature: day-over-day change of the fundamental value.
df['Fundamental_Change'] = df['Fundamental_Value'].diff()

# Forecasting target: the NEXT day's price, so the feature observed on day t
# is paired with a price that is not yet known on day t.
df['Target_Price'] = df['Price'].shift(-1)

# The first row has no previous day (diff is NaN) and the last row has no
# next-day price (shift is NaN). Drop those rows rather than filling with 0,
# which would fabricate a "no change" observation and a price of zero.
df = df.dropna(subset=['Fundamental_Change', 'Target_Price']).reset_index(drop=True)

# Fit the scaler on the chronologically earliest rows only, so the mean and
# variance used for standardization never include the hold-out period.
# The train size is "all rows minus ceil(20%)"; keep TEST_FRACTION equal to
# the `test_size` used when the data is split.
TEST_FRACTION = 0.2
n_train = len(df) - int(np.ceil(TEST_FRACTION * len(df)))

scaler = StandardScaler()
scaler.fit(df[['Fundamental_Change']].iloc[:n_train])
df[['Fundamental_Change']] = scaler.transform(df[['Fundamental_Change']])

# Single-feature design matrix and next-day price target.
X = df[['Fundamental_Change']]
y = df['Target_Price']

## Model Building

In [None]:
# Chronological hold-out: with shuffle=False the first 80% of rows become the
# training set and the last 20% the test set, so the model is never fitted on
# observations that come after the ones it is evaluated on. This also makes
# the split deterministic (the shuffled version had no random_state).
# Assumes the rows of X and y are already in date order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Ordinary least squares on the training rows only.
model = LinearRegression()
model.fit(X_train, y_train)

print("Model trained.")

## Evaluation

In [None]:
# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)

# Coefficient of determination and mean squared error of the predictions
# against the held-out targets.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# One line per metric, followed by a reminder on how to read the R2 value.
print(
    f"MSE: {mse}",
    f"R-squared: {r2}",
    "Interpretation: If R2 is high here, it's likely due to data leakage!",
    sep="\n",
)

## Visualizing Results

In [None]:
# Price and the scaled feature live on very different numeric ranges, so each
# series gets its own y-axis (shared x-axis). On a single axis the feature
# would be squashed into a flat line near the bottom of the price scale.
fig, price_ax = plt.subplots(figsize=(12, 6))
price_line = price_ax.plot(df['Date'], df['Price'], label='Actual Stock Price', color='blue')
price_ax.set_xlabel("Date")
price_ax.set_ylabel("Price")

feature_ax = price_ax.twinx()
feature_line = feature_ax.plot(
    df['Date'], df['Fundamental_Change'],
    label='Input Feature (Scaled)', color='red', alpha=0.5,
)
feature_ax.set_ylabel("Fundamental change (scaled)")

price_ax.set_title("Stock Price vs. Scaled Fundamental Change")
# The two lines belong to different axes; pass both handles so that a single
# legend lists them together.
price_ax.legend(handles=price_line + feature_line)
plt.show()

# Predicted vs. actual on the held-out rows. Points on the dashed identity
# line are perfect predictions; vertical distance from it is the error.
plt.figure(figsize=(10, 5))
plt.scatter(y_test, y_pred, label="Test observations")
axis_min = min(y_test.min(), y_pred.min())
axis_max = max(y_test.max(), y_pred.max())
plt.plot([axis_min, axis_max], [axis_min, axis_max],
         linestyle='--', color='gray', label="Perfect prediction")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
# The title stays neutral about how the split was made, so it remains correct
# whichever splitting strategy produced y_test.
plt.title("Actual vs Predicted Price (Test Set)")
plt.legend()
plt.show()