# EV Stock Prediction Model
## Machine Learning for Predicting Good EV Stock Picks

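This notebook builds a machine learning model to predict whether an electric vehicle (EV) stock is a good pick, using Tesla (TSLA) as the example. A "good pick" is defined here as an expected return of more than 2% over the following 5 trading days. The model is trained on historical stock market data and technical indicators derived from it.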

## 1. Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import warnings
# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Plot defaults shared by all figures in this notebook.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

ImportError: DLL load failed while importing _pocketfft_umath: An Application Control policy has blocked this file.

## 2. Load and Explore EV Stock Data

In [None]:
# Load the stock market data.
# The feature-engineering cell selects columns with (field, ticker) tuples such as
# ('Close', 'TSLA'), so the frame must carry two-level column labels. Reading the
# first two lines as a column MultiIndex provides that; the earlier
# skiprows=[0, 2] left only one flat header row, which makes those lookups fail.
# NOTE(review): assumes the CSV starts with a field-name row, then a ticker row,
# then a row that only names the date index - confirm against the actual file.
data_path = '../sql/stock_market_data.csv'
df = pd.read_csv(data_path, header=[0, 1], index_col=0, parse_dates=True)

# Fail here, with a readable message, rather than with a bare KeyError later on.
assert ('Close', 'TSLA') in df.columns, (
    f"Expected a ('Close', 'TSLA') column in {data_path}; "
    f"found columns such as {list(df.columns[:5])}"
)

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head(10))
print("\nData Types:")
print(df.dtypes)
print("\nBasic Statistics:")
print(df.describe())

## 3. Data Preprocessing and Feature Engineering

In [None]:
# Extract TSLA (Tesla - EV stock) data for our model.
# Columns are addressed by (field, ticker) tuples; non-numeric cells become NaN.
tsla_close = pd.to_numeric(df[('Close', 'TSLA')], errors='coerce')
tsla_high = pd.to_numeric(df[('High', 'TSLA')], errors='coerce')
tsla_low = pd.to_numeric(df[('Low', 'TSLA')], errors='coerce')
tsla_open = pd.to_numeric(df[('Open', 'TSLA')], errors='coerce')
tsla_volume = pd.to_numeric(df[('Volume', 'TSLA')], errors='coerce')

# Create a dataframe with TSLA data
tsla_data = pd.DataFrame({
    'Close': tsla_close,
    'High': tsla_high,
    'Low': tsla_low,
    'Open': tsla_open,
    'Volume': tsla_volume
})

# Remove rows with missing values
tsla_data = tsla_data.dropna()

# Feature Engineering - Create technical indicators
# 1. Simple Moving Averages
tsla_data['SMA_5'] = tsla_data['Close'].rolling(window=5).mean()
tsla_data['SMA_20'] = tsla_data['Close'].rolling(window=20).mean()

# 2. Exponential Moving Average
tsla_data['EMA_12'] = tsla_data['Close'].ewm(span=12).mean()

# 3. Momentum (Price change over 5 days)
tsla_data['Momentum'] = tsla_data['Close'].pct_change(5)

# 4. Volatility (Standard deviation of returns)
tsla_data['Volatility'] = tsla_data['Close'].pct_change().rolling(window=5).std()

# 5. RSI (Relative Strength Index - simplified: plain 5-row rolling means
#    of the gains and of the losses)
delta = tsla_data['Close'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=5).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=5).mean()
rs = gain / loss  # inf when the window has no losses (RSI becomes 100), NaN when flat
tsla_data['RSI'] = 100 - (100 / (1 + rs))

# 6. MACD (Moving Average Convergence Divergence): 12-span EMA minus 26-span EMA
ema_12 = tsla_data['EMA_12']  # same series as computed for the EMA_12 feature
ema_26 = tsla_data['Close'].ewm(span=26).mean()
tsla_data['MACD'] = ema_12 - ema_26

# 7. High-Low Range
tsla_data['HL_Range'] = (tsla_data['High'] - tsla_data['Low']) / tsla_data['Close']

# 8. Volume change
tsla_data['Volume_Change'] = tsla_data['Volume'].pct_change()

# Target variable: binary classification (1 = good pick, 0 = not a good pick).
# A 'good pick' is a row whose close 5 rows later is more than 2% above its own close.
# The return is measured from the current close to the close `horizon` rows ahead;
# shift(-5).pct_change() would instead give the one-row change five rows ahead.
horizon = 5
threshold_pct = 2
future_return = (tsla_data['Close'].shift(-horizon) / tsla_data['Close'] - 1) * 100
tsla_data['Target'] = (future_return > threshold_pct).astype(int)

# The final `horizon` rows have no known outcome yet. Their NaN return compares as
# False and would be labelled 0, so remove them explicitly - dropna() alone cannot
# see them once the comparison has been cast to int.
tsla_data = tsla_data.loc[future_return.notna()]

# Drop rows with NaN (created by rolling operations). Ratios such as pct_change()
# can also produce +/-inf when the previous value is 0; treat those as missing too
# so they never reach the scaler.
tsla_data = tsla_data.replace([np.inf, -np.inf], np.nan).dropna()

print("Feature-engineered data shape:", tsla_data.shape)
print("\nFeatures created:")
print(tsla_data.columns.tolist())
print("\nTarget variable distribution:")
print(tsla_data['Target'].value_counts())

## 4. Train-Test Split

In [None]:
# Select features for the model
feature_columns = ['SMA_5', 'SMA_20', 'EMA_12', 'Momentum', 'Volatility', 
                   'RSI', 'MACD', 'HL_Range', 'Volume_Change']

X = tsla_data[feature_columns]
y = tsla_data['Target']

# Split into train and test sets (80-20 split), keeping row order.
# The rows form a time series, so the test set must be the most recent 20%:
# a shuffled split would train on rows that come after the ones being tested.
# (stratify cannot be combined with shuffle=False, so it is not used.)
# NOTE(review): the target looks several rows ahead, so the last few training
# rows still overlap the start of the test period - consider leaving a gap.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Normalize features. The scaler is fitted on the training rows only so that
# test-set statistics do not leak into training; the same fitted transform is
# then applied to the test rows and to the full feature matrix.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)  # full matrix, used by later cells

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")
print(f"\nTraining set target distribution:")
print(pd.Series(y_train).value_counts())
print(f"\nTesting set target distribution:")
print(pd.Series(y_test).value_counts())

## 5. Build and Train the ML Model

In [None]:
# Fit a Random Forest classifier on the training split.
# Hyperparameters are collected in one place so they are easy to read and tune;
# random_state fixes the seed and n_jobs=-1 uses all available cores.
forest_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**forest_params)

print("Training the Random Forest model...")
rf_model.fit(X_train, y_train)
print("Model training complete!")

# Predicted class labels for both splits; reused by the evaluation cell
y_train_pred, y_test_pred = (rf_model.predict(split) for split in (X_train, X_test))

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("\nModel training accuracy:", train_accuracy)
print("Model testing accuracy:", test_accuracy)

## 6. Model Evaluation and Performance Metrics

In [None]:
# Calculate detailed performance metrics on the held-out test split
print("=" * 60)
print("MODEL PERFORMANCE METRICS - TEST SET")
print("=" * 60)

# zero_division=0 states explicitly what is reported when a metric is undefined
# (for example, no positive predictions): 0, the same value as the default.
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, zero_division=0)
recall = recall_score(y_test, y_test_pred, zero_division=0)
f1 = f1_score(y_test, y_test_pred, zero_division=0)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")

# Confusion Matrix.
# labels=[0, 1] pins the matrix to 2x2 (rows = true class, columns = predicted
# class). Without it, a test split containing a single class yields a 1x1
# matrix and the cell lookups below raise IndexError.
cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
print("\nConfusion Matrix:")
print(cm)
print(f"\nTrue Negatives:  {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives:  {tp}")

# Calculate ROC AUC from the predicted probability of the positive class
y_test_proba = rf_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
roc_auc = auc(fpr, tpr)
print(f"\nROC AUC Score: {roc_auc:.4f}")

## 7. Make Predictions on the Most Recent Data

In [None]:
# Score the trailing 20 rows of the scaled feature matrix with the trained model
recent_data = X_scaled[-20:]
recent_probabilities = rf_model.predict_proba(recent_data)[:, 1]
recent_predictions = rf_model.predict(recent_data)

# Assemble the results table; Confidence rescales the distance of the
# probability from 0.5 onto a 0-1 range
results_df = pd.DataFrame({
    'Prediction': recent_predictions,
    'Probability_Good_Pick': recent_probabilities,
})
results_df['Confidence'] = np.abs(recent_probabilities - 0.5) * 2

banner = "=" * 60
print(banner)
print("RECENT EV STOCK (TSLA) PREDICTIONS - LAST 20 TRADING DAYS")
print(banner)
print("\nPrediction: 1 = Good Pick (Expected >2% return), 0 = Not a Good Pick")
print("\n", results_df.to_string())

# Summary: how many of the scored rows were classified as positive
good_picks = (recent_predictions == 1).sum()
total_predictions = len(recent_predictions)
mean_confidence = results_df['Confidence'].mean()
print(f"\n\nSummary: {good_picks}/{total_predictions} recent days classified as 'Good Picks'")
print(f"Average confidence score: {mean_confidence:.4f}")

## 8. Visualize Results

In [None]:
# Draw a 2x2 dashboard: feature importance, confusion matrix, ROC curve and
# the distribution of recent prediction probabilities
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax_importance, ax_confusion = axes[0]
ax_roc, ax_hist = axes[1]

# 1. Feature Importance (sorted so the most important feature is drawn on top)
feature_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

ax_importance.barh(
    feature_importance['Feature'],
    feature_importance['Importance'],
    color='steelblue',
)
ax_importance.set(
    xlabel='Importance Score',
    title='Feature Importance in EV Stock Prediction Model',
)
ax_importance.invert_yaxis()

# 2. Confusion Matrix Heatmap
class_names = ['Not Good Pick', 'Good Pick']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax_confusion,
    xticklabels=class_names,
    yticklabels=class_names,
)
ax_confusion.set(
    ylabel='True Label',
    xlabel='Predicted Label',
    title='Confusion Matrix - Test Set',
)

# 3. ROC Curve, with the diagonal of a random classifier for reference
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
ax_roc.set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve',
)
ax_roc.legend(loc="lower right")

# 4. Prediction Distribution, with the 0.5 decision threshold marked
ax_hist.hist(recent_probabilities, bins=15, color='green', alpha=0.7, edgecolor='black')
ax_hist.axvline(x=0.5, color='red', linestyle='--', label='Decision Threshold')
ax_hist.set(
    xlabel='Probability of Good Pick',
    ylabel='Frequency',
    title='Distribution of Prediction Probabilities (Recent 20 Days)',
)
ax_hist.legend()

plt.tight_layout()
plt.show()

print("\nVisualization complete!")