In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, precision_score, recall_score, f1_score


In [3]:
# Load dataset
df = pd.read_csv("nifty_500.csv")


In [5]:
# Data Cleaning
# Remove any leading spaces in column names
df.columns = df.columns.str.strip()


In [29]:
# Check existing columns and convert numerical columns from string to float
num_cols = ["Change", "Percentage Change", "Last Traded Price", "Open", "High", "Low", "Previous Close", "Volume"]
num_cols = [col for col in num_cols if col in df.columns]

for col in num_cols:
    df[col] = df[col].astype(str).str.replace(",", "", regex=True)  # Remove commas
    df[col] = df[col].replace(["-", " "], np.nan)  # Replace dashes and spaces with NaN
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric


In [31]:
# Drop unnecessary columns
drop_cols = ["Company Name", "Symbol"]
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

In [33]:
# Encode categorical variables if they exist
categorical_cols = ["Industry", "Series"]
categorical_cols = [col for col in categorical_cols if col in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [35]:
# Handle missing values
df.fillna(df.median(numeric_only=True), inplace=True)


In [39]:
# Ensure all feature columns are numeric before scaling
for col in feature_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert non-numeric to NaN

# Fill remaining NaNs with median to prevent errors
df[feature_columns] = df[feature_columns].fillna(df[feature_columns].median())

# Verify again if all columns are numeric
non_numeric_cols = df[feature_columns].select_dtypes(exclude=[np.number]).columns
if len(non_numeric_cols) > 0:
    print("Warning: Non-numeric columns found after conversion:", non_numeric_cols)

# Feature Scaling (should now work correctly)
scaled_features = scaler.fit_transform(df[feature_columns])


       'Industry_Construction Materials', 'Industry_Consumer Durables',
       'Industry_Consumer Services', 'Industry_Diversified',
       'Industry_Fast Moving Consumer Goods', 'Industry_Financial Services',
       'Industry_Forest Materials', 'Industry_Healthcare',
       'Industry_Information Technology',
       'Industry_Media Entertainment & Publication',
       'Industry_Metals & Mining', 'Industry_Oil Gas & Consumable Fuels',
       'Industry_Power', 'Industry_Realty', 'Industry_Services',
       'Industry_Telecommunication', 'Industry_Textiles', 'Series_EQ'],
      dtype='object')


In [63]:
# Feature Scaling with proper error handling
scaler = StandardScaler()
feature_columns = df.drop(columns=["Last Traded Price", "Percentage Change"], errors='ignore').columns

# Ensure there are no NaN values before scaling
df[feature_columns] = df[feature_columns].fillna(0)

try:
    scaled_features = scaler.fit_transform(df[feature_columns])
    df_scaled = pd.DataFrame(scaled_features, columns=feature_columns)
    df_scaled["Last Traded Price"] = df["Last Traded Price"].astype(float).values
    df_scaled["Percentage Change"] = df["Percentage Change"].astype(float).values
except Exception as e:
    print("Error in feature scaling:", e)
    df_scaled = None  # Prevent further errors if scaling fails

# Ensure df_scaled is not None before proceeding
if df_scaled is not None:
    # Regression Task
    X_reg = df_scaled.drop(columns=["Last Traded Price", "Percentage Change"])
    y_reg = df_scaled["Last Traded Price"]
    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

    model_reg = LinearRegression()
    model_reg.fit(X_train_reg, y_train_reg)
    y_pred_reg = model_reg.predict(X_test_reg)

    print("Regression Metrics:")
    print("MAE:", mean_absolute_error(y_test_reg, y_pred_reg))
    print("RMSE:", np.sqrt(mean_squared_error(y_test_reg, y_pred_reg)))


   


Regression Metrics:
MAE: 0.022348988817939706
RMSE: 0.030717989050757885


In [59]:
# Classification Task
df_scaled["Target"] = (df_scaled["Percentage Change"] > 0).astype(int)
X_cls = df_scaled.drop(columns=["Last Traded Price", "Percentage Change", "Target"])
y_cls = df_scaled["Target"]
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)

model_cls = SVC(kernel='linear')
model_cls.fit(X_train_cls, y_train_cls)
y_pred_cls = model_cls.predict(X_test_cls)

print("Classification Metrics:")
print("Accuracy:", accuracy_score(y_test_cls, y_pred_cls))
print("Precision:", precision_score(y_test_cls, y_pred_cls))
print("Recall:", recall_score(y_test_cls, y_pred_cls))
print("F1-score:", f1_score(y_test_cls, y_pred_cls))


Classification Metrics:
Accuracy: 0.8613861386138614
Precision: 0.7936507936507936
Recall: 0.9803921568627451
F1-score: 0.8771929824561403
