<a href="https://colab.research.google.com/github/hansrajmina/Assignment_phase5/blob/main/project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import scipy.stats as stats
# Read the Yes Bank price history from the CSV next to the notebook and preview the first rows.
df = pd.read_csv('data_YesBank_StockPrices.csv')
df.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Per-column dtype and non-null count summary of the loaded frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics for the columns of the loaded frame.
df.describe()

In [None]:
# Count of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Correlation matrix restricted to the numeric columns.
numeric_columns = df.select_dtypes(include='number')
numeric_columns.corr()


In [None]:
# Pairwise correlations between the four price columns, drawn as an annotated heatmap.
price_columns = ['Open', 'High', 'Low', 'Close']
corr_matrix = df[price_columns].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,       # print the coefficient inside every cell
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
)

# Title and layout
plt.title('Correlation Heatmap of Yes Bank Stock Data')
plt.tight_layout()
plt.show()

In [None]:

plt.figure(figsize=(10, 6))

# Overlay one filled density curve per price column.
# `fill=True` is the supported spelling; the older `shade=True` keyword of
# sns.kdeplot is deprecated and emits a warning on current seaborn releases.
for column in ['Open', 'High', 'Low', 'Close']:
    sns.kdeplot(df[column], label=column, fill=True)

plt.title('KDE Plot of Yes Bank Stock Prices')
plt.xlabel('Price')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt

# Probability (Q-Q) plot: quantiles of the 'Close' column against a normal distribution.
close_prices = df['Close']

plt.figure(figsize=(6, 6))
stats.probplot(close_prices, dist='norm', plot=plt)
# Set the title after probplot so this text is the one shown on the axes.
plt.title('Q-Q Plot of Close Prices')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Parse the 'Mon-YY' date strings into timestamps, then order the rows by date.
df = (
    df
    .assign(Date=pd.to_datetime(df['Date'], format='%b-%y'))
    .sort_values('Date')
)

# Closing price against time, one marker per row.
plt.figure(figsize=(12, 6))
plt.plot(df['Date'], df['Close'], marker='o', label='Close Price')
plt.title('Trend of Close Price Over Time')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Relationship between the opening and the closing price.
# Drawn as a scatter: plt.plot would join the points in row order rather than
# by Open value, producing a tangled path instead of showing the relationship.
plt.figure(figsize=(12, 6))
plt.scatter(df['Open'], df['Close'], label='Close Price')
plt.title('Close Price vs Open Price')
plt.xlabel('Open')
plt.ylabel('Close Price')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Relationship between the high and the closing price.
# Drawn as a scatter: plt.plot would join the points in row order rather than
# by High value, producing a tangled path instead of showing the relationship.
plt.figure(figsize=(12, 6))
plt.scatter(df['High'], df['Close'], label='Close Price')
plt.title('Close Price vs High Price')
plt.xlabel('High')
plt.ylabel('Close Price')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Relationship between the low and the closing price.
# Drawn as a scatter: plt.plot would join the points in row order rather than
# by Low value, producing a tangled path instead of showing the relationship.
plt.figure(figsize=(12, 6))
plt.scatter(df['Low'], df['Close'], label='Close Price')
plt.title('Close Price vs Low Price')
# Axis label capitalised to match the column name and the sibling plots.
plt.xlabel('Low')
plt.ylabel('Close Price')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
!pip install ydata-profiling


In [None]:
from ydata_profiling import ProfileReport

# Build an exploratory profiling report over the current contents of `df`.
profile = ProfileReport(
    df,
    title="Pandas Profiling Report",
    explorative=True,
)

# Render the report inline in the notebook output.
profile.to_notebook_iframe()

# Also write the same report to an HTML file in the working directory.
profile.to_file("pandas_profiling_report.html")


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Imported inside the cell so it runs without touching the shared import cell.
from sklearn.model_selection import TimeSeriesSplit

# Define features and target
features = ['Open', 'High', 'Low']
target = 'Close'

# Prepare data.
# shuffle=False keeps the row order, so the final 10% of rows become the test set.
# NOTE(review): this is a chronological hold-out only if `df` has already been
# sorted by 'Date' in an earlier cell — confirm the cells are run in order.
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

# Forward-chaining folds: every validation fold comes after the rows used for
# fitting, consistent with the unshuffled hold-out above. A plain integer `cv=5`
# would also validate on rows that precede the training rows.
cv_splitter = TimeSeriesSplit(n_splits=5)


def _score_predictions(model_label, y_true, y_pred):
    """Return one results row: the model label with its RMSE and R2 on `y_true`."""
    return {
        'Model': model_label,
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'R2': r2_score(y_true, y_pred),
    }


def _grid_search(estimator, param_grid):
    """Fit a GridSearchCV for `estimator` on the training split and return it."""
    search = GridSearchCV(
        estimator,
        param_grid,
        cv=cv_splitter,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search


# Store results (one dict per model)
results = []

# 1. Linear Regression (no hyperparameters to tune)
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
results.append(_score_predictions('Linear Regression', y_test, lr_pred))

# 2. Random Forest Regressor
rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5, None]
}
rf = RandomForestRegressor(random_state=42)
rf_grid = _grid_search(rf, rf_params)
rf_best = rf_grid.best_estimator_
rf_pred = rf_best.predict(X_test)
results.append(_score_predictions(f'Random Forest ({rf_grid.best_params_})', y_test, rf_pred))

# 3. Gradient Boosting Regressor
gbr_params = {
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [2, 3]
}
gbr = GradientBoostingRegressor(random_state=42)
gbr_grid = _grid_search(gbr, gbr_params)
gbr_best = gbr_grid.best_estimator_
gbr_pred = gbr_best.predict(X_test)
results.append(_score_predictions(f'Gradient Boosting ({gbr_grid.best_params_})', y_test, gbr_pred))

# Display all results
results_df = pd.DataFrame(results)
print(results_df)


In [None]:
# Hypothesis Tests
#
# 1. One-Sample t-test

In [None]:
from scipy.stats import ttest_1samp

# One-sample t-test: null hypothesis is that the mean 'Close' price equals 100.
hypothesized_mean = 100
t_stat, p_val = ttest_1samp(df['Close'], popmean=hypothesized_mean)
print(f"t-statistic: {t_stat:.4f}, p-value: {p_val:.4f}")


In [None]:
 Two-Sample t-test

In [None]:
from scipy.stats import ttest_ind

# 'Open' and 'Close' come from the same rows, so the two samples are paired
# rather than independent. The related-samples t-test compares them row by row;
# ttest_ind would treat them as unrelated groups.
from scipy.stats import ttest_rel

t_stat, p_val = ttest_rel(df['Open'], df['Close'])
print(f"t-statistic: {t_stat:.4f}, p-value: {p_val:.4f}")
