# Regression assumptions

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [3]:
# Load the wine dataset into a DataFrame with named feature columns
wine_data = load_wine()
df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)

# Response is 'alcohol'; predictors are all remaining features plus an
# intercept column added by statsmodels
y = df['alcohol']
X = sm.add_constant(df.drop(columns='alcohol'))

# Fit ordinary least squares, then keep the fitted values and residuals
# that the diagnostic cells below plot
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
residuals = model.resid

In [None]:
# 1. Linearity: observed response values against the model's predictions
fig, ax = plt.subplots()
ax.scatter(y, predictions)
ax.set(
    xlabel="Observed values",
    ylabel="Predicted values",
    title="Observed vs. Predicted Values",
)
plt.show()

In [None]:
# 2. Independence: residuals plotted in row order (0, 1, 2, ...)
fig, ax = plt.subplots()
row_order = np.arange(len(residuals))
ax.scatter(row_order, residuals)
ax.set(xlabel="Order", ylabel="Residuals", title="Residuals vs. Order")
plt.show()

In [None]:
# 3. Homoscedasticity: residuals against the predicted values
fig, ax = plt.subplots()
ax.scatter(predictions, residuals)
ax.set(
    xlabel="Predicted values",
    ylabel="Residuals",
    title="Predicted vs. Residuals",
)
plt.show()

In [None]:
# 4. Normality of errors: QQ-plot
# The residuals are on the scale of the response, not standardized, so the
# identity line (line='45') is not a valid reference for them. line='s' draws
# the reference line scaled by the sample standard deviation and shifted by
# the sample mean, which is also what the other QQ-plots in this notebook use.
sm.qqplot(residuals, line='s')
plt.title("QQ-plot of Residuals")
plt.show()

In [None]:
# 5. No Multicollinearity: VIF (Variance Inflation Factor)
# One VIF per column of X. When X is the design matrix from the model cell it
# still holds the 'const' column, so a row is printed for the intercept too.
design = X.values
vif_scores = [
    variance_inflation_factor(design, col_idx)
    for col_idx in range(design.shape[1])
]
vif_data = pd.DataFrame({"Variable": X.columns, "VIF": vif_scores})
print(vif_data)

# QQ-plot for residual error

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load the California housing dataset and attach the target as a column
california_data = fetch_california_housing()
california_df = pd.DataFrame(california_data.data, columns=california_data.feature_names)
california_df['MedHouseVal'] = california_data.target

# Define independent (X) and dependent (y) variables
X = california_df['MedInc']  # Median Income
y = california_df['MedHouseVal']  # Median house value

# Add a constant to X so the fitted model includes an intercept
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Get the residuals
residuals = model.resid

# Create QQ-plot for the residuals.
# qqplot opens its own figure unless it is handed an Axes, so calling
# plt.figure(figsize=...) first leaves an empty figure behind and the sizing
# never reaches the QQ-plot. Create the Axes explicitly and pass it in.
fig, ax = plt.subplots(figsize=(8, 6))
qqplot(residuals, line='s', ax=ax)
ax.set_title("QQ-plot of Residuals for California Housing Model")
plt.show()


In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot

# Generate a simple dataset
np.random.seed(42)  # Setting a seed for reproducibility
X = 2.5 * np.random.randn(100) + 1.5   # 100 draws: standard normal scaled by 2.5, shifted by 1.5
res = 0.5 * np.random.randn(100)       # 100 noise terms: standard normal scaled by 0.5
y = 2 + 0.3 * X + res                  # Response: intercept 2, slope 0.3, plus noise

# Convert X to a 2D column array
X_2D = X.reshape(-1, 1)

# Add a constant to X so the fitted model includes an intercept
X_const = sm.add_constant(X_2D)

# Fit the linear regression model
model = sm.OLS(y, X_const).fit()

# Get the residuals
residuals = model.resid

# Get the fitted values (predictions)
y_pred = model.predict(X_const)

# Scatter plot of data and regression line
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X, y, label='Data', alpha=0.5)
ax.plot(X, y_pred, color='red', label='Regression Line')
ax.set_xlabel('X values')
ax.set_ylabel('y values')
ax.set_title('Synthetic Data and Fitted Regression Line')
ax.legend()
plt.show()

# Create QQ-plot for the residuals.
# qqplot opens its own figure unless it is handed an Axes, so calling
# plt.figure(figsize=...) first leaves an empty figure behind and the sizing
# never reaches the QQ-plot. Create the Axes explicitly and pass it in.
fig, ax = plt.subplots(figsize=(8, 6))
qqplot(residuals, line='s', ax=ax)
ax.set_title("QQ-plot of Residuals for Simple Synthetic Dataset")
plt.show()


# Multivariate analysis 1

In [11]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Load seaborn's 'tips' example dataset; the cells below read it as `tips`
tips = sns.load_dataset('tips')

In [None]:
# Preview the leading rows of the dataset to understand its structure
print("First few rows of the dataset:")
tips.head(5)

In [None]:
# Summary statistics of the dataset, shown as the cell's output
print("\nBasic statistics of the dataset:")
summary_stats = tips.describe()
summary_stats

In [None]:
# Side-by-side histograms (with KDE overlay) of 'total_bill' and 'tip'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    ('total_bill', 'Distribution of Total Bill'),
    ('tip', 'Distribution of Tips'),
]
for ax, (column, panel_title) in zip(axes, panels):
    sns.histplot(tips[column], kde=True, ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

# Scatter plot of 'tip' against 'total_bill' to see their relationship visually
fig, ax = plt.subplots()
sns.scatterplot(data=tips, x='total_bill', y='tip', ax=ax)
ax.set_title("Scatter plot between 'total_bill' and 'tip'")
plt.show()

# 2x2 covariance matrix of the two columns; the off-diagonal entry is their covariance
bill_values = tips['total_bill']
tip_values = tips['tip']
cov_matrix = np.cov(bill_values, tip_values)
cov_value = cov_matrix[0, 1]

print(f"Covariance matrix: {cov_matrix}")
print(f"Covariance between 'total_bill' and 'tip': {cov_value:.2f}")

In [None]:
import numpy as np
import seaborn as sns
from numpy.linalg import inv

# Reload the tips dataset
tips = sns.load_dataset('tips')

# Design matrix: a column of ones (intercept) next to the total_bill values
n_obs = len(tips)
X = np.column_stack((np.ones(n_obs), tips['total_bill']))

# Hat matrix H = X (X'X)^-1 X'
gram_inverse = inv(X.T @ X)
H = (X @ gram_inverse) @ X.T

# H has one row and one column per observation, so print only its top-left 5x5 corner
print(H[:5, :5])

In [17]:
# Print the dimensions of the hat matrix H
print(H.shape)

(244, 244)


In [None]:
# Idempotency check: the hat matrix is a projection, so H @ H should equal H.
# `*` multiplies arrays element by element; the matrix product needs `@`.
resul = H @ H
rest = resul - H
print(rest)

# The entries of `rest` are floating-point round-off rather than exact zeros,
# so also report the largest deviation and an explicit tolerance-based verdict.
print(f"Max |H @ H - H|: {np.abs(rest).max():.2e}")
print(f"H is idempotent (within tolerance): {np.allclose(resul, H)}")