### P2

In [72]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
data = load_diabetes(as_frame=True)
print(data["DESCR"])
df = data["frame"]
df_train, df_test = train_test_split(df, test_size=0.2)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

#### Test independence hypothesis for every feature with target (10 tests total). Re- member the normality assumption! Don’t forget to account for multiple testing! Fit a linear regression model using features for which we reject the independence hypothesis. Measure RMSE on testing dataset.

In [73]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import f_regression
import statsmodels.api as sm
import numpy as np

In [74]:
features = df_train.drop(columns=['target'])
target = df_train['target']

In [75]:
stats, p_values = f_regression(features, target)

significant_features = features.columns[p_values * len(p_values) < 0.05]
significant_features.values

array(['age', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'],
      dtype=object)

In [76]:
X_train, Y_train = df_train[significant_features], df_train['target']
X_test, Y_test = df_test[significant_features], df_test['target']

model = LinearRegression().fit(X_train, Y_train)

Y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))

print('RMSE is: ', rmse)

RMSE is:  58.88387086315544


#### Train a regularized regression model with all features considered. Remember the normality assumption! Read the summary of your fit. Find the confidence intervals for every coefficient. Fit a new ordinary linear regression model excluding all features that have zero in the confidence interval. Measure RMSE on testing dataset.

In [77]:
X_train, Y_train = sm.add_constant(df_train.drop(columns=['target'])), df_train['target']

model = sm.OLS(Y_train, X_train).fit()
print(model.summary())

conf_intervals = model.conf_int()
exclude = conf_intervals[(conf_intervals[0] <= 0) & (conf_intervals[1] >= 0)].index.tolist()

print(conf_intervals)
print(exclude)

                            OLS Regression Results                            
Dep. Variable:                 target   R-squared:                       0.555
Model:                            OLS   Adj. R-squared:                  0.542
Method:                 Least Squares   F-statistic:                     42.71
Date:                Thu, 30 May 2024   Prob (F-statistic):           2.40e-54
Time:                        19:09:00   Log-Likelihood:                -1896.3
No. Observations:                 353   AIC:                             3815.
Df Residuals:                     342   BIC:                             3857.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        153.8780      2.824     54.489      0.0

In [78]:
if 'const' in exclude:
  exclude.remove('const')
exclude.append('target')

X_train, Y_train = df_train.drop(columns=exclude), df_train['target']
X_test, Y_test = df_test.drop(columns=exclude), df_test['target']

model = LinearRegression(fit_intercept=need_constant).fit(X_train, Y_train)

Y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))

print('RMSE of test set: ', rmse)

RMSE of test set:  59.25290437223776


### P3

In [79]:
from sklearn.datasets import load_wine
data = load_wine(as_frame=True)
df = data["frame"]
X = df["color_intensity"]
Y = df["hue"]
Z = df["flavanoids"]

In [80]:
import pandas as pd
import scipy.stats as stats

#### Find ρ(X, Y ) and p-value with SciPy’s built-in test

In [81]:
_r, p_value = stats.pearsonr(X, Y)
print('Pearson correlation is: ', _r)
print('p-value is:', p_value)

Pearson correlation is:  -0.5218131932287576
p-value is: 8.075008429978309e-14


#### Find ρ(X, Y ) with formula and p-value from the CDF of the t-distribution

In [82]:
_r_manual = (np.sum((X - np.mean(X)) * (Y - np.mean(Y)))) / (np.sqrt(np.sum((X - np.mean(X)) ** 2) * np.sum((Y - np.mean(Y)) ** 2)))
n = len(X)
t_stat = _r_manual * np.sqrt((n - 2) / (1 - _r_manual ** 2))
p_value_manual = 2 * (1 - stats.t.cdf(abs(t_stat), df=n - 2))
print('Manually calculated Pearson correlation coefficient is:', _r_manual)
print('Manually calculated p-value is:', p_value_manual)

Manually calculated Pearson correlation coefficient is: -0.5218131932287576
Manually calculated p-value is: 8.08242361927114e-14


#### Find ρS(X,Z) and p-value with ‘scipy.stats.spearmanr‘

In [83]:
spearman_r, spearman_p_value = stats.spearmanr(X, Z)
print('Spearman\'s rank correlation is: ', spearman_r)
print('p-value is: ', spearman_p_value)

Spearman's rank correlation is:  -0.04291038821273014
p-value is:  0.5695430180550238


#### Find τ(X,Z) and p-value with ‘scipy.stats.kendalltau‘

In [84]:
tau, p_value_tau = stats.kendalltau(X, Z)
print('Kendall\'s tau is: ', tau)
print('p-value is: ', p_value_tau)

Kendall's tau is:  0.028674293665247572
p-value is:  0.5712785725826517
