###### Let's start with importing what we need...  

In [None]:
import pandas as pd # type: ignore
import statsmodels.api as sm # type: ignore
from statsmodels.formula.api import ols # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.metrics import r2_score # type: ignore

###### Let's load the publicly available diabetes dataset and print out a description of the dataset

###### Your task is to build the best linear regression model you can using this data to predict the 'target' field.

#### Diabetes dataset

Ten baseline variables, age, sex, body mass index, average blood pressure, and six blood serum measurements were obtained for each of n = 442 diabetes patients, as well as the response of interest, a quantitative measure of disease progression one year after baseline.
Data Set Characteristics
Number of Instances: 442
Number of Attributes
First 10 columns are numeric predictive values
Target: Column 11 is a quantitative measure of disease progression one year after baseline
Attribute Information
 - age:     age in years <br>
  - sex:     sex <br>
  - bmi:     body mass index <br>
  - bp:      average blood pressure <br>
  - s1:      tc, total serum cholesterol <br>
  - s2:      ldl, low-density lipoproteins <br>
  - s3:      hdl, high-density lipoproteins <br>
  - s4:      tch, total cholesterol / HDL <br>
  - s5:      ltg, possibly log of serum triglycerides level <br>
  - s6:      glu, blood sugar level <br>
Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times n_samples (i.e. the sum of squares of each column totals 1).
Source URL
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html
For more information see:
Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499. (https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)


Source URL: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html
Data URL: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt
Note: The Data URL mentioned above was obtained from the source URL. The source URL provides detailed information about the dataset and its variables, as well as reference links, including the link to the dataset itself.



##### Read in data into a dataframe then print the dataframe head.

In [None]:
 #Use this URL to read in the data into a pandas dataframe called "df".
# The file is tab-delimited, so the separator has to be passed explicitly (sep="\t").
# YOUR CODE HERE
data_url = 'https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt'
df = pd.read_csv(data_url, sep="\t")
df.head()  # preview the first rows of the loaded frame

###### Basic field information

In [None]:
# Print a concise summary of df: column names, non-null counts and dtypes.
df.info()

###### Convert sex to a categorical variable

In [None]:
# YOUR CODE HERE
# Re-type the SEX column as a pandas categorical, replacing the column in place.
sex_as_category = df['SEX'].astype('category')
df['SEX'] = sex_as_category


In [None]:
# Re-check the column summary; SEX should now be listed with the category dtype.
df.info()

###### Next, examine the dataframe

In [None]:
# Peek into the dataframe with pandas' describe().
# include="all" makes describe() report every column, including those for
# which some statistics are not appropriate for the datatype.
# The result is stored in "dfDescription" so that it can be printed.
# YOUR CODE HERE
dfDescription = df.describe(include="all")
print(dfDescription)

###### Split dataframe into train and test subsets

In [None]:
# Split df into train and test subsets with train_test_split():
# 30% of the rows are held out for testing (test_size=0.3) and
# random_state=42 fixes the shuffle so the split is reproducible.
# The resulting frames must be called "df_train" and "df_test".
# YOUR CODE HERE
split_options = {"test_size": 0.3, "random_state": 42}
df_train, df_test = train_test_split(df, **split_options)

###### Fit Multilinear OLS regression model using training dataset and save the result in 'est_train' variable. 
Print model summary

In [None]:
# YOUR CODE HERE
# Fit the full multilinear OLS model: Y regressed on every other column of
# the training frame.
#
# The formula interface is used instead of sm.OLS on a hand-built design
# matrix because SEX was converted to a category dtype earlier. Passing a
# mixed-dtype frame straight to sm.OLS relies on implicit array casting of
# the categorical column (which, depending on the pandas/statsmodels
# versions, may be rejected), whereas the formula interface encodes
# categorical columns explicitly and adds the intercept itself.
# NOTE(review): assumes every column name is a valid formula identifier.
predictors = [column for column in df_train.columns if column != 'Y']
full_formula = 'Y ~ ' + ' + '.join(predictors)
est_train = ols(formula=full_formula, data=df_train).fit()
print(est_train.summary())

###### Drop the non-significant coefficients (keep those with p < .05: SEX + BMI + S3 + S5), then rerun the model.

In [None]:
# YOUR CODE HERE
# Refit on the training data using only SEX, BMI, S3 and S5 as predictors.
# df_train / df_test from the split cell above are reused as-is: repeating
# train_test_split with the same test_size and random_state would only
# reproduce the identical split, and no separate design matrix is needed
# because the formula selects the columns directly from df_train.
est_train = ols(formula="Y ~  SEX + BMI  + S3 + S5 ", data=df_train).fit()
print(est_train.summary())

###### How well does it do on the test data? Let's use the model we trained on the training data to make predictions on the test data and then measure the out-of-sample R^2.

In [None]:
# Score the fitted model on the held-out rows; the result must be stored in "r2".
# YOUR CODE HERE
test_pred = est_train.predict(df_test)
r2 = r2_score(y_true=df_test['Y'], y_pred=test_pred)
print(f'OOS R-squared: {r2}')