In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Предобработка данных

In [3]:
# Load the raw CSV into a DataFrame; the path is relative to the working
# directory (presumably a mounted Google Drive folder — confirm).
csv_path = "drive/MyDrive/smoking_driking_dataset_Ver01.csv"
dataset = pd.read_csv(csv_path)

In [4]:
# Print the frame's column names, non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 991346 entries, 0 to 991345
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   sex               991346 non-null  object 
 1   age               991346 non-null  int64  
 2   height            991346 non-null  int64  
 3   weight            991346 non-null  int64  
 4   waistline         991346 non-null  float64
 5   sight_left        991346 non-null  float64
 6   sight_right       991346 non-null  float64
 7   hear_left         991346 non-null  float64
 8   hear_right        991346 non-null  float64
 9   SBP               991346 non-null  float64
 10  DBP               991346 non-null  float64
 11  BLDS              991346 non-null  float64
 12  tot_chole         991346 non-null  float64
 13  HDL_chole         991346 non-null  float64
 14  LDL_chole         991346 non-null  float64
 15  triglyceride      991346 non-null  float64
 16  hemoglobin        99

In [5]:
# Drop exact duplicate rows, then recode the 'sex' and 'DRK_YN' labels as 0/1
# so the columns can be fed to a numeric model.
# Written as one chained, non-inplace expression: `dataset` is rebound exactly
# once and both columns are recoded in a single pass.
dataset = dataset.drop_duplicates().replace({
    'sex': {'Male': 0, 'Female': 1},
    'DRK_YN': {'N': 0, 'Y': 1},
})

# Линейная регрессия



## Одна переменная

In [6]:
# Fit a separate one-feature linear model for every column except the last one
# (used as the target) and rank the features by their test-set R^2.
results = {}
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
# Derive the number of feature columns from the frame instead of hard-coding 23.
for number in range(dataset.shape[1] - 1):
    X = dataset.iloc[:, [number]]  # keep 2-D shape: the model expects a frame
    feature_name = X.columns[0]
    # 70/30 split; the fixed seed makes the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, test_size=0.3, random_state=100
    )
    lreg = LinearRegression().fit(X_train, y_train)
    predict_train = lreg.predict(X_train)
    predict_test = lreg.predict(X_test)
    # Score the test set once so the printed and stored R^2 are the same value.
    r2_test = r2_score(y_test, predict_test)
    results[feature_name] = r2_test
    print(feature_name)
    print("MSE (training):", mean_squared_error(y_train, predict_train))
    print("MSE (testing):", mean_squared_error(y_test, predict_test))
    print("r^2 (training):", r2_score(y_train, predict_train))
    print("r^2 (testing):", r2_test, "\n")
# Best-scoring feature first, one (name, R^2) pair per line.
sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
print(*sorted_results, sep='\n')

sex
MSE (training): 0.21596167111791598
MSE (testing): 0.21612682900664468
r^2 (training): 0.1361532239000045
r^2 (testing): 0.13549248687859916 

age
MSE (training): 0.22981205723546444
MSE (testing): 0.22961458194386014
r^2 (training): 0.08075167355336155
r^2 (testing): 0.0815414628297082 

height
MSE (training): 0.21486826739347056
MSE (testing): 0.21506138979244238
r^2 (training): 0.14052683926169607
r^2 (testing): 0.13975424470702558 

weight
MSE (training): 0.23251557816543905
MSE (testing): 0.2325929096750212
r^2 (training): 0.06993758868641187
r^2 (testing): 0.06962814918900617 

waistline
MSE (training): 0.24765495104324844
MSE (testing): 0.24830450558814765
r^2 (training): 0.009380090751833214
r^2 (testing): 0.006781751208458697 

sight_left
MSE (training): 0.2485288042209412
MSE (testing): 0.24844436311283194
r^2 (training): 0.005884677670303318
r^2 (testing): 0.006222320982179896 

sight_right
MSE (training): 0.2487006539856207
MSE (testing): 0.24837857743666916
r^2 (traini

## Две переменные

In [7]:
# Empty the shared `results` dict so the ranking built in this section
# contains only this section's models.
results.clear()

### Рост и пол

In [8]:
# Linear model on two features (sex, height); stores its test R^2 in `results`.
feature_names = ["sex", "height"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["height + sex"] = r2_test

Index(['sex', 'height'], dtype='object')
coefficients:  [-0.20571654  0.01220422]
intercept:  -1.383778885660796
MSE (training): 0.2098326198054949
MSE (testing): 0.2100542280135262
r^2 (training): 0.16066943175012594
r^2 (testing): 0.1597828963889243


### Рост и курение

In [9]:
# Linear model on two features (height, smoking status); stores its test R^2 in `results`.
feature_names = ["height", "SMK_stat_type_cd"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["height + smoking"] = r2_test

Index(['height', 'SMK_stat_type_cd'], dtype='object')
coefficients:  [0.01453073 0.13759481]
intercept:  -2.078878967535772
MSE (training): 0.2049527550583919
MSE (testing): 0.20526731355765687
r^2 (training): 0.18018889280896955
r^2 (testing): 0.17893055857778306


### Рост и гемоглобин

In [10]:
# Linear model on two features (height, hemoglobin); stores its test R^2 in `results`.
feature_names = ["height", "hemoglobin"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["height + hemoglobin"] = r2_test

Index(['height', 'hemoglobin'], dtype='object')
coefficients:  [0.0161908  0.04414014]
intercept:  -2.7550848098690097
MSE (training): 0.21136059060982418
MSE (testing): 0.21159878513475053
r^2 (training): 0.15455754788452059
r^2 (testing): 0.15360466649548266


### Рост и возраст

In [11]:
# Linear model on two features (age, height); stores its test R^2 in `results`.
feature_names = ["age", "height"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["height + age"] = r2_test

Index(['age', 'height'], dtype='object')
coefficients:  [-0.00565447  0.01675866]
intercept:  -1.9497859069602446
MSE (training): 0.2094576207238392
MSE (testing): 0.2095454000080171
r^2 (training): 0.16216942823585356
r^2 (testing): 0.16181820887498144


### Рост и вес

In [12]:
# Linear model on two features (height, weight); stores its test R^2 in `results`.
feature_names = ["height", "weight"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["height + weight"] = r2_test

Index(['height', 'weight'], dtype='object')
coefficients:  [0.0192951  0.00100127]
intercept:  -2.693996272439764
MSE (training): 0.21478140402680346
MSE (testing): 0.21497828745909364
r^2 (training): 0.1408742927652189
r^2 (testing): 0.1400866541162049


### Рост и гамма-ГТП

In [13]:
# Linear model on two features (height, gamma-GTP); stores its test R^2 in `results`.
feature_names = ["height", "gamma_GTP"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["height + gamma_GTP"] = r2_test

Index(['height', 'gamma_GTP'], dtype='object')
coefficients:  [0.01890205 0.0014767 ]
intercept:  -2.621686750740441
MSE (training): 0.2094655661275408
MSE (testing): 0.20984983746967426
r^2 (training): 0.16213764661767605
r^2 (testing): 0.16060045875072404


### Пол и курение

In [14]:
# Linear model on two features (sex, smoking status); stores its test R^2 in `results`.
feature_names = ["sex", "SMK_stat_type_cd"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["sex + smoking"] = r2_test

Index(['sex', 'SMK_stat_type_cd'], dtype='object')
coefficients:  [-0.24856446  0.12491513]
intercept:  0.41549910399915896
MSE (training): 0.20916492703826484
MSE (testing): 0.20937773864156278
r^2 (training): 0.1633402031023352
r^2 (testing): 0.162488854493696


### Пол и гемоглобин

In [15]:
# Linear model on two features (sex, hemoglobin); stores its test R^2 in `results`.
feature_names = ["sex", "hemoglobin"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["sex + hemoglobin"] = r2_test

Index(['sex', 'hemoglobin'], dtype='object')
coefficients:  [-0.30569492  0.03013271]
intercept:  0.21433570596417623
MSE (training): 0.21470325066579604
MSE (testing): 0.2148990422629692
r^2 (training): 0.1411869062424076
r^2 (testing): 0.1404036349729697


### Пол и возраст

In [16]:
# Linear model on two features (sex, age); stores its test R^2 in `results`.
feature_names = ["sex", "age"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["sex + age"] = r2_test

Index(['sex', 'age'], dtype='object')
coefficients:  [-0.34863799 -0.00900976]
intercept:  1.0923890131525462
MSE (training): 0.19974922479453325
MSE (testing): 0.1998608349545126
r^2 (training): 0.20100301607216076
r^2 (testing): 0.20055647792074727


### Пол и вес

In [17]:
# Linear model on two features (sex, weight); stores its test R^2 in `results`.
feature_names = ["sex", "weight"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["sex + weight"] = r2_test

Index(['sex', 'weight'], dtype='object')
coefficients:  [-0.32573337  0.00301756]
intercept:  0.4615480532169444
MSE (training): 0.21501702955741545
MSE (testing): 0.2152080801391254
r^2 (training): 0.13993179054279958
r^2 (testing): 0.1391674831865206


### Пол и гамма-ГТП

In [18]:
# Linear model on two features (sex, gamma-GTP); stores its test R^2 in `results`.
feature_names = ["sex", "gamma_GTP"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["sex + gamma_GTP"] = r2_test

Index(['sex', 'gamma_GTP'], dtype='object')
coefficients:  [-0.33918084  0.0011764 ]
intercept:  0.6151341130523303
MSE (training): 0.2126733148726271
MSE (testing): 0.21295385373802164
r^2 (training): 0.1493066502763456
r^2 (testing): 0.14818439084665613


### Курение и гемоглобин

In [19]:
# Linear model on two features (smoking status, hemoglobin); stores its test R^2 in `results`.
feature_names = ["SMK_stat_type_cd", "hemoglobin"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["smoking + hemoglobin"] = r2_test

Index(['SMK_stat_type_cd', 'hemoglobin'], dtype='object')
coefficients:  [0.1655995  0.05583038]
intercept:  -0.5608597613709936
MSE (training): 0.21295060360280926
MSE (testing): 0.21326017048361354
r^2 (training): 0.14819749523796877
r^2 (testing): 0.1469591235849459


### Курение и возраст

In [20]:
# Linear model on two features (smoking status, age); stores its test R^2 in `results`.
feature_names = ["SMK_stat_type_cd", "age"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["smoking + age"] = r2_test

Index(['SMK_stat_type_cd', 'age'], dtype='object')
coefficients:  [ 0.19585343 -0.0086027 ]
intercept:  0.5947043142716149
MSE (training): 0.20452254766146238
MSE (testing): 0.20464657982668186
r^2 (training): 0.18190972257921623
r^2 (testing): 0.1814134940677553


### Курение и вес

In [21]:
# Linear model on two features (smoking status, weight); stores its test R^2 in `results`.
feature_names = ["SMK_stat_type_cd", "weight"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["smoking + weight"] = r2_test

Index(['SMK_stat_type_cd', 'weight'], dtype='object')
coefficients:  [0.17944718 0.00627119]
intercept:  -0.18553083558250333
MSE (training): 0.2138345398029138
MSE (testing): 0.2141138052556418
r^2 (training): 0.14466175006251358
r^2 (testing): 0.14354458371836865


### Курение и гамма-ГТП

In [22]:
# Linear model on two features (smoking status, gamma-GTP); stores its test R^2 in `results`.
feature_names = ["SMK_stat_type_cd", "gamma_GTP"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["smoking + gamma_GTP"] = r2_test

Index(['SMK_stat_type_cd', 'gamma_GTP'], dtype='object')
coefficients:  [0.19555814 0.00126643]
intercept:  0.13840318134026697
MSE (training): 0.2153314443551505
MSE (testing): 0.21567112847822603
r^2 (training): 0.1386741312184594
r^2 (testing): 0.13731528940784554


### Гемоглобин и возраст

In [23]:
# Linear model on two features (hemoglobin, age); stores its test R^2 in `results`.
feature_names = ["hemoglobin", "age"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["hemoglobin + age"] = r2_test

Index(['hemoglobin', 'age'], dtype='object')
coefficients:  [ 0.08153736 -0.00845033]
intercept:  -0.25790944523105036
MSE (training): 0.21361991861073198
MSE (testing): 0.21366887764436893
r^2 (training): 0.14552023492230048
r^2 (testing): 0.1453242945692077


### Гемоглобин и вес

In [24]:
# Linear model on two features (hemoglobin, weight); stores its test R^2 in `results`.
feature_names = ["hemoglobin", "weight"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["hemoglobin + weight"] = r2_test

Index(['hemoglobin', 'weight'], dtype='object')
coefficients:  [0.07046338 0.00611232]
intercept:  -0.8896321776229776
MSE (training): 0.2231587982164336
MSE (testing): 0.22333445302379754
r^2 (training): 0.10736471245233314
r^2 (testing): 0.10666198423706308


### Гемоглобин и гамма-ГТП

In [25]:
# Linear model on two features (hemoglobin, gamma-GTP); stores its test R^2 in `results`.
feature_names = ["hemoglobin", "gamma_GTP"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["hemoglobin + gamma_GTP"] = r2_test

Index(['hemoglobin', 'gamma_GTP'], dtype='object')
coefficients:  [0.08421668 0.00144148]
intercept:  -0.7520517101344628
MSE (training): 0.22253581027707137
MSE (testing): 0.22283774761966055
r^2 (training): 0.1098566644741038
r^2 (testing): 0.1086488063065768


### Возраст и вес

In [26]:
# Linear model on two features (age, weight); stores its test R^2 in `results`.
feature_names = ["age", "weight"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["age + weight"] = r2_test

Index(['age', 'weight'], dtype='object')
coefficients:  [-0.00852794  0.00868445]
intercept:  0.35646637768484774
MSE (training): 0.21844563457863891
MSE (testing): 0.21840268233631135
r^2 (training): 0.12621736900321545
r^2 (testing): 0.1263890714844894


### Возраст и гамма-ГТП

In [27]:
# Linear model on two features (age, gamma-GTP); stores its test R^2 in `results`.
feature_names = ["age", "gamma_GTP"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["age + gamma_GTP"] = r2_test

Index(['age', 'gamma_GTP'], dtype='object')
coefficients:  [-0.01015173  0.00208888]
intercept:  0.9058292386398789
MSE (training): 0.21871559620039233
MSE (testing): 0.21873090278858914
r^2 (training): 0.12513752240166232
r^2 (testing): 0.1250761893760608


### Вес и гамма-ГТП

In [28]:
# Linear model on two features (weight, gamma-GTP); stores its test R^2 in `results`.
feature_names = ["weight", "gamma_GTP"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["weight + gamma_GTP"] = r2_test

Index(['weight', 'gamma_GTP'], dtype='object')
coefficients:  [0.0091974  0.00153292]
intercept:  -0.13910245499550639
MSE (training): 0.22683087338561211
MSE (testing): 0.22705832012105126
r^2 (training): 0.09267641021762918
r^2 (testing): 0.09176651245210266


### Результаты

In [29]:
# Rank the recorded models by test R^2, best first, and print one
# (name, score) pair per line.
sorted_results = sorted(
    results.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
print("\n".join(map(str, sorted_results)))

('sex + age', 0.20055647792074727)
('smoking + age', 0.1814134940677553)
('height + smoking', 0.17893055857778306)
('sex + smoking', 0.162488854493696)
('height + age', 0.16181820887498144)
('height + gamma_GTP', 0.16060045875072404)
('height + sex', 0.1597828963889243)
('height + hemoglobin', 0.15360466649548266)
('sex + gamma_GTP', 0.14818439084665613)
('smoking + hemoglobin', 0.1469591235849459)
('hemoglobin + age', 0.1453242945692077)
('smoking + weight', 0.14354458371836865)
('sex + hemoglobin', 0.1404036349729697)
('height + weight', 0.1400866541162049)
('sex + weight', 0.1391674831865206)
('smoking + gamma_GTP', 0.13731528940784554)
('age + weight', 0.1263890714844894)
('age + gamma_GTP', 0.1250761893760608)
('hemoglobin + gamma_GTP', 0.1086488063065768)
('hemoglobin + weight', 0.10666198423706308)
('weight + gamma_GTP', 0.09176651245210266)


## Три переменные

In [30]:
# Empty the shared `results` dict so the entries recorded in this section
# are not mixed with those of the previous section.
results.clear()

### Пол, возраст, курение

In [31]:
# Linear model on three features (sex, age, smoking status); stores its test R^2 in `results`.
feature_names = ["sex", "age", "SMK_stat_type_cd"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["sex + age + smoking"] = r2_test

Index(['sex', 'age', 'SMK_stat_type_cd'], dtype='object')
coefficients:  [-0.24551168 -0.00853082  0.10748607]
intercept:  0.8484262029866554
MSE (training): 0.1947626377656694
MSE (testing): 0.1949282117575405
r^2 (training): 0.22094936630332784
r^2 (testing): 0.22028697520689489


### Рост, возраст, курение

In [32]:
# Linear model on three features (age, height, smoking status); stores its test R^2 in `results`.
feature_names = ["age", "height", "SMK_stat_type_cd"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["age + smoking + height"] = r2_test

Index(['age', 'height', 'SMK_stat_type_cd'], dtype='object')
coefficients:  [-0.00626041  0.01037723  0.1459906 ]
intercept:  -1.1203137670016394
MSE (training): 0.19835727057519453
MSE (testing): 0.1985840074732431
r^2 (training): 0.20657083354009464
r^2 (testing): 0.20566378901021598


### Пол, возраст, рост

In [33]:
# Linear model on three features (sex, age, height); stores its test R^2 in `results`.
feature_names = ["sex", "age", "height"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["sex + age + height"] = r2_test

Index(['sex', 'age', 'height'], dtype='object')
coefficients:  [-0.30691752 -0.00828924  0.00323013]
intercept:  0.5144533596003427
MSE (training): 0.1994235576563591
MSE (testing): 0.19955240881083966
r^2 (training): 0.20230568476303146
r^2 (testing): 0.20179018277670535


### Пол, рост, курение

In [34]:
# Linear model on three features (sex, height, smoking status); stores its test R^2 in `results`.
feature_names = ["sex", "height", "SMK_stat_type_cd"]
X = dataset[feature_names]  # select by name rather than by column position
y = dataset.iloc[:, -1]  # target = last column (presumably DRK_YN — confirm)
print(X.columns)
# 70/30 split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
lreg = LinearRegression().fit(X_train, y_train)
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Score the test set once so the printed and stored R^2 are the same value.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
results["sex + height + smoking"] = r2_test

Index(['sex', 'height', 'SMK_stat_type_cd'], dtype='object')
coefficients:  [-0.10370557  0.01134667  0.11706255]
intercept:  -1.4806727741621173
MSE (training): 0.20389380975017904
MSE (testing): 0.204186699716167
r^2 (training): 0.18442467449111077
r^2 (testing): 0.183253014929198


### Рост, курение, гамма-ГТ

In [35]:
# Three-feature linear model: gamma-GTP, height and smoking status as predictors of DRK_YN.
# Getting data for model
# Columns are selected by name (same order as before, so coefficients line up
# with the printed column index).
X = dataset[["gamma_GTP", "height", "SMK_stat_type_cd"]]
y = dataset["DRK_YN"]
print(X.columns)
# Train-Test Split (70/30; the fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
# Building and fitting a Linear Model
lreg = LinearRegression()
lreg.fit(X_train, y_train)
# Making Prediction
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Checking model
# The test R^2 is computed once and reused for both the report and the summary dict.
r2_test = r2_score(y_test, predict_test)
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_test)
# Stored so the "results" cell can rank the three-feature models.
results["height + smoking + gamma_GTP"] = r2_test

Index(['gamma_GTP', 'height', 'SMK_stat_type_cd'], dtype='object')
coefficients:  [0.00113394 0.01415343 0.12259737]
intercept:  -2.0356485472278165
MSE (training): 0.2018848606836533
MSE (testing): 0.2023313312021792
r^2 (training): 0.1924604716095718
r^2 (testing): 0.19067449067713504


### Результаты

In [36]:
# Rank the recorded models by their stored score, highest first,
# and print one (label, score) pair per line.
sorted_results = sorted(
    results.items(),
    key=lambda label_and_score: label_and_score[1],
    reverse=True,
)
print("\n".join(str(pair) for pair in sorted_results))

('sex + age + smoking', 0.22028697520689489)
('age + smoking + height', 0.20566378901021598)
('sex + age + height', 0.20179018277670535)
('height + smoking + gamma_GTP', 0.19067449067713504)
('sex + height + smoking', 0.183253014929198)


## Четыре переменные

### Пол, возраст, курение, рост

In [37]:
# Four-feature linear model: sex, age, height and smoking status as predictors of DRK_YN.
# Getting data for model
# Columns are selected by name so the cell does not depend on column order.
X = dataset[["sex", "age", "height", "SMK_stat_type_cd"]]
y = dataset["DRK_YN"]
print(X.columns)
# Train-Test Split (70/30; the fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
# Building and fitting a Linear Model
lreg = LinearRegression()
lreg.fit(X_train, y_train)
# Making Prediction
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Checking model: fitted parameters, then error and R^2 on both splits
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_score(y_test, predict_test))

Index(['sex', 'age', 'height', 'SMK_stat_type_cd'], dtype='object')
coefficients:  [-0.20899912 -0.00789192  0.00287791  0.10679969]
intercept:  0.3350687942117482
MSE (training): 0.1945043256362473
MSE (testing): 0.19469132666613093
r^2 (training): 0.2219826149306131
r^2 (testing): 0.22123451578855824


## Пять переменных

### Пол, возраст, курение, рост, гамма-ГТ

In [38]:
# Five-feature linear model: sex, age, height, gamma-GTP and smoking status
# as predictors of DRK_YN.
# Getting data for model
# Columns are selected by name so the cell does not depend on column order.
X = dataset[["sex", "age", "height", "gamma_GTP", "SMK_stat_type_cd"]]
y = dataset["DRK_YN"]
print(X.columns)
# Train-Test Split (70/30; the fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
# Building and fitting a Linear Model
lreg = LinearRegression()
lreg.fit(X_train, y_train)
# Making Prediction
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Checking model: fitted parameters, then error and R^2 on both splits
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_score(y_test, predict_test))

Index(['sex', 'age', 'height', 'gamma_GTP', 'SMK_stat_type_cd'], dtype='object')
coefficients:  [-0.18627722 -0.00804671  0.00309613  0.00113215  0.09653212]
intercept:  0.2708645891886332
MSE (training): 0.19151021377347696
MSE (testing): 0.1918187436971889
r^2 (training): 0.23395906365203778
r^2 (testing): 0.2327248502839513


## Семь переменных

### Рост, пол, курение, гемоглобин, возраст, вес, гамма-ГТ

In [39]:
# Seven-feature linear model: sex, age, height, weight, hemoglobin, gamma-GTP
# and smoking status as predictors of DRK_YN.
# Getting data for model
# Columns are selected by name so the cell does not depend on column order.
X = dataset[
    [
        "sex",
        "age",
        "height",
        "weight",
        "hemoglobin",
        "gamma_GTP",
        "SMK_stat_type_cd",
    ]
]
y = dataset["DRK_YN"]
print(X.columns)
# Train-Test Split (70/30; the fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
# Building and fitting a Linear Model
lreg = LinearRegression()
lreg.fit(X_train, y_train)
# Making Prediction
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Checking model: fitted parameters, then error and R^2 on both splits
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_score(y_test, predict_test))

Index(['sex', 'age', 'height', 'weight', 'hemoglobin', 'gamma_GTP',
       'SMK_stat_type_cd'],
      dtype='object')
coefficients:  [-0.17888333 -0.00795006  0.00370013 -0.0008425   0.00582537  0.00114085
  0.09568725]
intercept:  0.13625819101326936
MSE (training): 0.19142289148705485
MSE (testing): 0.19173704673445272
r^2 (training): 0.23430835283477536
r^2 (testing): 0.23305163820939878


## Все переменные

In [40]:
# Linear model on every available predictor: all columns except the DRK_YN target.
# Getting data for model
# Dropping the target by name avoids hard-coding the number of feature columns.
X = dataset.drop(columns="DRK_YN")
y = dataset["DRK_YN"]
print(X.columns)
# Train-Test Split (70/30; the fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
# Building and fitting a Linear Model
lreg = LinearRegression()
lreg.fit(X_train, y_train)
# Making Prediction
predict_train = lreg.predict(X_train)
predict_test = lreg.predict(X_test)
# Checking model: fitted parameters, then error and R^2 on both splits
print("coefficients: ", lreg.coef_)
print("intercept: ", lreg.intercept_)
print("MSE (training):", mean_squared_error(y_train, predict_train))
print("MSE (testing):", mean_squared_error(y_test, predict_test))
print("r^2 (training):", r2_score(y_train, predict_train))
print("r^2 (testing):", r2_score(y_test, predict_test))

Index(['sex', 'age', 'height', 'weight', 'waistline', 'sight_left',
       'sight_right', 'hear_left', 'hear_right', 'SBP', 'DBP', 'BLDS',
       'tot_chole', 'HDL_chole', 'LDL_chole', 'triglyceride', 'hemoglobin',
       'urine_protein', 'serum_creatinine', 'SGOT_AST', 'SGOT_ALT',
       'gamma_GTP', 'SMK_stat_type_cd'],
      dtype='object')
coefficients:  [-2.12459216e-01 -7.66191282e-03  2.61809331e-03  7.77881429e-04
 -5.67308173e-06  5.14066612e-04 -3.06316148e-03 -7.82434269e-03
 -2.51152109e-03 -2.27084260e-04  2.47556133e-03  1.32986456e-04
  2.21236316e-03  2.32230109e-03 -2.59903185e-03 -1.49165858e-04
  5.11836316e-03 -5.39347495e-03 -1.67852699e-02  5.53834740e-04
 -1.45293159e-03  1.09236619e-03  9.42580229e-02]
intercept:  -0.14277218098099437
MSE (training): 0.1857764519724606
MSE (testing): 0.18628170730266602
r^2 (training): 0.2568941132888267
r^2 (testing): 0.2548730009114908
