In [29]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import statsmodels.formula.api as smf
import plotly.express as px
from sklearn import tree
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
import graphviz

In [None]:
# Load the diabetes dataset and preview the first rows.
# NOTE(review): the path is absolute; adjust it when running outside the
# environment this notebook was written in.
csv_path = '/content/diabetes.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Print column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
# Count rows per Outcome value to see how balanced the two classes are.
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

## Классификация

In [None]:
# Separate the target column from the feature matrix.
Y = df['Outcome']
X = df.drop(columns=['Outcome'])

In [None]:
# Measure k-NN accuracy on one held-out split for several neighbourhood sizes.
n_neighbors = [5,10,15,20,25]

# The split does not depend on k (fixed random_state), so build it once
# instead of on every iteration. Stratified because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0, stratify=Y)

scores = []
for k in n_neighbors:
  model_kn = KNeighborsClassifier(n_neighbors=k)
  model_kn.fit(X_train, y_train)
  y_pred = model_kn.predict(X_test)
  # accuracy_score takes (y_true, y_pred); accuracy is symmetric, but keep
  # the documented order so swapping in another metric stays correct.
  scores.append(round(accuracy_score(y_test, y_pred), 3))

In [None]:
# Plot held-out accuracy against the number of neighbours.
# (The stray `from plotly.graph_objs import YAxis` was dropped: the name was
# never used in this cell.)
knn_scores = pd.DataFrame({'Кол-во соседей': n_neighbors, 'Точность': scores})
fig = px.line(knn_scores, x='Кол-во соседей', y='Точность')
fig.show()

## Регрессия

In [None]:
# Fit an ordinary least squares regression through the statsmodels formula API
# and print its summary table.
ols_formula = 'Insulin ~ SkinThickness+BMI'
ols_spec = smf.ols(formula=ols_formula, data=df)
model = ols_spec.fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                Insulin   R-squared:                       0.192
Model:                            OLS   Adj. R-squared:                  0.189
Method:                 Least Squares   F-statistic:                     90.66
Date:                Fri, 29 Sep 2023   Prob (F-statistic):           4.64e-36
Time:                        04:41:39   Log-Likelihood:                -4653.3
No. Observations:                 768   AIC:                             9313.
Df Residuals:                     765   BIC:                             9327.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         2.2252     15.705      0.142

Коэффициент детерминации R² = 0.192 — очень низкий: модель объясняет лишь около 19 % дисперсии инсулина.

In [None]:
# Regression equation with hard-coded coefficients.
def get_result(x1,x2):
  """Evaluate the linear equation 3.07*x1 + 0.46*x2 + 2.23.

  The constants are hard-coded. Presumably they are the rounded OLS
  coefficients from the regression above (x1 = SkinThickness, x2 = BMI) —
  TODO confirm against the fitted model's parameters.

  Args:
    x1: value multiplied by 3.07.
    x2: value multiplied by 0.46.

  Returns:
    The value of the equation for the given inputs.
  """
  slope_x1, slope_x2, intercept = 3.07, 0.46, 2.23
  return slope_x1*x1 + slope_x2*x2 + intercept


print(get_result(10,10))

37.529999999999994


## Деревья решений

In [None]:
# Tune the decision tree's hyperparameters with a grid search scored by
# accuracy under repeated stratified 10-fold cross-validation.
max_features = ['sqrt','log2',len(X.columns)]
max_depth = [3,5,7,10]
grid = dict(max_features=max_features,max_depth=max_depth)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Seed the estimator: an unseeded tree picks candidate features at random,
# so the CV scores and the selected parameters would change between runs.
# NOTE: this rebinds `model`, which held the OLS fit in the regression section.
model = tree.DecisionTreeClassifier(random_state=1)
# error_score=0 records a failed fit as accuracy 0 instead of raising.
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, Y)

In [None]:
# Report the winning configuration, then mean (std) CV accuracy per candidate.
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']

print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

Best: 0.741798 using {'max_depth': 5, 'max_features': 8}
0.705707 (0.061787) with: {'max_depth': 3, 'max_features': 'sqrt'}
0.721799 (0.064108) with: {'max_depth': 3, 'max_features': 'log2'}
0.732194 (0.050767) with: {'max_depth': 3, 'max_features': 8}
0.713990 (0.046730) with: {'max_depth': 5, 'max_features': 'sqrt'}
0.734393 (0.048515) with: {'max_depth': 5, 'max_features': 'log2'}
0.741798 (0.048361) with: {'max_depth': 5, 'max_features': 8}
0.707439 (0.054100) with: {'max_depth': 7, 'max_features': 'sqrt'}
0.716131 (0.046108) with: {'max_depth': 7, 'max_features': 'log2'}
0.719594 (0.050494) with: {'max_depth': 7, 'max_features': 8}
0.691029 (0.048476) with: {'max_depth': 10, 'max_features': 'sqrt'}
0.701333 (0.043354) with: {'max_depth': 10, 'max_features': 'log2'}
0.704392 (0.063355) with: {'max_depth': 10, 'max_features': 8}


In [None]:
# Fit the final tree on all data and save its Graphviz rendering.
# Take the hyperparameters from the grid search instead of hand-copying them
# from its printed output, so this cell cannot drift from the search result.
# Seeded so the exported tree is reproducible.
final_tree = tree.DecisionTreeClassifier(**grid_result.best_params_, random_state=1)
final_tree.fit(X,Y)
# Pass plain lists: not every scikit-learn release accepts a pandas Index
# for feature_names. class_names follow the ascending order of the labels
# (0 -> 'no_diabet', 1 -> 'diabet').
dot_data = tree.export_graphviz(final_tree, out_file=None,
                     feature_names=list(X.columns),
                     class_names=['no_diabet','diabet'],
                     filled=True, rounded=True,
                     special_characters=True)
graph = graphviz.Source(dot_data)
# render() writes the file and returns the path of the rendered output.
graph.render("diabet")

'diabet.pdf'