# Randomized Search CV

In [2]:
import pandas as pd
df=pd.read_csv('diabetes_dataset.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     10000 non-null  int64  
 1   Age                            10000 non-null  int64  
 2   Sex                            10000 non-null  object 
 3   Ethnicity                      10000 non-null  object 
 4   BMI                            10000 non-null  float64
 5   Waist_Circumference            10000 non-null  float64
 6   Fasting_Blood_Glucose          10000 non-null  float64
 7   HbA1c                          10000 non-null  float64
 8   Blood_Pressure_Systolic        10000 non-null  int64  
 9   Blood_Pressure_Diastolic       10000 non-null  int64  
 10  Cholesterol_Total              10000 non-null  float64
 11  Cholesterol_HDL                10000 non-null  float64
 12  Cholesterol_LDL                10000 non-null  

In [4]:
# Missing Values (For loop orqali)

In [5]:
for col in df.columns:
    if df[col].isnull().any():
        if df[col].dtype=='object':
            df[col].fillna(df[col].mode()[0],inplace=True)
        else:
            df[col].fillna(df[col].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0],inplace=True)


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     10000 non-null  int64  
 1   Age                            10000 non-null  int64  
 2   Sex                            10000 non-null  object 
 3   Ethnicity                      10000 non-null  object 
 4   BMI                            10000 non-null  float64
 5   Waist_Circumference            10000 non-null  float64
 6   Fasting_Blood_Glucose          10000 non-null  float64
 7   HbA1c                          10000 non-null  float64
 8   Blood_Pressure_Systolic        10000 non-null  int64  
 9   Blood_Pressure_Diastolic       10000 non-null  int64  
 10  Cholesterol_Total              10000 non-null  float64
 11  Cholesterol_HDL                10000 non-null  float64
 12  Cholesterol_LDL                10000 non-null  

In [10]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder

In [None]:
for col in df.columns:
    if df[col].nunique()<=5 and df[col].dtype=='object':
        dummies=pd.get_dummies(df[col],prefix=col,dtype='int')
        df=pd.concat([df.drop(col,axis=1),dummies],axis=1)
    else:
        encoder = LabelEncoder()     #    (  df[col]=encoder.fit_transform(df[col])   - o'rniga shu kodni ishlatish ham mumkin)        
        df[col] = encoder.fit_transform(df[col].astype(str))

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 30 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   Unnamed: 0                        10000 non-null  int64
 1   Age                               10000 non-null  int64
 2   BMI                               10000 non-null  int64
 3   Waist_Circumference               10000 non-null  int64
 4   Fasting_Blood_Glucose             10000 non-null  int64
 5   HbA1c                             10000 non-null  int64
 6   Blood_Pressure_Systolic           10000 non-null  int64
 7   Blood_Pressure_Diastolic          10000 non-null  int64
 8   Cholesterol_Total                 10000 non-null  int64
 9   Cholesterol_HDL                   10000 non-null  int64
 10  Cholesterol_LDL                   10000 non-null  int64
 11  GGT                               10000 non-null  int64
 12  Serum_Urate                      

In [21]:
x=df.drop('Family_History_of_Diabetes',axis=1)

In [27]:
y=df['Family_History_of_Diabetes']

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [30]:
from sklearn.tree import DecisionTreeClassifier

In [31]:
model=DecisionTreeClassifier(random_state=42)

In [32]:
model.fit(x_train,y_train)

In [33]:
y_pred=model.predict(x_test)

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
score=accuracy_score(y_test,y_pred)

In [38]:
print(score)

0.4815


In [39]:
model

# Randomized SearchCV

In [17]:
from sklearn.model_selection import RandomizedSearchCV

In [55]:
parametrs={
    'max_depth':[3,5,7,None],
    'min_samples_split':[3,5,10],
    'min_samples_leaf':[1,2,4],
    'max_features':['auto','sqrt','log2',None]
}

In [56]:
parametrs

{'max_depth': [3, 5, 7, None],
 'min_samples_split': [3, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'max_features': ['auto', 'sqrt', 'log2', None]}

In [57]:
type(parametrs)

dict

In [58]:
random_search=RandomizedSearchCV(
    estimator=model,
    param_distributions=parametrs,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42
)

In [59]:
random_search

In [60]:
random_search.fit(x_train,y_train)

In [61]:
print('Best Parametrs:',random_search.best_params_)

Best Parametrs: {'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 3}


In [62]:
parametrs

{'max_depth': [3, 5, 7, None],
 'min_samples_split': [3, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'max_features': ['auto', 'sqrt', 'log2', None]}

In [63]:
best_model=random_search.best_estimator_

In [64]:
best_model

In [65]:
best_model.predict(x_test)

array([1, 1, 1, ..., 1, 0, 1], shape=(2000,))

In [66]:
y_pred=best_model.predict(x_test)

In [67]:
score=accuracy_score(y_test,y_pred)

In [68]:
score

0.5

# Gradient Boosting (scikit learn)

In [69]:
from sklearn.ensemble import GradientBoostingClassifier

In [85]:
gbc=GradientBoostingClassifier(n_estimators=20,learning_rate=0.2,max_depth=4,random_state=42)

In [86]:
gbc

In [87]:
gbc.fit(x_train,y_train)

In [88]:
y_pred=gbc.predict(x_test)

In [89]:
score=accuracy_score(y_test,y_pred)

In [90]:
print(score)

0.5125
