<a href="https://colab.research.google.com/github/khadi1/machine_learning_practice/blob/main/Sklearn_getting_started_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fitting and predicting: estimator basics

In [1]:
from sklearn.ensemble import RandomForestClassifier
# Toy training set: two samples with three features each, and one class label per sample.
X = [[1, 2, 3],
     [11, 12, 13]]
y = [0, 1]

# Build a seeded random forest and train it in one go (fit() returns the estimator itself).
clf = RandomForestClassifier(random_state=0).fit(X, y)

# Predict the class of two samples the model has not seen.
clf.predict([[4, 5, 6], [14, 15, 16]])

array([0, 1])

# Transformers and pre-processors

In [2]:
from sklearn.preprocessing  import StandardScaler

# Two samples whose two features live on very different scales.
X = [[0, 15],
     [1, -10]]

# Learn the scaling statistics from X, then apply them to the same data.
(
    StandardScaler()
    .fit(X)
    .transform(X)
)

array([[-1.,  1.],
       [ 1., -1.]])

# Pipelines: chaining pre-processors and estimators

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split



# Load iris as (features, labels) and hold out a validation set using the
# default split size with a fixed seed.
X, y = load_iris(return_X_y=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

# Chain feature scaling and a logistic-regression classifier into one estimator.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# Fit every step of the pipeline on the training portion only.
pipe.fit(X_train, y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [4]:
# accuracy_score is documented as accuracy_score(y_true, y_pred): ground-truth
# labels first, predictions second. Plain accuracy is symmetric, so the value is
# the same either way, but keeping the documented order means the call stays
# correct if it is later swapped for an asymmetric metric (precision, recall, ...).
accuracy_score(y_val, pipe.predict(X_val))

0.9736842105263158

In [5]:
# Reload the complete iris dataset as (features, labels) arrays.
X, y = load_iris(return_X_y=True)

# Show the dimensions of both arrays.
(X.shape, y.shape)

((150, 4), (150,))

In [6]:
# Hold out 40% of the samples for testing; the seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

# Dimensions of the training portion.
(X_train.shape, y_train.shape)

((90, 4), (90,))

In [7]:
# Dimensions of the held-out test features and labels.
X_test.shape, y_test.shape

((60, 4), (60,))

In [8]:
from sklearn import svm

# Support vector classifier with a linear kernel and regularisation parameter C=1.
clf = svm.SVC(C=1, kernel="linear")
clf.fit(X_train, y_train)

# Mean accuracy on the held-out test split.
clf.score(X_test, y_test)

0.9666666666666667

# Model evaluation

In [10]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

# Synthetic regression problem with 1000 samples and a fixed seed.
# make_regression adds no noise by default, so the targets are an exact linear
# function of the features -- which is why a linear model scores 1.0 on every fold.
X, y = make_regression(
    n_samples=1000,
    random_state=42,
)

lr = LinearRegression()

# 15-fold cross-validation; the result is a dict of per-fold timings and scores.
scores = cross_validate(estimator=lr, X=X, y=y, cv=15)

# Per-fold test score (the regressor's default score, R^2).
scores["test_score"]


array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

# Automatic parameter searches

In [16]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

# California housing data as (features, target), split with the default test size.
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Integer ranges to sample hyper-parameters from. scipy's randint excludes the
# upper bound, so n_estimators is drawn from 1..4 and max_depth from 5..9.
param_distributions = {
    "n_estimators": randint(1, 5),
    "max_depth": randint(5, 10),
}

# Randomized search: try 5 sampled parameter combinations on a seeded forest.
search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_distributions=param_distributions,
    n_iter=5,
    random_state=0,
)



In [17]:
# Run the randomized hyper-parameter search on the training split.
search.fit(X_train, y_train)

RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0), n_iter=5,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f48a63ac640>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f48a0b32f10>},
                   random_state=0)

In [18]:
# Parameter combination that gave the best cross-validated result during the search.
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [22]:
# Evaluate the refitted best model on the held-out test split.
# score() generates its own predictions internally, so a separate predict() call
# whose result is neither stored nor displayed would only repeat that work.
search.score(X_test, y_test)

0.735363411343253