# Python Scikit Learn API

In [43]:
from warnings import filterwarnings
filterwarnings("ignore")
%config IPCompleter.use_jedi=False

In [2]:
import pandas as pd
import numpy as np

In [45]:
df = pd.read_csv("Salary.csv")
df.head()

Unnamed: 0,Country,Experience_year,Salary_usd,change_company_a_lot
0,usa,1.1,36154,0
1,finland,1.3,46205,1
2,england,1.5,37731,1
3,usa,2.0,43525,0
4,france,,39891,0


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Country               34 non-null     object 
 1   Experience_year       34 non-null     float64
 2   Salary_usd            35 non-null     int64  
 3   change_company_a_lot  35 non-null     int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 1.2+ KB


### My Transformer

In [47]:
import requests 
from bs4 import BeautifulSoup
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import StandardScaler

class ExchangerToManat(BaseEstimator,TransformerMixin):
     # TransofmerMixin provides fit_transform method
     # BaseEstimator is using GridSearch and Pipeline by using get and set_params methods
    def fit(self,X,Y=None):
        
        # get currency today # from Yelobank
        
        url ="https://www.yelo.az/az/exchange-rates/"
        headers={"User-Agent":"Chrome/90.0.4430.93"}
        page=requests.get(url,headers=headers)
        soup=BeautifulSoup(page.text,"html.parser")
        currency=soup.find("div",class_="td_c_item table_sell_rate")
        self.__currency=float(currency.get("data-nocash"))
        
        return self
    
    def transform(self,X):
        X_new=X.copy()
        X_new*=self.__currency
        
        return X_new

In [48]:
exc=ExchangerToManat()
exc=exc.fit(df.Salary_usd)
df["Salary_AZN"]=exc.transform(df.Salary_usd)
df.head()

Unnamed: 0,Country,Experience_year,Salary_usd,change_company_a_lot,Salary_AZN
0,usa,1.1,36154,0,61516.031
1,finland,1.3,46205,1,78617.8075
2,england,1.5,37731,1,64199.2965
3,usa,2.0,43525,0,74057.7875
4,france,,39891,0,67874.5365


### Or 

In [None]:
exc=ExchangerToManat()
df["Salary_AZN"]=exc.fit_transform(df.Salary_usd)
df.head()

In [49]:
exc._ExchangerToManat__currency

1.7015

### Using My Transformer 

## Build Pipeline

In [50]:
df = pd.read_csv("Salary.csv")
df.head()

Unnamed: 0,Country,Experience_year,Salary_usd,change_company_a_lot
0,usa,1.1,36154,0
1,finland,1.3,46205,1
2,england,1.5,37731,1
3,usa,2.0,43525,0
4,france,,39891,0


In [51]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [52]:
X=df.drop("change_company_a_lot",axis=1)
Y=df.change_company_a_lot
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.23,random_state=42)

In [53]:
X_train.columns

Index(['Country', 'Experience_year', 'Salary_usd'], dtype='object')

In [54]:
numeric_columns=X.select_dtypes(include="number").columns
categorical_columns=X.select_dtypes(include="object").columns

In [55]:
numeric_columns

Index(['Experience_year', 'Salary_usd'], dtype='object')

### Prepared codes for using columns on pipeline

In [56]:
import pipename as pn

### Pipeline 

In [57]:
numerical_Pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [58]:
categorical_Pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [59]:
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_Pipeline, categorical_columns),
        ('numerical', numerical_Pipeline, numeric_columns)
    ])


In [60]:
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

In [61]:
pipe.fit(X_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['Country'], dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                 

In [64]:
columns=pn.get_feature_names(preprocessor)
columns

['encoder__x0_england',
 'encoder__x0_finland',
 'encoder__x0_france',
 'encoder__x0_germany',
 'encoder__x0_norway',
 'encoder__x0_usa',
 'numerical__Experience_year',
 'numerical__Salary_usd']

In [65]:
X_train = pipe.transform(X_train)
X_test = pipe.transform(X_test)

In [68]:
X_train_final=pd.DataFrame(X_train,columns=columns)
X_test_final=pd.DataFrame(X_test,columns=columns)
X_train_final.head()

Unnamed: 0,encoder__x0_england,encoder__x0_finland,encoder__x0_france,encoder__x0_germany,encoder__x0_norway,encoder__x0_usa,numerical__Experience_year,numerical__Salary_usd
0,1.0,0.0,0.0,0.0,0.0,0.0,-0.346464,-0.489522
1,0.0,0.0,0.0,0.0,0.0,1.0,-0.718434,-0.750421
2,0.0,0.0,0.0,0.0,0.0,1.0,1.725942,1.56597
3,0.0,0.0,0.0,0.0,0.0,1.0,-1.409236,-1.371236
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.260944


In [69]:
import xgboost as xgb
model=xgb.XGBClassifier(random_state=1,learning_rate=0.01)
model=model.fit(X_train,y_train)
y_pred=model.predict(X_test)
y_pred



array([0, 1, 0, 0, 1, 0, 1, 1, 1], dtype=int64)

In [70]:
from sklearn.metrics import accuracy_score,confusion_matrix
print("Confusion Matrix \n",confusion_matrix(y_test,y_pred))
accuracy_score(y_test,y_pred)

Confusion Matrix 
 [[1 4]
 [3 1]]


0.2222222222222222

### Another Example

In [71]:
from sklearn.linear_model import LogisticRegression

In [3]:
titanic_url = ('https://raw.githubusercontent.com/amueller/scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)
X = data.drop('survived', axis=1)
y = data['survived']

In [4]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [74]:
X.isnull().sum()

pclass          0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [75]:
for cat in ['embarked', 'sex', 'pclass']:
    X[cat].fillna('missing', inplace=True)
X.isnull().sum()

pclass          0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        0
boat          823
body         1188
home.dest     564
dtype: int64

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']

In [77]:
numeric_pipeline = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
    ])


categorical_pipeline = Pipeline(
    steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')) #handle_unknow is use for unknown values transform onehotencoding
    ])

In [78]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
    ])

mypipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', LogisticRegression(random_state=42))
                            
    ])

In [79]:
mypipeline.fit(X_train,y_train)
mypipeline.score(X_test,y_test)

0.7709923664122137

### One Hot Encoding handle-unknown

In [80]:
mydf=pd.DataFrame({
    "Country":["AA","AA","AB","AB","AC","AD"]
})
mydf.head(10)

Unnamed: 0,Country
0,AA
1,AA
2,AB
3,AB
4,AC
5,AD


In [81]:
newdf=mydf.iloc[:-2]
one=OneHotEncoder(handle_unknown='ignore').fit(newdf)
newdf=one.transform(newdf)
newdf.toarray()

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [82]:
newtestdf=mydf.iloc[[-1]]
newtestdf=one.transform(newtestdf)
newtestdf.toarray()

array([[0., 0.]])

### Feature Engine Transformers

In [83]:
#!pip install feature_engine

In [84]:
from feature_engine.imputation import MeanMedianImputer

mydf=pd.DataFrame({
    "numbers":[5,6,7,8,np.nan,9,10,np.nan]
})

myimputer=MeanMedianImputer(imputation_method='median',variables=["numbers"])
mydf=myimputer.fit_transform(mydf)

mydf.head(8)

Unnamed: 0,numbers
0,5.0
1,6.0
2,7.0
3,8.0
4,7.5
5,9.0
6,10.0
7,7.5


### <a href="https://github.com/trainindata/deploying-machine-learning-models/blob/master/section-04-research-and-development/preprocessors.py">Extra Transformer Example</a>

### <a href="https://github.com/trainindata/deploying-machine-learning-models/blob/master/section-04-research-and-development/07-feature-engineering-pipeline.ipynb">Extra Pipeline Project Example</a>

### Model Creating and Saving 

In [85]:
import pandas as pd 
diabetes=pd.read_csv("diabetes.csv")
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [86]:
X=diabetes.drop("Outcome",axis=1)
Y=diabetes.Outcome

In [87]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=42)

X_train.shape,X_test.shape,y_train.shape,y_test.shape

((576, 8), (192, 8), (576,), (192,))

In [88]:
from sklearn.linear_model import LogisticRegression

lr=LogisticRegression()
model=lr.fit(X_train,y_train)
y_pred=model.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [89]:
from sklearn.metrics import accuracy_score,confusion_matrix
print("Confusion Matrix \n\n",confusion_matrix(y_test,y_pred),"\n")
print("Accuary Score" , '= %0.2f' % accuracy_score(y_test,y_pred),"%")

Confusion Matrix 

 [[95 28]
 [24 45]] 

Accuary Score = 0.73 %


### Model Saving 

### Exporting Model

In [90]:
import pickle
document="myModel"
pickle.dump(lr,open(document,"wb"))

### Importing Model

In [91]:
loaded_model=pickle.load(open(document,'rb'))
y_loded_model_pred=loaded_model.predict(X_test)

In [92]:
from sklearn.metrics import accuracy_score,confusion_matrix
print("Confusion Matrix \n\n",confusion_matrix(y_test,y_loded_model_pred),"\n")
print("Accuary Score" , '= %0.2f' % accuracy_score(y_test,y_loded_model_pred),"%")

Confusion Matrix 

 [[95 28]
 [24 45]] 

Accuary Score = 0.73 %


# After Kernel Restarting

### Loading

In [93]:
import pandas as pd 
diabetes=pd.read_csv("diabetes.csv")
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [94]:
X=diabetes.drop("Outcome",axis=1)
Y=diabetes.Outcome

In [95]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=42)

X_train.shape,X_test.shape,y_train.shape,y_test.shape

((576, 8), (192, 8), (576,), (192,))

In [96]:
import pickle
document="myModel"
loaded_model=pickle.load(open(document,'rb'))
y_loded_model_pred=loaded_model.predict(X_test)

In [97]:
from sklearn.metrics import accuracy_score,confusion_matrix
print("Confusion Matrix \n\n",confusion_matrix(y_test,y_loded_model_pred),"\n")
print("Accuary Score" , '= %0.2f' % accuracy_score(y_test,y_loded_model_pred),"%")

Confusion Matrix 

 [[95 28]
 [24 45]] 

Accuary Score = 0.73 %
