# Python Sklearn API

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('Salary.csv')
df.head()

Unnamed: 0,Country,Experience_year,Salary_usd,change_company_a_lot
0,usa,1.1,36154,0
1,finland,1.3,46205,1
2,england,1.5,37731,1
3,usa,2.0,43525,0
4,france,,39891,0


In [4]:
df.shape

(35, 4)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Country               34 non-null     object 
 1   Experience_year       34 non-null     float64
 2   Salary_usd            35 non-null     int64  
 3   change_company_a_lot  35 non-null     int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 1.2+ KB


In [6]:
df.isnull().sum()

Country                 1
Experience_year         1
Salary_usd              0
change_company_a_lot    0
dtype: int64

In [7]:
df.duplicated().sum()

0

In [8]:
df['Country'].unique()

array(['usa', 'finland', 'england', 'france', 'germany', 'norway', nan],
      dtype=object)

# My Transformer

In [9]:
import requests
from bs4 import BeautifulSoup
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler

In [10]:
class ExchangeToManat(BaseEstimator,TransformerMixin):
    def fit(self,X,Y=None):
        url = "https://www.yelo.az/az/exchange-rates/"
        headers = {"User-Agent":"Chrome/90.0.4430.93"}
        page = requests.get(url,headers=headers)
        soup = BeautifulSoup(page.text,'html.parser')
        currency = soup.find('div',class_ = "td_c_item table_sell_rate")
        self.__currency = float(currency.get('data-nocash'))
        
        return self
    def transform(self,X):
        X_new = X.copy()
        X_new*= self.__currency
        
        return X_new

In [11]:
exc = ExchangeToManat()
exc.fit(df.Salary_usd)
df['Salary_azn'] = exc.transform(df['Salary_usd'])
df.head()

Unnamed: 0,Country,Experience_year,Salary_usd,change_company_a_lot,Salary_azn
0,usa,1.1,36154,0,61516.031
1,finland,1.3,46205,1,78617.8075
2,england,1.5,37731,1,64199.2965
3,usa,2.0,43525,0,74057.7875
4,france,,39891,0,67874.5365


# Building Pipeline

In [12]:
df = pd.read_csv("Salary.csv")
df.head()

Unnamed: 0,Country,Experience_year,Salary_usd,change_company_a_lot
0,usa,1.1,36154,0
1,finland,1.3,46205,1
2,england,1.5,37731,1
3,usa,2.0,43525,0
4,france,,39891,0


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [14]:
X = df.drop(df.iloc[:,3:4],axis=1)
y = df.iloc[:,3:4]

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [16]:
X_train.columns

Index(['Country', 'Experience_year', 'Salary_usd'], dtype='object')

In [17]:
y_train.columns

Index(['change_company_a_lot'], dtype='object')

In [18]:
categorical_columns = X.select_dtypes(include='object').columns

In [19]:
numerical_columns = X.select_dtypes(exclude='object').columns

In [20]:
numerical_columns

Index(['Experience_year', 'Salary_usd'], dtype='object')

In [21]:
categorical_columns

Index(['Country'], dtype='object')

# Using Pipeline

In [22]:
numerical_Pipeline = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])

In [23]:
categorical_Pipeline = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore'))
])

In [24]:
preprocessor = ColumnTransformer(
transformers=[
    ('categorical',categorical_Pipeline,categorical_columns),
    ('numerical',numerical_Pipeline,numerical_columns)
])

In [25]:
pipe = Pipeline(steps=[
    ('preprocessor',preprocessor)
])

In [26]:
pipe.fit(X_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['Country'], dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                 

In [27]:
import pipename as pn
columns = pn.get_feature_names(preprocessor)
columns

['encoder__x0_england',
 'encoder__x0_finland',
 'encoder__x0_france',
 'encoder__x0_germany',
 'encoder__x0_norway',
 'encoder__x0_usa',
 'numerical__Experience_year',
 'numerical__Salary_usd']

In [28]:
X_train = pipe.transform(X_train)
X_test = pipe.transform(X_test)

In [29]:
X_train_final = pd.DataFrame(X_train,columns=columns)
X_test_final = pd.DataFrame(X_test,columns=columns)

In [30]:
X_train_final.head()

Unnamed: 0,encoder__x0_england,encoder__x0_finland,encoder__x0_france,encoder__x0_germany,encoder__x0_norway,encoder__x0_usa,numerical__Experience_year,numerical__Salary_usd
0,0.0,1.0,0.0,0.0,0.0,0.0,-0.593695,-0.726798
1,0.0,0.0,0.0,0.0,1.0,0.0,-0.809948,-0.500913
2,1.0,0.0,0.0,0.0,0.0,0.0,-0.296347,-0.45313
3,0.0,0.0,0.0,0.0,0.0,1.0,-0.674789,-0.719799
4,0.0,0.0,0.0,0.0,0.0,1.0,1.81212,1.647827


### Using xg boost to see perform the model

In [31]:
import xgboost as xgb
model = xgb.XGBClassifier(random_state=42,learning_rate=0.01)
model.fit(X_train,y_train)
model.predict(X_test)

array([1, 0, 0, 0, 0, 0, 0])

In [32]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [33]:
print(f"Confusion Matrix for XGBoost Classifier:\n{confusion_matrix(y_test,model.predict(X_test))}")
print(f"Accuracy Score for XGBoost Classifier:\n{accuracy_score(y_test,model.predict(X_test))}")

Confusion Matrix for XGBoost Classifier:
[[3 0]
 [3 1]]
Accuracy Score for XGBoost Classifier:
0.5714285714285714


# Another Example

In [34]:
titanic_url = ('https://raw.githubusercontent.com/amueller/scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
df = pd.read_csv(titanic_url)
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [35]:
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [37]:
for cat in ['embarked','sex','pclass']:
    df[cat].fillna('missing',inplace=True)
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        0
boat          823
body         1188
home.dest     564
dtype: int64

In [38]:
numeric_columns = ['age','fare']
categorical_columns = ['embarked','sex','pclass']

In [39]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [40]:
numerical_pipeline = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

In [41]:
categorical_pipeline = Pipeline(steps=[
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

In [42]:
preprocessor = ColumnTransformer(
transformers=[
    ('num',numerical_pipeline,numeric_columns),
    ('cat',categorical_pipeline,categorical_columns)
])

In [45]:
import xgboost as xgb
myPipeLine = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('model',xgb.XGBClassifier(random_state=42))
])

In [46]:
myPipeLine.fit(X_train,y_train)

ValueError: A given column is not a column of the dataframe

In [72]:
from sklearn.linear_model import LogisticRegression

In [None]:
logModel = LogisticRegression()
logModel.fit(X_train,y_train)
log_model

In [None]:
# 