In [2]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import  DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [3]:
from data import restaurant_data

col_labels= ['Alternate', 'Bar', 'Friday', 'Hungry', 'Patron', 'Price', 'Raining', 'Reservation', 'Type', 'Estimated_Time']

d_frame = pd.DataFrame(restaurant_data).transpose()
dataFrame = d_frame.copy()
dataFrame.columns=col_labels
dataFrame

Unnamed: 0,Alternate,Bar,Friday,Hungry,Patron,Price,Raining,Reservation,Type,Estimated_Time
0,Yes,No,No,Yes,Some,$$$,No,Yes,French,0-10
1,Yes,No,No,Yes,Full,$,No,No,Thai,30-60
2,No,Yes,No,No,Some,$,No,No,Burger,0-10
3,Yes,No,Yes,Yes,Full,$,Yes,No,Thai,10-30
4,Yes,No,Yes,No,Full,$$$,No,Yes,French,>60
5,No,Yes,No,Yes,Some,$$,Yes,Yes,Italian,0-10
6,No,Yes,No,No,,$,Yes,No,Burger,0-10
7,No,No,No,Yes,Some,$$,Yes,Yes,Thai,0-10
8,No,Yes,Yes,No,Full,$,Yes,No,Burger,>60
9,Yes,Yes,Yes,Yes,Full,$$$,No,Yes,Italian,10-30


In [4]:
replacement_mapping = {
    "010": "short",
    "1030": "medium",
    "3060": "long",
    ">60": "extra_long"
    }

#series.values gives  a ndarray . which got no .replace() function.
def remove_hyphens(strings_list):
    return [s.replace('-', '') for s in strings_list]

class OrdinalBinaryEstimator(BaseEstimator, TransformerMixin):
       
    def __init__(self, dict_mapping):
        self.dict_mapping= dict_mapping
        
    def fit(self, X, y=None):
        return self

    def transform(self, X): 
        # X - dataFrame
        
        for key in self.dict_mapping:
            if key == "Estimated_Time":
                estimated_time_column= X[key]       
                processed_list = remove_hyphens(strings_list=list(estimated_time_column.values))                
                X[key] =  pd.Series(processed_list).replace(replacement_mapping)
                continue
            X[key] = X[key].replace(
                self.dict_mapping[key]
            )   
        return pd.get_dummies(X, columns = ['Patron','Type', 'Estimated_Time']).astype(int)

          
    
def generate_yes_no_dict():
    return {
        "Yes":1,
        "No":0
    }

dict_values = {
    "Alternate": generate_yes_no_dict(),
    "Bar": generate_yes_no_dict(),
    "Friday": generate_yes_no_dict(),
    "Hungry": generate_yes_no_dict(),
    "Price": {
        "$":100,
        "$$":200,
        "$$$":300 },
    
    "Raining":generate_yes_no_dict(),
    "Reservation": generate_yes_no_dict(),
    "Type": generate_yes_no_dict(),
    "Estimated_Time": replacement_mapping
}



#Pipelines are used for transforming some specific single COLUMN or the whole dataframe , else specified by the column transformer for some specific columns to be transformed.The df.dropna() removes rows  that have NAN values in the specified columns.

In [52]:
import  functools

class_series =pd.Series(['Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'Yes']).replace(
    generate_yes_no_dict()
)
__ordinal_binary_encoder = Pipeline([
    ('encoder', OrdinalBinaryEstimator(dict_mapping= dict_values))
])

numerical_min_max_scaler = Pipeline([
    ('scaler', MinMaxScaler())
])

display(
    __ordinal_binary_encoder.fit_transform(dataFrame)
)

column_transformer = ColumnTransformer(
    transformers=[        
        ('general_encoder', __ordinal_binary_encoder,col_labels),
        ('price_scaler', numerical_min_max_scaler,['Price']),
    ],
)

lr_pipeline = Pipeline(
    steps=[
    ('preprocessor', column_transformer),
    ('model',LogisticRegression())]
)

X_train, X_test, Y_train, Y_test = train_test_split(
    dataFrame, 
    class_series, 
    test_size=0.2,
)

lr_pipeline.fit(X_train, Y_train)
y_pred = lr_pipeline.predict(X_test)

accuracy = accuracy_score(Y_test, y_pred)

print('Accuracy:', accuracy)



Unnamed: 0,Alternate,Bar,Friday,Hungry,Price,Raining,Reservation,Patron_Full,Patron_None,Patron_Some,Type_Burger,Type_French,Type_Italian,Type_Thai,Estimated_Time_extra_long,Estimated_Time_long,Estimated_Time_medium,Estimated_Time_short
0,1,0,0,1,300,0,1,0,0,1,0,1,0,0,0,0,0,1
1,1,0,0,1,100,0,0,1,0,0,0,0,0,1,0,1,0,0
2,0,1,0,0,100,0,0,0,0,1,1,0,0,0,0,0,0,1
3,1,0,1,1,100,1,0,1,0,0,0,0,0,1,0,0,1,0
4,1,0,1,0,300,0,1,1,0,0,0,1,0,0,1,0,0,0
5,0,1,0,1,200,1,1,0,0,1,0,0,1,0,0,0,0,1
6,0,1,0,0,100,1,0,0,1,0,1,0,0,0,0,0,0,1
7,0,0,0,1,200,1,1,0,0,1,0,0,0,1,0,0,0,1
8,0,1,1,0,100,1,0,1,0,0,1,0,0,0,1,0,0,0
9,1,1,1,1,300,0,1,1,0,0,0,0,1,0,0,0,1,0


9
3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: X has 14 features, but LogisticRegression is expecting 18 features as input.