In [121]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [122]:
from data import restaurant_data
col_labels= ['Alternate', 'Bar', 'Friday', 'Hungry', 'Patron', 'Price', 'Raining', 'Reservation', 'Type', 'Estimated_Time']

dataFrame = pd.DataFrame(restaurant_data).transpose()
df=dataFrame.copy()
df.columns=col_labels
df

Unnamed: 0,Alternate,Bar,Friday,Hungry,Patron,Price,Raining,Reservation,Type,Estimated_Time
0,Yes,No,No,Yes,Some,$$$,No,Yes,French,0-10
1,Yes,No,No,Yes,Full,$,No,No,Thai,30-60
2,No,Yes,No,No,Some,$,No,No,Burger,0-10
3,Yes,No,Yes,Yes,Full,$,Yes,No,Thai,10-30
4,Yes,No,Yes,No,Full,$$$,No,Yes,French,>60
5,No,Yes,No,Yes,Some,$$,Yes,Yes,Italian,0-10
6,No,Yes,No,No,,$,Yes,No,Burger,0-10
7,No,No,No,Yes,Some,$$,Yes,Yes,Thai,0-10
8,No,Yes,Yes,No,Full,$,Yes,No,Burger,>60
9,Yes,Yes,Yes,Yes,Full,$$$,No,Yes,Italian,10-30


In [123]:
replacement_mapping = {
    "010": "short",
    "1030": "medium",
    "3060": "long",
    ">60": "extra_long"
    }

#series.values gives  a ndarray . which got no .replace() function.
def remove_hyphens(strings_list):
    return [s.replace('-', '') for s in strings_list]

class OrdinalBinaryEncoder(BaseEstimator, TransformerMixin):
       
    def __init__(self, dict_values):
        self.dict_values= dict_values
        
    def fit(self, X, y=None):
        return self

    def transform(self, X): 
        # X - dataFrame
        dictionary_values = self.dict_values
        for key in dictionary_values:
            if key == "Estimated_Time":
                estimated_time_column= X[key]               
                processed_list = remove_hyphens(strings_list=list(estimated_time_column.values))                
                X[key] =  pd.Series(processed_list).replace(replacement_mapping)
                continue
            X[key] = X[key].replace(
                dictionary_values[key]
            )   
        return pd.get_dummies(X, columns = ['Patron','Type', 'Estimated_Time']).astype(int)
          
    
def generate_yes_no_dict():
    return {
        "Yes":1,
        "No":0
    }

dict_values = {
    "Alternate": generate_yes_no_dict(),
    "Bar": generate_yes_no_dict(),
    "Friday": generate_yes_no_dict(),
    "Hungry": generate_yes_no_dict(),
    "Price": {
        "$":100,
        "$$":200,
        "$$$":300 },
    
    "Raining":generate_yes_no_dict(),
    "Reservation": generate_yes_no_dict(),
    "Type": generate_yes_no_dict(),
    "Estimated_Time": replacement_mapping
}

encoder = OrdinalBinaryEncoder(dict_values=dict_values)
transformed_data_frame = encoder.fit_transform(df)
transformed_data_frame.sample(5)


Unnamed: 0,Alternate,Bar,Friday,Hungry,Price,Raining,Reservation,Patron_Full,Patron_None,Patron_Some,Type_Burger,Type_French,Type_Italian,Type_Thai,Estimated_Time_extra_long,Estimated_Time_long,Estimated_Time_medium,Estimated_Time_short
8,0,1,1,0,100,1,0,1,0,0,1,0,0,0,1,0,0,0
5,0,1,0,1,200,1,1,0,0,1,0,0,1,0,0,0,0,1
10,0,0,0,0,100,0,0,0,1,0,0,0,0,1,0,0,0,1
2,0,1,0,0,100,0,0,0,0,1,1,0,0,0,0,0,0,1
11,1,1,1,1,300,0,0,1,0,0,1,0,0,0,0,1,0,0


#Pipelines are used for transforming some specific single COLUMN or the whole dataframe , else specified by the column transformer for some specific columns to be transformed.The df.dropna() removes rows  that have NAN values in the specified columns.

In [124]:
classes =pd.Series(['Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'Yes']).replace(
    generate_yes_no_dict()
)
print(type(classes))

def predict_churn(customer_details, model):
    new_instance_df = pd.DataFrame([customer_details])
    print(new_instance_df.shape)
    print(new_instance_df)
    return model.predict(new_instance_df)[0]

numerical_min_max_scaler = Pipeline([
    ('scaler', MinMaxScaler())
])
__ordinal_binary_encoder = Pipeline([
    ('encoder', OrdinalBinaryEncoder(dict_values= dict_values))
])

cols = ['Alternate', 'Bar', 'Friday', 'Hungry', 'Patron', 'Price', 'Raining', 'Reservation', 'Type', 'Estimated_Time']

__one_hot_encoder = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

column_transformer = ColumnTransformer(
    transformers=[        
        ('generalEncoder', __ordinal_binary_encoder,[]),
        ('numScaler', numerical_min_max_scaler, ['Price']),
        # ('categoricalTransformer', __one_hot_encoder, [ 'Patron','Type', 'Estimated_Time'])
    ]
)

lr_pipeline = Pipeline(steps=[('preprocessor', column_transformer),
                        ('classifier', LogisticRegression())])


X_train, X_test, Y_train, Y_test = train_test_split(
    transformed_data_frame, 
    classes, 
    test_size=0.2,
    shuffle=True
)
lr_pipeline.fit(X_train, Y_train)
y_pred = lr_pipeline.predict(X_test)
print(y_pred)

accuracy = accuracy_score(Y_test, y_pred)

print('Accuracy:', accuracy)
    

<class 'pandas.core.series.Series'>
[1 1 1]
Accuracy: 0.3333333333333333


In [None]:
#prediction_for_one_customer = predict_churn(instance, lr_pipeline)
# lr_pipeline.named_steps["logistic_regression"].predict(instance)
# print("sth very cool here :: ", lr_pipeline.named_steps["logistic_regression"].predict(instance))
# print("Prediction for one Customer: {}".format(prediction_for_one_customer))
