## Assignment

### Predicting `total_bill` in the tips dataset

In [1]:
import seaborn as sns
# Load seaborn's 'tips' example dataset into a DataFrame.
df=sns.load_dataset('tips')
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [2]:
# Inspect the column that will be used as the regression target.
df.total_bill

0      16.99
1      10.34
2      21.01
3      23.68
4      24.59
       ...  
239    29.03
240    27.18
241    22.67
242    17.82
243    18.78
Name: total_bill, Length: 244, dtype: float64

In [4]:
## Separate the regression target (total_bill) from the predictor columns
y=df['total_bill']
X=df.drop(columns=['total_bill'])

In [5]:
# Preview the feature frame; total_bill was dropped from it above.
X.head()

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,Female,No,Sun,Dinner,2
1,1.66,Male,No,Sun,Dinner,3
2,3.5,Male,No,Sun,Dinner,3
3,3.31,Male,No,Sun,Dinner,2
4,3.61,Female,No,Sun,Dinner,4


In [6]:
# Display the target Series (total_bill).
y

0      16.99
1      10.34
2      21.01
3      23.68
4      24.59
       ...  
239    29.03
240    27.18
241    22.67
242    17.82
243    18.78
Name: total_bill, Length: 244, dtype: float64

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20, random_state=42)

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer #to handle missing values 
from sklearn.preprocessing import StandardScaler #feature scaling
from sklearn.preprocessing import OneHotEncoder #categorical to numerical
from sklearn.compose import ColumnTransformer

In [10]:
# Column groups for preprocessing. These are DataFrame slices; the
# ColumnTransformer cell reads only their `.columns`.
numerical_cols=df.loc[:,['tip','size']]
categorical_cols=df.loc[:,['sex','smoker','day','time']]

In [11]:
# Display the categorical slice of the dataset.
categorical_cols

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner
...,...,...,...,...
239,Male,No,Sat,Dinner
240,Female,Yes,Sat,Dinner
241,Male,Yes,Sat,Dinner
242,Male,No,Sat,Dinner


In [12]:
##feature engineering automation
##numerical pipeline for i)handling missing values ii)feature scaling

num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),# fill missing values with the column median
        ('scalar',StandardScaler())# feature scaling
    ]
)


## Categorical pipeline for i)handling missing values ii)one-hot encoding

cat_pipeline=Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),# fill missing values with the most frequent category
        # handle_unknown='ignore': a category that was not present when the
        # encoder was fitted is encoded as all zeros instead of raising an
        # error at transform time (e.g. on the test split or on new data).
        ('Onehotencoder',OneHotEncoder(handle_unknown='ignore')) #categorical features to numerical
    ]
)

In [13]:
# Route each column group through its own pipeline: numeric columns go to
# num_pipeline, categorical columns go to cat_pipeline.
preprocessor=ColumnTransformer(
    transformers=[
        ('num_pipeline',num_pipeline,numerical_cols.columns),
        ('cat_pipeline',cat_pipeline,categorical_cols.columns),
    ]
)

In [14]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the test split.
# NOTE(review): both names are rebound from the raw frames to the transformed
# output, so this cell cannot be re-run on its own; re-run the train/test
# split cell first.
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [15]:
# Inspect the transformed training features.
X_train

array([[-0.2580329 , -0.61214068,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.74211442, -0.61214068,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.6399734 , -0.61214068,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.46472887, -0.61214068,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.32426806, -0.61214068,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.41237773,  0.45363997,  1.        , ...,  0.        ,
         1.        ,  0.        ]])

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
#Model training automation

# Maps a display name to an unfitted estimator; evaluate_model fits and scores
# each entry. The forest is seeded (same seed as the train/test split) so the
# reported score is reproducible from run to run.
models={
    'Random Forest Regressor':RandomForestRegressor(random_state=42)
}

In [18]:
from sklearn.metrics import r2_score

In [22]:
def evaluate_model(X_train,X_test,y_train,y_test,models,verbose=True):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test
        Feature data passed to ``model.fit`` and ``model.predict``.
    y_train, y_test
        Targets used for fitting and for scoring respectively.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.
    verbose : bool, default True
        When true, print each model's test-set predictions (the original
        behaviour). Pass ``False`` to keep the cell output short.

    Returns
    -------
    dict
        ``{model name: r2_score(y_test, predictions)}`` in the iteration
        order of ``models``; an empty ``models`` yields an empty dict.
    """
    report={}
    # Walk the mapping directly instead of rebuilding the key and value lists
    # on every iteration.
    for name,model in models.items():
        #train model
        model.fit(X_train,y_train)

        #predict testing data
        y_test_pred=model.predict(X_test)
        if verbose:
            print(y_test_pred)

        # R^2 on the held-out data (a regression score, not an accuracy)
        report[name]=r2_score(y_test,y_test_pred)

    return report

In [23]:
# Train and score every entry in `models`; the returned dict maps each model name to its test-set R^2.
evaluate_model(X_train,X_test,y_train,y_test,models)

[18.30245    13.73502    20.94088    28.05694    12.78519448 14.29162833
 15.515325   15.51753833 20.7687     21.09466    18.7268     13.1081
 10.37997571 14.29162833 11.35756976 13.76405    21.096      18.5681
 14.434205   26.936      19.2084     21.1563     19.6311     13.1081
 23.6394     15.8011     14.20578    24.6355     20.94088    24.4356
 22.019      13.2748     18.5164     18.9733     21.3972     20.6833
 12.746825   29.6763     18.54247619 14.59659167 12.0321     12.01830206
 15.92623643 14.40788    14.511245   10.87423333 18.47190238 17.0685
 11.08031667]


{'Random Forest Regressor': 0.5031233757228053}

In [21]:
# NOTE(review): this is a regressor despite the name `classifier`, and nothing
# in the visible notebook uses it. The execution count (21) predates the cells
# above, so it looks like a leftover; confirm and remove or rename.
classifier=RandomForestRegressor()