## Problem statement : Extracting features from the data set and churning out outliers to see the impact on the machine learning model

### Importing Libraries

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Data ingestion

In [64]:
data=pd.read_csv('travel.csv')

In [65]:
data

Unnamed: 0,Age,FrequentFlyer,AnnualIncomeClass,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target
0,34,No,Middle Income,6,No,Yes,0
1,34,Yes,Low Income,5,Yes,No,1
2,37,No,Middle Income,3,Yes,No,0
3,30,No,Middle Income,2,No,No,0
4,30,No,Low Income,1,No,No,0
...,...,...,...,...,...,...,...
949,31,Yes,Low Income,1,No,No,0
950,30,No,Middle Income,5,No,Yes,0
951,37,No,Middle Income,4,No,No,0
952,30,No,Low Income,1,Yes,Yes,0


### Data Cleaning 

### Null Value information

In [66]:
data.isnull().sum()

Age                           0
FrequentFlyer                 0
AnnualIncomeClass             0
ServicesOpted                 0
AccountSyncedToSocialMedia    0
BookedHotelOrNot              0
Target                        0
dtype: int64

### Data information

In [67]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954 entries, 0 to 953
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Age                         954 non-null    int64 
 1   FrequentFlyer               954 non-null    object
 2   AnnualIncomeClass           954 non-null    object
 3   ServicesOpted               954 non-null    int64 
 4   AccountSyncedToSocialMedia  954 non-null    object
 5   BookedHotelOrNot            954 non-null    object
 6   Target                      954 non-null    int64 
dtypes: int64(3), object(4)
memory usage: 52.3+ KB


In [68]:
data

Unnamed: 0,Age,FrequentFlyer,AnnualIncomeClass,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target
0,34,No,Middle Income,6,No,Yes,0
1,34,Yes,Low Income,5,Yes,No,1
2,37,No,Middle Income,3,Yes,No,0
3,30,No,Middle Income,2,No,No,0
4,30,No,Low Income,1,No,No,0
...,...,...,...,...,...,...,...
949,31,Yes,Low Income,1,No,No,0
950,30,No,Middle Income,5,No,Yes,0
951,37,No,Middle Income,4,No,No,0
952,30,No,Low Income,1,Yes,Yes,0


### Categorical encoding 

- Dummy encoding shall be performed as there is no internal hierarchy in the elements in the respective colums

In [69]:
categorical_features=['FrequentFlyer','AnnualIncomeClass','AccountSyncedToSocialMedia','BookedHotelOrNot']

In [70]:
data=pd.get_dummies(data=data, columns=categorical_features,drop_first=True)

In [71]:
data

Unnamed: 0,Age,ServicesOpted,Target,FrequentFlyer_No Record,FrequentFlyer_Yes,AnnualIncomeClass_Low Income,AnnualIncomeClass_Middle Income,AccountSyncedToSocialMedia_Yes,BookedHotelOrNot_Yes
0,34,6,0,0,0,0,1,0,1
1,34,5,1,0,1,1,0,1,0
2,37,3,0,0,0,0,1,1,0
3,30,2,0,0,0,0,1,0,0
4,30,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
949,31,1,0,0,1,1,0,0,0
950,30,5,0,0,0,0,1,0,1
951,37,4,0,0,0,0,1,0,0
952,30,1,0,0,0,1,0,1,1


In [72]:
# importinng necessaary libraires

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [73]:
## splitting in the data in the feature and target set

x=data.drop('Target',axis=1)

In [74]:
x

Unnamed: 0,Age,ServicesOpted,FrequentFlyer_No Record,FrequentFlyer_Yes,AnnualIncomeClass_Low Income,AnnualIncomeClass_Middle Income,AccountSyncedToSocialMedia_Yes,BookedHotelOrNot_Yes
0,34,6,0,0,0,1,0,1
1,34,5,0,1,1,0,1,0
2,37,3,0,0,0,1,1,0
3,30,2,0,0,0,1,0,0
4,30,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...
949,31,1,0,1,1,0,0,0
950,30,5,0,0,0,1,0,1
951,37,4,0,0,0,1,0,0
952,30,1,0,0,1,0,1,1


In [75]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954 entries, 0 to 953
Data columns (total 8 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   Age                              954 non-null    int64
 1   ServicesOpted                    954 non-null    int64
 2   FrequentFlyer_No Record          954 non-null    uint8
 3   FrequentFlyer_Yes                954 non-null    uint8
 4   AnnualIncomeClass_Low Income     954 non-null    uint8
 5   AnnualIncomeClass_Middle Income  954 non-null    uint8
 6   AccountSyncedToSocialMedia_Yes   954 non-null    uint8
 7   BookedHotelOrNot_Yes             954 non-null    uint8
dtypes: int64(2), uint8(6)
memory usage: 20.6 KB


In [76]:
y=data['Target']

In [77]:
y

0      0
1      1
2      0
3      0
4      0
      ..
949    0
950    0
951    0
952    0
953    0
Name: Target, Length: 954, dtype: int64

### Data Scaling

- standard scaling will be used as there is no business knowledge about the lower and upper limit of metric columns also the metric columns are of different measurement units 

In [78]:
sc=StandardScaler()

In [79]:
x_sc=sc.fit_transform(x) # why fit_tranform() !!!

In [81]:
x_sc

array([[ 0.56690365,  2.21932995, -0.25906388, ...,  1.15434758,
        -0.77849894,  1.2344268 ],
       [ 0.56690365,  1.59642887, -0.25906388, ..., -0.8662902 ,
         1.28452326, -0.81009259],
       [ 1.46628183,  0.35062671, -0.25906388, ...,  1.15434758,
         1.28452326, -0.81009259],
       ...,
       [ 1.46628183,  0.97352779, -0.25906388, ...,  1.15434758,
        -0.77849894, -0.81009259],
       [-0.63226726, -0.89517545, -0.25906388, ..., -0.8662902 ,
         1.28452326,  1.2344268 ],
       [-0.33247453, -0.89517545, -0.25906388, ..., -0.8662902 ,
        -0.77849894, -0.81009259]])

In [82]:
## Converting the scaled set into data frame

x_sc_df=pd.DataFrame(data=x_sc, columns=x.columns)

In [83]:
x_sc_df

Unnamed: 0,Age,ServicesOpted,FrequentFlyer_No Record,FrequentFlyer_Yes,AnnualIncomeClass_Low Income,AnnualIncomeClass_Middle Income,AccountSyncedToSocialMedia_Yes,BookedHotelOrNot_Yes
0,0.566904,2.219330,-0.259064,-0.654327,-0.824365,1.154348,-0.778499,1.234427
1,0.566904,1.596429,-0.259064,1.528288,1.213055,-0.866290,1.284523,-0.810093
2,1.466282,0.350627,-0.259064,-0.654327,-0.824365,1.154348,1.284523,-0.810093
3,-0.632267,-0.272274,-0.259064,-0.654327,-0.824365,1.154348,-0.778499,-0.810093
4,-0.632267,-0.895175,-0.259064,-0.654327,1.213055,-0.866290,-0.778499,-0.810093
...,...,...,...,...,...,...,...,...
949,-0.332475,-0.895175,-0.259064,1.528288,1.213055,-0.866290,-0.778499,-0.810093
950,-0.632267,1.596429,-0.259064,-0.654327,-0.824365,1.154348,-0.778499,1.234427
951,1.466282,0.973528,-0.259064,-0.654327,-0.824365,1.154348,-0.778499,-0.810093
952,-0.632267,-0.895175,-0.259064,-0.654327,1.213055,-0.866290,1.284523,1.234427


### Implementing the ML pipeline with feature extraction 


In [84]:
xtrain , xtest , ytrain , ytest = train_test_split(x_sc_df, y, test_size=0.2, random_state=10)

In [101]:
def modelling(xtrain , xtest, ytrain ,ytest):
    
    model1=SVC()
    model1=model1.fit(xtrain , ytrain)
    pred1=model1.predict(xtest)
    print('the  perfromance report for SVC is :\n\n', classification_report(ytest, pred1))
    
    model2=RandomForestClassifier()
    model2=model2.fit(xtrain , ytrain)
    pred2=model2.predict(xtest)
    print('the  perfromance report for random forest model is :\n\n', classification_report(ytest, pred2))
    
    model3=DecisionTreeClassifier()
    model3=model3.fit(xtrain , ytrain)
    pred3=model3.predict(xtest)
    print('the  perfromance report for decision tree classifier is :\n\n', classification_report(ytest, pred3))
    
    # getting feature scores 
    
    features_scores=model3.feature_importances_
    
    feature_importance=pd.DataFrame({'model_scores':features_scores},index=xtrain.columns)
    
    feature_importance.sort_values(by='model_scores',ascending=False,inplace=True)
    
    print('Importance features wrt decision tree:\n\n',feature_importance)
    
    model4=KNeighborsClassifier()
    model4=model4.fit(xtrain , ytrain)
    pred4=model4.predict(xtest)
    print('the  perfromance report for KNN is :\n\n', classification_report(ytest, pred4))

In [102]:
modelling(xtrain , xtest, ytrain ,ytest)

the  perfromance report for SVC is :

               precision    recall  f1-score   support

           0       0.80      0.92      0.85       130
           1       0.74      0.51      0.60        61

    accuracy                           0.79       191
   macro avg       0.77      0.71      0.73       191
weighted avg       0.78      0.79      0.77       191

the  perfromance report for random forest model is :

               precision    recall  f1-score   support

           0       0.87      0.91      0.89       130
           1       0.78      0.70      0.74        61

    accuracy                           0.84       191
   macro avg       0.82      0.81      0.81       191
weighted avg       0.84      0.84      0.84       191

the  perfromance report for decision tree classifier is :

               precision    recall  f1-score   support

           0       0.86      0.93      0.90       130
           1       0.82      0.69      0.75        61

    accuracy                

### Significant features churning

- the significant features as per the above analysis is : ServicesOpted, Age , FrequentFlyer_Yes, AnnualIncomeClass_Low Income,AccountSyncedToSocialMedia_Yes

In [105]:
significant_data=x_sc_df[['ServicesOpted','Age','FrequentFlyer_Yes','AnnualIncomeClass_Low Income','AccountSyncedToSocialMedia_Yes']]

In [106]:
significant_data

Unnamed: 0,ServicesOpted,Age,FrequentFlyer_Yes,AnnualIncomeClass_Low Income,AccountSyncedToSocialMedia_Yes
0,2.219330,0.566904,-0.654327,-0.824365,-0.778499
1,1.596429,0.566904,1.528288,1.213055,1.284523
2,0.350627,1.466282,-0.654327,-0.824365,1.284523
3,-0.272274,-0.632267,-0.654327,-0.824365,-0.778499
4,-0.895175,-0.632267,-0.654327,1.213055,-0.778499
...,...,...,...,...,...
949,-0.895175,-0.332475,1.528288,1.213055,-0.778499
950,1.596429,-0.632267,-0.654327,-0.824365,-0.778499
951,0.973528,1.466282,-0.654327,-0.824365,-0.778499
952,-0.895175,-0.632267,-0.654327,1.213055,1.284523


In [110]:
xtrain , xtest , ytrain , ytest = train_test_split(significant_data, y, test_size=0.3, random_state=10)

In [111]:
final_model=RandomForestClassifier()
final_model=final_model.fit(xtrain , ytrain)
pred=final_model.predict(xtest)
print('the  perfromance report for decision tree classifier is :\n\n', classification_report(ytest, pred))

the  perfromance report for decision tree classifier is :

               precision    recall  f1-score   support

           0       0.87      0.93      0.90       205
           1       0.78      0.65      0.71        82

    accuracy                           0.85       287
   macro avg       0.82      0.79      0.80       287
weighted avg       0.84      0.85      0.84       287



### Comparing the ML perfromance with and without feature selection:

- we see that post selecting the significant features and running the ML model again we have see a small amount of improvement in the recall , f1-score 