# Problems
We want to make predict about potential customers for a business company

## Import librabries

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the dataset

In [7]:
df = pd.read_csv(r'C:\Users\Admin\Desktop\ML and DL\ML\Random_Forest\Travel.csv')
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4888 non-null   int64  
 1   ProdTaken                 4888 non-null   int64  
 2   Age                       4662 non-null   float64
 3   TypeofContact             4863 non-null   object 
 4   CityTier                  4888 non-null   int64  
 5   DurationOfPitch           4637 non-null   float64
 6   Occupation                4888 non-null   object 
 7   Gender                    4888 non-null   object 
 8   NumberOfPersonVisiting    4888 non-null   int64  
 9   NumberOfFollowups         4843 non-null   float64
 10  ProductPitched            4888 non-null   object 
 11  PreferredPropertyStar     4862 non-null   float64
 12  MaritalStatus             4888 non-null   object 
 13  NumberOfTrips             4748 non-null   float64
 14  Passport

## Preprocessing
Data cleaning

In [9]:
df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [11]:
df['TypeofContact'].value_counts()

Self Enquiry       3444
Company Invited    1419
Name: TypeofContact, dtype: int64

In [12]:
df['Gender'].value_counts()

Male       2916
Female     1817
Fe Male     155
Name: Gender, dtype: int64

In [16]:
df['Gender'] = df['Gender'].str.replace('Fe Male','Female')
df['Gender'].value_counts()

Male      2916
Female    1972
Name: Gender, dtype: int64

In [19]:
df['MaritalStatus'].value_counts()

Married      2340
Divorced      950
Single        916
Unmarried     682
Name: MaritalStatus, dtype: int64

In [20]:
df['MaritalStatus'] = df['MaritalStatus'].str.replace('Single','Unmarried')
df['MaritalStatus'].value_counts()

Married      2340
Unmarried    1598
Divorced      950
Name: MaritalStatus, dtype: int64

In [23]:
#Checking for missing values
features_with_na = [features for features in df.columns if df[features].isnull().sum()>=1]
features_with_na

['Age',
 'TypeofContact',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'NumberOfChildrenVisiting',
 'MonthlyIncome']

### Fill the missing values

In [28]:
num_features_with_na = [features for features in features_with_na if df[features].dtypes != 'O']
cat_features_with_na = [features for features in features_with_na if df[features].dtypes == 'O']
print(num_features_with_na)
print(cat_features_with_na)

['Age', 'DurationOfPitch', 'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips', 'NumberOfChildrenVisiting', 'MonthlyIncome']
['TypeofContact']


In [29]:
for feature in num_features_with_na:
    df[feature]= df[feature].fillna(df[feature].median())

for feature in cat_features_with_na:
    df[feature]= df[feature].fillna(df[feature].mode()[0])

In [30]:
df.isnull().sum()

CustomerID                  0
ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
dtype: int64

In [31]:
df.drop(['CustomerID'],axis=1,inplace=True)

## Feature Extraction

In [34]:
# As we see in the Dataset, the 'NumberOfPersonVisiting' column and the 'NumberOfChildrenVisiting' column are just total numbers of people visiting
# So, we can create a new column 'TotalMembers' which will be the sum of these two columns
df['TotalMembers'] = df['NumberOfPersonVisiting'] + df['NumberOfChildrenVisiting']
df.drop(columns = ['NumberOfPersonVisiting','NumberOfChildrenVisiting'],axis=1,inplace=True)

In [35]:
df.head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,TotalMembers
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,Manager,20993.0,3.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Unmarried,7.0,1,3,0,Executive,17090.0,3.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,0,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


In [38]:
num_feature = [feature for feature in df.columns if df[feature].dtypes != 'O']
cat_feature = [feature for feature in df.columns if df[feature].dtypes == 'O']
print(num_feature)
print(cat_feature)

['ProdTaken', 'Age', 'CityTier', 'DurationOfPitch', 'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'OwnCar', 'MonthlyIncome', 'TotalMembers']
['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation']


In [39]:
discrete_feature = [feature for feature in num_feature if len(df[feature].unique())<=25]
continuous_features=[feature for feature in num_feature if feature not in discrete_feature]
print(discrete_feature)
print(continuous_features)

['ProdTaken', 'CityTier', 'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'OwnCar', 'TotalMembers']
['Age', 'DurationOfPitch', 'MonthlyIncome']


In [40]:
df.to_csv(r'C:\Users\Admin\Desktop\ML and DL\ML\Random_Forest\Travel_cleaned.csv',index=False)

## Training 

In [48]:
from sklearn.model_selection import train_test_split
X = df.drop(columns = ['ProdTaken'],axis=1)
y = df['ProdTaken']
y.value_counts()

0    3968
1     920
Name: ProdTaken, dtype: int64

In [51]:
X_train,X_test,y_train ,y_test = train_test_split(X,y,test_size=0.2,random_state=32)
X_train.shape,X_test.shape

((3910, 17), (978, 17))

In [52]:
catFeatures = X.select_dtypes(include='object').columns
numFeatures = X.select_dtypes(exclude='object').columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
ohe_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",ohe_transformer, catFeatures),
        ("StandardScaler",numeric_transformer,numFeatures)
    ]
)

In [53]:
X_train = preprocessor.fit_transform(X_train)

In [54]:
X_test = preprocessor.transform(X_test) 

In [55]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.461225,-0.637426,0.282725,0.522394,-0.672851,1.57045,-0.047642,0.774491,-0.6288,-0.771471
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.717379,-0.75671,0.282725,0.522394,-0.108776,-0.63676,0.68575,-1.291171,-0.470271,1.339904
2,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,-0.717379,-0.398856,-0.712285,1.771668,-0.108776,-0.63676,1.419141,0.774491,-0.162281,0.636113
3,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,-0.717379,-0.279572,0.282725,1.771668,1.583451,1.57045,-0.047642,0.774491,1.58457,0.636113
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.461225,1.986838,-0.712285,-0.726879,-0.108776,-0.63676,-0.047642,0.774491,1.468176,-0.067679


### Model Training

In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

In [58]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

In [62]:
for name, model in models.items():
    model.fit(X_train,y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Performance on Train set
    model_train_accuracy = accuracy_score(y_train,y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred)
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_recall = recall_score(y_train, y_train_pred)
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)
    # Performance on Test set
    model_test_accuracy = accuracy_score(y_test,y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred)
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_recall = recall_score(y_test, y_test_pred)
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred)
    
    print(name)
    print("Performance on Training set:")
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))
    
    print('-----------------------------------------------')
    
    print("Performance on Testing set:")
    print("- Accuracy: {:.4f}".format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
    
    print('================================================== \n')

Logistic Regression
Performance on Training set:
- Accuracy: 0.8465
- F1 score: 0.3976
- Precision: 0.6947
- Recall: 0.2785
- Roc Auc Score: 0.6256
-----------------------------------------------
Performance on Testing set:
- Accuracy: 0.8333
- F1 score: 0.4399
- Precision: 0.7805
- Recall: 0.3062
- Roc Auc Score: 0.6414

Decision Tree
Performance on Training set:
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
-----------------------------------------------
Performance on Testing set:
- Accuracy: 0.9070
- F1 score: 0.7742
- Precision: 0.8041
- Recall: 0.7464
- Roc Auc Score: 0.8485

Random Forest
Performance on Training set:
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
-----------------------------------------------
Performance on Testing set:
- Accuracy: 0.9223
- F1 score: 0.7803
- Precision: 0.9854
- Recall: 0.6459
- Roc Auc Score: 0.8217

