In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Load the booking data. The path below is specific to the author's machine;
# update `file_path` when running this notebook anywhere else.
file_path = "C:/Users/99906068/OneDrive - L&T Construction/Desktop/British Airways/customer_booking.csv"
# NOTE(review): latin1 is presumably required because the file is not valid UTF-8 - confirm.
data = pd.read_csv(file_path, encoding='latin1')

# Quick look at the dataset: schema, summary statistics and the first rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print(data.describe())
print(data.head())

# Binary target: 1 when purchase_lead is strictly greater than `threshold`,
# 0 otherwise. The new column is added to `data` in place.
threshold = 7
target_column = 'purchase_lead_binary'
data[target_column] = data['purchase_lead'].gt(threshold).astype(int)

# y is the binary label. X is every remaining column: both the raw
# purchase_lead value and the label derived from it are excluded, since
# either one would reveal the answer to the model.
y = data[target_column]
X = data.drop(columns=['purchase_lead', target_column])



# Convert every object-dtype (text) feature column into integer codes.
# One LabelEncoder is created per column and kept in `label_encoders`,
# keyed by column name, so the text <-> code mapping remains available.
object_columns = X.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in object_columns}

for column, le in label_encoders.items():
    # Values are cast to str first so the encoder always sees plain strings.
    X[column] = le.fit_transform(X[column].astype(str))


# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Select every numeric column, whatever its integer/float width.
# The label-encoded columns take NumPy's default integer type, which is int32
# on some platforms (e.g. Windows with NumPy < 2). Filtering on
# ['int64', 'float64'] would leave those columns out there, and the
# ColumnTransformer below drops any column it is not given, so the encoded
# categorical features would silently vanish from the model.
numeric_features = X.select_dtypes(include='number').columns

# Numeric pipeline: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Apply the numeric pipeline to the selected columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)])

# Fit the preprocessing on the training rows only, then reuse the fitted
# parameters on the test rows so no test-set statistics leak into training.
# Both names are rebound to the transformed arrays returned by the preprocessor.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


# Fit a random forest on the preprocessed training data (seeded for
# reproducibility), then predict labels for the held-out test rows.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
report = classification_report(y_test, y_pred)  # per-class text summary

# Print the headline metrics one per line, followed by the full report.
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
):
    print(f'{metric_name}: {metric_value}')
print(report)

# Rank the model's features by importance, most important first.
importances = model.feature_importances_
feature_names = numeric_features
indices = np.flip(np.argsort(importances))

# Write a two-page PDF report: feature importances, then evaluation metrics.
output_file = "visualization_results.pdf"
with PdfPages(output_file) as pdf:
    # Page 1: horizontal bars, one per feature, in descending importance.
    bar_positions = range(X_train.shape[1])
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(bar_positions, importances[indices], align="center")
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(feature_names[indices])
    ax.invert_yaxis()  # put the most important feature at the top
    ax.set_title('Feature Importances')
    ax.set_xlabel('Relative Importance')
    fig.tight_layout()
    pdf.savefig(fig)
    plt.close(fig)

    # Page 2: the three headline metrics as a bar chart.
    metrics = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall
    }

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(list(metrics.keys()), list(metrics.values()), color=['blue', 'orange', 'green'])
    ax.set_title('Evaluation Metrics')
    ax.set_xlabel('Metric')
    ax.set_ylabel('Value')
    fig.tight_layout()
    pdf.savefig(fig)
    plt.close(fig)

print(f"Visualization results saved successfully to {output_file}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   num_passengers         50000 non-null  int64  
 1   sales_channel          50000 non-null  object 
 2   trip_type              50000 non-null  object 
 3   purchase_lead          50000 non-null  int64  
 4   length_of_stay         50000 non-null  int64  
 5   flight_hour            50000 non-null  int64  
 6   flight_day             50000 non-null  object 
 7   route                  50000 non-null  object 
 8   booking_origin         50000 non-null  object 
 9   wants_extra_baggage    50000 non-null  int64  
 10  wants_preferred_seat   50000 non-null  int64  
 11  wants_in_flight_meals  50000 non-null  int64  
 12  flight_duration        50000 non-null  float64
 13  booking_complete       50000 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.3+ 