In [1]:
#Shinkansen Travel Experience
#The goal of the problem is to predict whether a passenger was satisfied or not considering his/her overall experience of traveling on the Shinkansen Bullet Train.

#Dataset: 
#The problem consists of 2 separate datasets: Travel data & Survey data. Travel data has information related to passengers and attributes related to the Shinkansen train, in which they traveled. The survey data is aggregated data of surveys indicating the post-service experience. You are expected to treat both these datasets as raw data and perform any necessary data cleaning/validation steps as required.

#The data has been split into two groups and provided in the Dataset folder. The folder contains both train and test data separately.

#Train_Data
#Test_Data

#Target Variable: Overall_Experience (1 represents ‘satisfied’, and 0 represents ‘not satisfied’)

#The training set can be used to build your machine-learning model. The training set has labels for the target column - Overall_Experience.

#The testing set should be used to see how well your model performs on unseen data. For the test set, it is expected to predict the ‘Overall_Experience’ level for each participant.

#Data Dictionary:
#All the data is self-explanatory. The survey levels are explained in the Data Dictionary file.

#Submission File Format: You will need to submit a CSV file with exactly 35,602 entries plus a header row. The file should have exactly two columns

#ID
#Overall_Experience (contains 0 & 1 values, 1 represents ‘Satisfied’, and 0 represents ‘Not Satisfied’)

#Evaluation Criteria:

#Accuracy Score: The evaluation metric is simply the percentage of predictions made by the model that turned out to be correct. This is also called the accuracy of the model. It will be calculated as the total number of correct predictions (True Positives + True Negatives) divided by the total number of observations in the dataset.
 
#In other words, the best possible accuracy is 100% (or 1), and the worst possible accuracy is 0%.

In [2]:
!pip install pandas numpy scikit-learn matplotlib seaborn xgboost openpyxl 

# Explanation:
#Pandas will be used to load and preprocess your CSV files.
#Scikit-learn will allow you to train a machine learning model (e.g., RandomForest, XGBoost, etc.).
#Matplotlib and Seaborn are for visualizing your data and the model performance.
#XGBoost (or LightGBM) is useful for training high-performance models, especially if you have large datasets.
#Openpyxl is for working with Excel files, such as reading the data dictionary.





In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score



In [4]:
#load libraries

# Load the datasets
travel_train = pd.read_csv('Traveldata_train.csv')
survey_train = pd.read_csv('Surveydata_train.csv')

travel_test = pd.read_csv('Traveldata_test.csv')
survey_test = pd.read_csv('Surveydata_test.csv')

# Merge Travel and Survey data based on ID
train_data = pd.merge(travel_train, survey_train, on='ID', how='inner')
test_data = pd.merge(travel_test, survey_test, on='ID', how='inner')


In [5]:
#Observations:

#Training Travel Data Summary:
#The Travel data includes information related to the passengers and their travel details on the Shinkansen Bullet Train.

#Columns:

#ID: Unique identifier for each passenger (numeric).
#Gender: Gender of the passenger (categorical: Male, Female).
#Customer_Type: Type of customer (categorical: Loyal Customer).
#Age: Age of the passenger (numeric).
#Type_Travel: Type of travel (categorical: Business Travel, Personal Travel, Business Travel).
#Travel_Class: The class of travel (categorical: Business, Eco).
#Travel_Distance: Distance traveled by the passenger (numeric, in kilometers).
#Departure_Delay_in_Mins: Delay at departure in minutes (numeric).
#Arrival_Delay_in_Mins: Delay at arrival in minutes (numeric).
#Data Insights:

#The Age column ranges from 43 to 52 years, indicating an adult demographic.
#The Travel_Distance ranges from 2200 km to 272 km, indicating passengers on both long and short-distance trips.
#Departure_Delay_in_Mins and Arrival_Delay_in_Mins indicate varying levels of delays, with some passengers experiencing long delays (e.g., 77 minutes departure delay and 119 minutes arrival delay for one record).
#The Travel_Class column has two primary categories: Business and Eco, indicating different class types available.
#Training Survey Data Summary:
#The Survey data contains post-service experience feedback, which includes the passengers' evaluation of various aspects of the service they experienced.

#Columns:

#ID: Unique identifier for each passenger (numeric).
#Overall_Experience: Overall experience rating (binary: 1 for 'Satisfied', 0 for 'Not Satisfied').
#Seat_Comfort: Rating of seat comfort (categorical: Needs Improvement, Poor, Acceptable).
#Seat_Class: The class of seat (categorical: Green Car, Ordinary).
#Arrival_Time_Convenient: Evaluation of arrival time convenience (categorical: Excellent, Needs Improvement, Acceptable).
#Catering: Catering service evaluation (categorical: Excellent, Poor, Acceptable).
#Platform_Location: Evaluation of platform location convenience (categorical: Very Convenient, Needs Improvement, Manageable).
#Onboard_Wifi_Service: Evaluation of onboard Wi-Fi service (categorical: Good, Needs Improvement, Acceptable).
#Onboard_Entertainment: Evaluation of onboard entertainment (categorical: Good, Needs Improvement, Acceptable).
#Online_Support: Evaluation of online support (categorical: Excellent, Good, Acceptable).
#Ease_of_Online_Booking: Evaluation of online booking ease (categorical: Excellent, Good, Needs Improvement).
#Onboard_Service: Evaluation of onboard service (categorical: Excellent, Acceptable, Good).
#Legroom: Evaluation of legroom (categorical: Excellent, Acceptable, Needs Improvement).
#Baggage_Handling: Evaluation of baggage handling (categorical: Excellent, Good, Needs Improvement, Poor).
#CheckIn_Service: Evaluation of check-in service (categorical: Excellent, Needs Improvement, Good).
#Cleanliness: Evaluation of cleanliness (categorical: Excellent, Good, Needs Improvement).
#Online_Boarding: Evaluation of online boarding experience (categorical: Excellent, Good, Poor, Acceptable).
#Data Insights:

#Overall_Experience has a binary value of 0 (Not Satisfied) or 1 (Satisfied), which will be the target variable for the model.
#A mix of categories appears across various feedback columns. For example:
#Seat_Comfort has values like Needs Improvement, Poor, and Acceptable.
#Catering ratings range from Excellent to Poor.
#Other columns like Legroom, Baggage_Handling, and Cleanliness provide insights into the quality of services.
#Online support and Onboard Wi-Fi Service are key factors affecting the satisfaction score, with many records reporting "Needs Improvement" or "Acceptable" evaluations.
#Key Observations:
#Target Variable (Overall_Experience):

#The target variable in the Survey Data indicates whether the passenger was satisfied (1) or not (0). This is what we will predict based on the available features.
#Demographics and Service Features:

#The Travel Data gives us insights into the demographic profile (age, gender, travel type) and travel-related features (distance, delays).
# Survey Data captures the feedback on various aspects of the service (comfort, Wi-Fi, cleanliness, etc.).
#It is expected that factors like catering, seat comfort, online booking, and wifi service might strongly influence satisfaction.


In [6]:
# Data Preprocessing: Handle Missing Values

# Identify numeric and categorical columns
numeric_cols = train_data.select_dtypes(include=[np.number]).columns
categorical_cols = train_data.select_dtypes(exclude=[np.number]).columns

# Fill missing values for numeric columns with mean
train_data[numeric_cols] = train_data[numeric_cols].fillna(train_data[numeric_cols].mean())
test_data[numeric_cols] = test_data[numeric_cols].fillna(test_data[numeric_cols].mean())

# Fill missing values for categorical columns with the mode (most frequent value)
train_data[categorical_cols] = train_data[categorical_cols].fillna(train_data[categorical_cols].mode().iloc[0])
test_data[categorical_cols] = test_data[categorical_cols].fillna(test_data[categorical_cols].mode().iloc[0])

# Encoding categorical variables
label_encoder = LabelEncoder()
for col in categorical_cols:
    train_data[col] = label_encoder.fit_transform(train_data[col])
    test_data[col] = label_encoder.transform(test_data[col])

# Separate features (X) and target (y) in the training set
X_train = train_data.drop(columns=['ID', 'Overall_Experience'])  # Drop target column in training
y_train = train_data['Overall_Experience']  # Target column in training

# Separate features (X) in the test set (do not drop 'Overall_Experience' because it's not present in the test set)
X_test = test_data.drop(columns=['ID'])  # Only drop 'ID' column in test set

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a Random Forest or XGBoost model (XGBoost recommended for better performance)
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Generate a DataFrame for submission with required format
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Ensure ID column is present from the test set
    'Overall_Experience': y_pred
})

# Save the predictions as a CSV file with the correct format
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")


KeyError: "['Overall_Experience'] not in index"

In [None]:
#applying scaling to travel data

from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# List of numerical columns in Travel Data
numerical_columns_train = ['Age', 'Travel_Distance', 'Departure_Delay_in_Mins', 'Arrival_Delay_in_Mins']

# Apply scaling to numerical columns in Travel Data
train_data[numerical_columns_train] = scaler.fit_transform(train_data[numerical_columns_train])

# Check the scaled data in Travel Data
print("Scaled Travel Data:")
print(train_data.head())



In [None]:
#applying scaling to survey data

from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# List of numerical columns in Survey Data (excluding 'ID' and 'Overall_Experience')
numerical_columns_survey = ['Seat_Comfort', 'Seat_Class', 'Arrival_Time_Convenient', 'Catering']  # Add other numerical feedback columns if needed

# Apply scaling to these numerical columns in Survey Data
survey_train[numerical_columns_survey] = scaler.fit_transform(survey_train[numerical_columns_survey])

# Check the scaled data in Survey Data
print("Scaled Survey Data:")
print(survey_train.head())




In [None]:
#Merging the Travel and Survey Data

# Merge Travel and Survey data on 'ID'
train_merged = pd.merge(train_data, survey_train, on='ID')

# Drop one of the 'Overall_Experience' columns if they are duplicates
train_merged = train_merged.loc[:,~train_merged.columns.duplicated()]

# Display the merged data
print(train_merged.head())


In [None]:
#Define Target Variable and Features

# Define features and target
X_train = train_merged.drop(columns=['ID', 'Overall_Experience'])  # Drop 'ID' and target 'Overall_Experience'
y_train = train_merged['Overall_Experience']  # Target column

# Check the shapes of the features and target
print(f"Features shape: {X_train.shape}")
print(f"Target shape: {y_train.shape}")


In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Ensure the merged train dataset has 'Overall_Experience' as the target column
# It seems there were duplicate or renamed columns like 'Overall_Experience_x' and 'Overall_Experience_y'

train_merged = train_merged.rename(columns={'Overall_Experience_x': 'Overall_Experience'})
train_merged.drop(columns=['Overall_Experience_y'], inplace=True)  # Drop if redundant

# Define the target variable and features
X_train = train_merged.drop(columns=['ID', 'Overall_Experience'])  # Drop 'ID' and target 'Overall_Experience'
y_train = train_merged['Overall_Experience']  # Target column

# Check if 'Overall_Experience' is present and correctly handled
print(f"Target column (Overall_Experience) count: {y_train.isnull().sum()}")

# Impute missing values for both numeric and categorical columns in the training data
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
categorical_cols = X_train.select_dtypes(exclude=[np.number]).columns

# Create imputers for numeric and categorical columns
numeric_imputer = SimpleImputer(strategy='median')  # For numeric columns
categorical_imputer = SimpleImputer(strategy='most_frequent')  # For categorical columns

# Apply imputer for numeric columns in the training data
X_train[numeric_cols] = numeric_imputer.fit_transform(X_train[numeric_cols])

# Apply imputer for categorical columns in the training data
X_train[categorical_cols] = categorical_imputer.fit_transform(X_train[categorical_cols])

# Check if missing values are handled in the training set
print(f"Missing values in Training Data after imputation:\n{X_train.isnull().sum()}")

# Apply imputers for the test data (ensure the same columns are used)
test_data[numeric_cols] = numeric_imputer.transform(test_data[numeric_cols])
test_data[categorical_cols] = categorical_imputer.transform(test_data[categorical_cols])

# Check if missing values are handled in the test set
print(f"Missing values in Test Data after imputation:\n{test_data.isnull().sum()}")

# Apply OneHotEncoder for categorical columns with nominal categories (e.g., service ratings)
categorical_columns = ['Platform_Location', 'Onboard_Wifi_Service', 'Onboard_Entertainment',
                       'Online_Support', 'Ease_of_Online_Booking', 'Onboard_Service', 'Legroom',
                       'Baggage_Handling', 'CheckIn_Service', 'Cleanliness', 'Online_Boarding']

# Initialize OneHotEncoder
one_hot_encoder = OneHotEncoder(sparse=False, drop='first')  # Drop first to avoid multicollinearity

# Create a ColumnTransformer to apply one-hot encoding to categorical columns
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot_encoder, categorical_columns)],
    remainder='passthrough'  # Keep the rest of the columns as is
)

# Apply the transformations
X_train_encoded = preprocessor.fit_transform(X_train)

# Now the data is encoded and imputed
print(f"Shape of encoded training data: {X_train_encoded.shape}")

# Train XGBoost Model
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)

# Train the model
xgb_model.fit(X_train_encoded, y_train)

# Make predictions on the training data
y_pred_train = xgb_model.predict(X_train_encoded)

# Evaluate the model using accuracy
train_accuracy = accuracy_score(y_train, y_pred_train)
print(f"Training Accuracy (XGBoost): {train_accuracy:.4f}")

# Prepare and preprocess the test data in the same way as training data
X_test = test_data.drop(columns=['ID'])

# Apply the same imputers to the test data
X_test[numeric_cols] = numeric_imputer.transform(X_test[numeric_cols])
X_test[categorical_cols] = categorical_imputer.transform(X_test[categorical_cols])

# Encode the categorical columns in the test data
X_test_encoded = preprocessor.transform(X_test)

# Make predictions on the test data
y_pred_test = xgb_model.predict(X_test_encoded)

# Evaluate the model on the test set if y_test is available
# For now, we are only predicting, so we can save the results
submission = pd.DataFrame({
    'ID': test_data['ID'],
    'Overall_Experience': y_pred_test
})

# Save the submission file as CSV
submission.to_csv('xgboost_submission.csv', index=False)
print("Submission file created successfully!")






In [None]:
# Hyperparameter Tuning (if needed)

from sklearn.model_selection import GridSearchCV

# Define parameter grid for tuning XGBoost
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0]
}

# Initialize GridSearchCV with XGBoost
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy')

# Fit the model to training data
grid_search.fit(X_train, y_train)

# Best parameters from grid search
print("Best Hyperparameters (XGBoost):", grid_search.best_params_)

# Train with the best model
best_xgb_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred_best = best_xgb_model.predict(X_train)

# Evaluate with the best model
best_train_accuracy = accuracy_score(y_train, y_pred_best)
print(f"Best Training Accuracy: {best_train_accuracy:.4f}")




In [None]:
#Evaluate the Model on the Test Data

# Merge the test data (apply the same preprocessing steps as training data)
test_data_merged = pd.merge(test_data, survey_test, on='ID')

# Apply similar preprocessing steps to the test data
test_data_merged[numerical_columns] = scaler.transform(test_data_merged[numerical_columns])
test_data_merged[categorical_cols] = label_encoder.transform(test_data_merged[categorical_cols])

# Define X_test (drop 'ID' and 'Overall_Experience' from test data)
X_test = test_data_merged.drop(columns=['ID', 'Overall_Experience'])

# Predict on the test set
y_pred_test = best_xgb_model.predict(X_test)

# Create the submission DataFrame
submission = pd.DataFrame({
    'ID': test_data_merged['ID'],  # Use the ID column from the test set
    'Overall_Experience': y_pred_test  # Predicted values for Overall_Experience
})

# Save the submission file as CSV
submission.to_csv('submission_xgb.csv', index=False)
print("Submission file created successfully!")


