In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/avia-company-satisfaction/sample_submission.csv
/kaggle/input/avia-company-satisfaction/train_dataset.csv
/kaggle/input/avia-company-satisfaction/test_dataset.csv


# **Our task is to predict whether a customer of the Avia Company is satisfied with its services.**
#   
# **Let's start by importing all the libraries that we need.**

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import numpy  as np

# **First, we load the training dataset.**

In [3]:
# Read the training CSV; index_col=0 makes the first column the row index
# instead of a feature column.
df = pd.read_csv(
    '/kaggle/input/avia-company-satisfaction/train_dataset.csv',
    index_col=0,
)
df.head()

Unnamed: 0_level_0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Male,disloyal Customer,33,Business travel,Eco,571,2,3,2,4,...,4,3,1,3,4,3,4,10,3.0,0
2,Female,Loyal Customer,49,Business travel,Business,1431,4,1,4,4,...,5,5,5,5,3,5,3,0,0.0,1
3,Female,Loyal Customer,43,Business travel,Eco,867,1,4,4,4,...,1,1,1,1,1,1,2,0,18.0,0
4,Female,Loyal Customer,27,Business travel,Business,1550,3,3,3,3,...,2,4,4,5,5,4,2,0,0.0,1
5,Male,Loyal Customer,11,Personal Travel,Eco,526,3,4,3,2,...,4,5,2,5,3,5,4,0,10.0,0


# **Then we split it into train and test sets.**

In [4]:
# Hold out 10% of the rows, stratified on the `satisfaction` column, with a
# fixed seed so the split is reproducible.
stratified_splitter = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.1,
    random_state=42,
)

# n_splits=1 means the splitter yields exactly one (train, test) pair of
# positional index arrays, so it can be taken directly.
train_index, test_index = next(
    stratified_splitter.split(df, df['satisfaction'])
)
train_set = df.iloc[train_index]
test_set = df.iloc[test_index]

# **We should split both train and test data into X and y parts.**

In [5]:
# Features: every column except the `satisfaction` target.
X_train = train_set.drop(columns=['satisfaction'])
X_test = test_set.drop(columns=['satisfaction'])

# Targets: independent copies of the `satisfaction` column.
y_train = train_set['satisfaction'].copy()
y_test = test_set['satisfaction'].copy()

# **Then we create a pipeline with the model.**

In [6]:
# Partition the feature columns by dtype so each group gets its own treatment.
numeric_columns = list(X_train.select_dtypes(include='number').columns)
categorical_columns = list(X_train.select_dtypes(include='object').columns)

# Numeric features: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode; handle_unknown='ignore' keeps transform from failing on
# categories that were not present during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

# First base learner: a random forest with a fixed seed.
random_forest = RandomForestClassifier(
    n_estimators=1000,
    criterion='gini',
    min_samples_split=3,
    min_samples_leaf=1,
    random_state=80,
)

# Second base learner: gradient-boosted trees with a fixed seed.
gradient_boosting = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=50,
    min_child_weight=5,
    subsample=1.0,
    colsample_bytree=1.0,
    gamma=1,
    reg_alpha=0,
    reg_lambda=50,
    scale_pos_weight=70,
    random_state=80,
)

# (name, estimator) pairs in the form StackingClassifier expects.
base_models = [
    ('rf', random_forest),
    ('xgb', gradient_boosting),
]

# A logistic regression acts as the final estimator on top of the base learners.
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
)

# Chain preprocessing and the stacked classifier so they are fitted and applied
# as a single estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('stacking', stacking_model),
])
pipeline.fit(X_train, y_train)

# Evaluate the fitted pipeline on X_test / y_test.
stacking_predictions = pipeline.predict(X_test)
stacking_accuracy = accuracy_score(y_true=y_test, y_pred=stacking_predictions)
print(f"Stacking Model Accuracy: {stacking_accuracy}")

Stacking Model Accuracy: 0.954


# As you can see, the accuracy of our model is 95.4%; the threshold was 95%.

# It is time to load the test dataset, predict its labels, and then prepare the results for submission.

In [7]:
# Read the competition test file; the first column becomes the index and is
# reused below as the submission's `id` column.
test_df = pd.read_csv(
    '/kaggle/input/avia-company-satisfaction/test_dataset.csv',
    index_col=0,
)

# Predict with the already-fitted pipeline.
test_prediction = pipeline.predict(test_df)

# Two-column submission: row id and predicted label.
submission_df = pd.DataFrame(
    {'id': test_df.index, 'satisfaction': test_prediction}
)

# index=False keeps the DataFrame's own integer index out of the file.
submission_df.to_csv('submission.csv', index=False)