<a href="https://colab.research.google.com/github/leman-cap13/my_projects/blob/main/Airline_Passenger_Satisfaction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only step: prompts for a file upload. The next cell copies
# `kaggle.json` from the working directory, so that is the file expected here.
from google.colab import files
files.upload()

In [None]:
# Place the uploaded Kaggle credentials at ~/.kaggle/kaggle.json and restrict
# the file to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive; the following cell opens it as
# /content/airline-passenger-satisfaction.zip.
!kaggle datasets download teejmahal20/airline-passenger-satisfaction

In [None]:
# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive handle even if extraction fails.
import zipfile

with zipfile.ZipFile('/content/airline-passenger-satisfaction.zip', 'r') as zip_ref:
    zip_ref.extractall()


In [None]:
import pandas as pd
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Load the training CSV (presumably one of the files extracted by the unzip cell).
df=pd.read_csv('/content/train.csv')
df

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics of the frame.
df.describe()

In [None]:
# Column dtypes; the pipeline later splits columns into numeric vs. non-numeric.
df.dtypes

In [None]:
# Replace the text target with integer class codes. The fitted encoder stays
# available as `le`, so codes can be mapped back through `le.classes_`.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(df['satisfaction'])
df['satisfaction'] = le.transform(df['satisfaction'])
df.head()

In [None]:
# Pairwise correlations between the numeric columns.
# NOTE(review): 'id' and 'Unnamed: 0' are only dropped in later cells, so if
# they are numeric they are still part of this matrix - confirm that is intended.
df.corr(numeric_only=True)

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
import seaborn as sns
import matplotlib.pyplot as plt

corr_matrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
plt.show()

In [None]:
# Column names; 'id' and 'Unnamed: 0' are removed in the next two cells.
df.columns

In [None]:
# Remove the 'id' column so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: the frame is modified in place,
# so a second run would otherwise raise KeyError for the already-missing column.
df.drop(columns='id', inplace=True, errors='ignore')

In [None]:
# Remove 'Unnamed: 0' (presumably a row index saved into the CSV - confirm).
# errors='ignore' keeps this cell re-runnable: the frame is modified in place,
# so a second run would otherwise raise KeyError for the already-missing column.
df.drop(columns='Unnamed: 0', inplace=True, errors='ignore')

In [None]:
# Preview the frame after the two columns above were dropped.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Separate the encoded target from the predictor columns.
y = df['satisfaction'].copy()
X = df.drop(columns='satisfaction')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


In [None]:
# Column groups for the ColumnTransformer: numeric dtypes vs. everything else.
num_feature = X_train.select_dtypes(include='number').columns
cat_feature = X_train.select_dtypes(exclude='number').columns

In [None]:
# Preprocessing + random-forest pipeline.
#   numeric columns     -> median imputation, then standardisation
#   non-numeric columns -> most-frequent imputation, then one-hot encoding
# All steps are fitted inside the Pipeline, so imputation and scaling statistics
# are learned only from the data passed to fit().
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    # No fill_value here: SimpleImputer only uses it with strategy='constant',
    # so passing it together with 'most_frequent' would be silently ignored.
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories not seen during fit are encoded as all zeros instead of raising.
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# num_feature / cat_feature are complementary dtype selections over X_train, so
# no column should reach `remainder`; 'passthrough' is kept as a safety net.
transformer = ColumnTransformer([
    ('num', num_pipeline, num_feature),
    ('cat', cat_pipeline, cat_feature)
], remainder='passthrough')

# Fixed seed so repeated fits give the same forest.
estimator = RandomForestClassifier(random_state=42)

full_pipeline = Pipeline([
    ('preprocessing', transformer),
    ('estimator', estimator)
])


In [None]:
# Fit the preprocessing steps and the random forest on the training split.
full_pipeline.fit(X_train, y_train)

In [None]:
# (train accuracy, test accuracy) - a large gap between the two suggests overfitting.
full_pipeline.score(X_train, y_train),full_pipeline.score(X_test, y_test)

In [None]:
# Predicted labels for the first ten test rows (compare with y_test[:10] further down).
full_pipeline.predict(X_test[:10])

In [None]:
# Rank the transformed features by the fitted forest's feature_importances_.
preprocessor = full_pipeline.named_steps['preprocessing']
feature_names = preprocessor.get_feature_names_out()
importances = full_pipeline.named_steps['estimator'].feature_importances_

feat_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feat_importance = feat_importance.sort_values(by='Importance', ascending=False)

# Horizontal bars with the most important feature at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feat_importance['Feature'], feat_importance['Importance'])
ax.invert_yaxis()
ax.set_xlabel("Importance")
ax.set_title("Feature Importance")
plt.show()


In [None]:
# True labels of the first ten test rows, for comparison with the predictions above.
y_test[:10]

In [None]:
# Random-forest predictions for the whole test split.
y_pred=full_pipeline.predict(X_test)

In [None]:
# Fraction of test rows classified correctly.
accuracy_score(y_test, y_pred)

In [None]:
# Precision with scikit-learn's default binary settings (positive class = label 1).
precision_score(y_test, y_pred)

In [None]:
# Recall with the same default binary settings.
recall_score(y_test, y_pred)

In [None]:
# F1 score: harmonic mean of the precision and recall above.
f1_score(y_test, y_pred)

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
cm=confusion_matrix(y_test, y_pred)
cm

In [None]:
# Heatmap of the confusion matrix currently stored in `cm`.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Grid Search with GridSearchCV

In [None]:
# Hyper-parameter search for the random-forest pipeline.
# Keys use the `<step>__<param>` form, so `estimator__...` targets the
# 'estimator' step of full_pipeline.
param_grid = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__max_depth': [10,20,30],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4]
}

# 3*3*3*3 = 81 candidates x 5 folds = 405 pipeline fits, so the search is run on
# the first 2000 training rows only and spread over all CPU cores (n_jobs=-1).
# GridSearchCV works on clones, so the already fitted full_pipeline is untouched.
grid_search = GridSearchCV(full_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# .iloc makes the positional row slice explicit for both the frame and the target.
grid_search.fit(X_train.iloc[:2000], y_train.iloc[:2000])

In [None]:
# Best hyper-parameter combination found by the search.
grid_search.best_params_

In [None]:
# Mean cross-validated accuracy of that combination, measured on the 2000-row subset.
grid_search.best_score_

In [None]:
# Pipeline with the best parameters, as returned by GridSearchCV.
# NOTE(review): it is not evaluated on the test split in the cells visible here.
best_model = grid_search.best_estimator_


# Logistic Regression

In [None]:
# Logistic-regression baseline using the same preprocessing recipe as the
# random-forest pipeline.
from sklearn.base import clone

estimator2 = LogisticRegression(random_state=42)

# clone() gives this pipeline its own unfitted copy of the preprocessing step.
# Pipeline fits its steps in place, so reusing the same `transformer` object
# would refit the preprocessing of every other pipeline that holds it.
full_pipeline_2 = Pipeline([
    ('preprocessing', clone(transformer)),
    ('estimator2', estimator2)
])

In [None]:
# Fit the preprocessing steps and the logistic regression on the training split.
full_pipeline_2.fit(X_train, y_train)

In [None]:
# (train accuracy, test accuracy) for the logistic-regression pipeline.
full_pipeline_2.score(X_train, y_train),full_pipeline_2.score(X_test, y_test)

In [None]:
# Logistic-regression predictions for the whole test split.
y_prediction=full_pipeline_2.predict(X_test)

In [None]:
# Fraction of test rows classified correctly.
accuracy_score(y_test, y_prediction)

In [None]:
# Precision with scikit-learn's default binary settings (positive class = label 1).
precision_score(y_test, y_prediction)

In [None]:
# Recall with the same default binary settings.
recall_score(y_test, y_prediction)

In [None]:
# F1 score: harmonic mean of the precision and recall above.
f1_score(y_test, y_prediction)

In [None]:
# Confusion matrix for logistic regression; this rebinds `cm`, replacing the
# random-forest matrix computed earlier.
cm=confusion_matrix(y_test, y_prediction)
cm

In [None]:
# Heatmap of the confusion matrix currently stored in `cm`.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Support Vector Classification

In [None]:
# Support-vector classifier using the same preprocessing recipe as the other
# pipelines.
from sklearn.base import clone

estimator3 = SVC(random_state=42)

# clone() gives this pipeline its own unfitted copy of the preprocessing step.
# Pipeline fits its steps in place, so reusing the same `transformer` object
# would refit the preprocessing of every other pipeline that holds it.
full_pipeline_3 = Pipeline([
    ('preprocessing', clone(transformer)),
    ('estimator3', estimator3)
])

In [None]:
# Fit the preprocessing steps and the SVC on the full training split.
# NOTE(review): SVC training time grows faster than linearly with the number of
# rows, so this cell may be slow - confirm the runtime is acceptable.
full_pipeline_3.fit(X_train, y_train)

In [None]:
# (train accuracy, test accuracy) for the SVC pipeline.
full_pipeline_3.score(X_train, y_train),full_pipeline_3.score(X_test, y_test)