In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

%matplotlib inline
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training CSV into `train` and show its first rows as the cell output.
train = pd.read_csv("../input/airline-passenger-satisfaction/train.csv")
train.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output; call it directly.
train.info()
# Summary statistics of the numeric columns, shown as the cell's rich output.
train.describe()

In [None]:
#Univariate analysis
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns

# One figure per column: a count plot for object/bool columns, a histogram otherwise.
for c in train.columns:
    column = train[c]
    plt.figure()
    if column.dtype in ['object','bool']: #categorical column
        # Pass the values as x=...: seaborn >= 0.12 interprets the first positional
        # argument as `data`, so the positional form no longer means "plot this column".
        sns.countplot(x=column,palette='magma')
    else: #numerical column
        # Only overlay a KDE when the column has more than 10 distinct values.
        sns.histplot(column,kde=(len(column.unique())>10),color='purple')
    plt.show()

In [None]:
#sns.pairplot(train, hue=train.columns[-1],palette='viridis') #too many variables to be very helpful
# Correlation is only defined for numeric columns. Select them explicitly: `train`
# still holds string columns here, and recent pandas versions raise on them instead
# of silently skipping them.
numeric_columns = train.select_dtypes(include='number')
plt.figure(figsize=(20, 10))
sns.heatmap(numeric_columns.corr(), annot=True, vmin=-1, vmax=1, cmap="coolwarm")
plt.show()

In [None]:
#Bivariate analysis
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns

# Every column is plotted split by the last column of `train` (used as hue).
# Only the first four non-object/bool columns are drawn as histograms; object/bool
# columns and every later column are drawn as count plots.
count = 0
for c in train.columns:
    is_categorical = train[c].dtype in ['object','bool']
    plt.figure()
    if not is_categorical and count<4: #numerical column
        count+=1
        sns.histplot(data=train,x=c,hue=train.columns[-1],element='step',palette='dark')
    else: #categorical column
        sns.countplot(data=train,x=c,hue=train.columns[-1],palette='dark')
    plt.show()

In [None]:
import warnings #preprocessing
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt

# Reload both splits from disk.
train = pd.read_csv("../input/airline-passenger-satisfaction/train.csv")
test = pd.read_csv("../input/airline-passenger-satisfaction/test.csv")

# Keep the label column aside before it is removed from the feature frames.
ytrain, ytest = train['satisfaction'], test['satisfaction']

# Remove the label plus the 'Unnamed: 0' and 'id' columns from the features.
train = train.drop(columns=['satisfaction','Unnamed: 0','id'])
test = test.drop(columns=['satisfaction','Unnamed: 0','id'])
#plt.figure(figsize=(20, 10))
#sns.heatmap(train.corr(), annot=True, vmin=-1, vmax=1, cmap="icefire")
#plt.show()
train.head()

In [None]:
# Show the feature column names of `train` at this point in the notebook.
display(train.columns)

In [None]:
# Columns that get a log(1 + x) transform in both splits.
logs=['Departure Delay in Minutes','Arrival Delay in Minutes'] #log transform due to distribution shape
for l in logs:
    train[l], test[l] = np.log(train[l]+1), np.log(test[l]+1)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Draw one histogram per column listed in `logs`, using the values currently in `train`.
for l in logs:
    plt.figure(figsize=(10,5))
    sns.histplot(train[l],color='orange')
    plt.show()

In [None]:
# One-hot encode each split independently, dropping the first level of every
# encoded column (drop_first=True), then preview both results.
train, test = (pd.get_dummies(frame, drop_first=True) for frame in (train, test)) #encoding categorical variables
display(train.head())
display(test.head())

In [None]:
# Binary targets: 1 where the label equals 'satisfied', 0 for anything else.
ytrain = ytrain.eq('satisfied').astype(int) #encoding ys
ytest = ytest.eq('satisfied').astype(int)

In [None]:
# Count plot of the encoded target for each split, each on its own figure.
# The values are passed as x=...: seaborn >= 0.12 interprets the first positional
# argument as `data`, so the positional form no longer means "count these values".
sns.countplot(x=ytrain)
plt.figure()
sns.countplot(x=ytest)

In [None]:
# DataFrame.info() prints its own report and returns None, so it is called directly
# instead of through display(), which would add a stray "None" to the output.
train.info() #Arrival Delay in minutes is missing some values
test.info()

In [None]:
#get rid of arrival delay due to extremely high correlation with departure delay
train = train.drop(columns='Arrival Delay in Minutes')
test = test.drop(columns='Arrival Delay in Minutes')
# Print the remaining columns and their non-null counts.
train.info()

In [None]:
#condensed preprocessing
# Rebuilds `train`, `test`, `ytrain` and `ytest` from the raw CSV files in one cell.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

train = pd.read_csv("../input/airline-passenger-satisfaction/train.csv")
test = pd.read_csv("../input/airline-passenger-satisfaction/test.csv")

# Binary targets: 1 where the label equals 'satisfied', 0 otherwise.
ytrain=(train['satisfaction']=='satisfied').astype(int) #encoding ys
ytest=(test['satisfaction']=='satisfied').astype(int)

# Remove the label plus the 'Unnamed: 0' and 'id' columns from the features.
train = train.drop(columns=['satisfaction','Unnamed: 0','id'])
test = test.drop(columns=['satisfaction','Unnamed: 0','id'])

logs=['Departure Delay in Minutes','Arrival Delay in Minutes'] #log transform due to distribution shape
for l in logs:
    train[l] = np.log(1+train[l])
    test[l] = np.log(1+test[l])

train=pd.get_dummies(train,drop_first=True) #encoding categorical variables
test =pd.get_dummies(test,drop_first=True)
train = train.drop(columns='Arrival Delay in Minutes')
test = test.drop(columns='Arrival Delay in Minutes')

# Fit the scaler on the training features only and reuse its mean/variance for the
# test features. Fitting a second scaler on the test split would standardise the two
# splits with different statistics and let test-set information shape the inputs.
scaler = StandardScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
import random
random.seed(42)
names = [
    "LR",
    "KNN",
    "DTree",
    "RF",
    "ADA",
    "NB",
    "QDA",
]

# Seeding Python's `random` module does not seed scikit-learn estimators, so the
# estimators that use randomness are given an explicit random_state instead.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# `train` and `test` are used as they are: the preprocessing cell above has already
# standardised them. Fitting another scaler on `test` here would standardise the test
# split with its own statistics rather than the training ones.
scores = []
for name, classifier in zip(names, classifiers):
    # Fit on the training split, score F1 on the test split.
    classifier.fit(train,ytrain)
    y_pred = classifier.predict(test)
    f=f1_score(ytest, y_pred)
    print([name,f])
    scores.append(f)

In [None]:
# Point plot of the F1 score collected in `scores` for each model in `names`.
plt.figure(figsize=(20,10))
plt.grid()
plt.title("Base model performance")
# x and y are passed by keyword: seaborn >= 0.12 interprets the first positional
# argument as `data`, so the positional form no longer maps to (x, y).
sns.pointplot(x=names, y=scores)
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
random.seed(42)
# Search space for AdaBoost: decision-tree base estimators of depth 3, depth 5 and
# unlimited depth, crossed with ensemble size and learning rate.
params = dict(
    base_estimator=[DecisionTreeClassifier(max_depth=depth) for depth in (3, 5, None)],
    n_estimators=[50,100,150],
    learning_rate=[0.5,1],
)
clf = GridSearchCV(AdaBoostClassifier(), params, scoring='f1', verbose=3)
clf.fit(train,ytrain)

In [None]:
# Show the parameter combination selected by the grid search stored in `clf`.
clf.best_params_
# Result noted by the author from an earlier run (not re-verified here):
#depth=5, lr=0.5, n=50, .952

In [None]:
random.seed(42)
# Search space for AdaBoost with a random forest as the base estimator.
params = dict(
    base_estimator=[RandomForestClassifier()],
    n_estimators=[50,100,150,200,300,1000],
    learning_rate=[0.5,1,1.5],
)
grid = GridSearchCV(AdaBoostClassifier(), params, scoring='f1', verbose=3)
grid.fit(train,ytrain)

In [None]:
# Show the parameter combination selected by the grid search stored in `grid`.
grid.best_params_
# Result noted by the author from an earlier run (not re-verified here):
#1.5,300,.957

In [None]:
random.seed(42)
# Search space for AdaBoost whose base estimator is itself an AdaBoost over a random forest.
nested_base = AdaBoostClassifier(base_estimator=RandomForestClassifier())
params = dict(
    base_estimator=[nested_base],
    n_estimators=[50,100,150,200,300,1000],
    learning_rate=[0.5,1],
)
grid2 = GridSearchCV(AdaBoostClassifier(), params, scoring='f1', verbose=3)
grid2.fit(train,ytrain)
grid2.best_params_ # 0.5, 200

In [None]:
from sklearn.metrics import classification_report
random.seed(42)
# Fit AdaBoost over a random forest with fixed hyper-parameters, then print the
# per-class precision/recall/F1 report on the test split.
model = AdaBoostClassifier(
    base_estimator=RandomForestClassifier(),
    learning_rate=0.5,
    n_estimators=200,
)
model.fit(train,ytrain)
y_pred = model.predict(test)
print(classification_report(ytest, y_pred))

In [None]:
# Recover the feature names that match the columns `model` was trained on. The model
# was fitted on a scaled array, so the names are rebuilt from the raw CSV with the same
# column-shaping steps (drop, one-hot encode, drop arrival delay). The log transform is
# skipped because it does not change column names.
# A separate frame is used so the scaled `train` array is not overwritten.
feature_frame = pd.read_csv("../input/airline-passenger-satisfaction/train.csv")
feature_frame = feature_frame.drop(columns=['satisfaction','Unnamed: 0','id'])
feature_frame = pd.get_dummies(feature_frame,drop_first=True) #encoding categorical variables
feature_frame = feature_frame.drop(columns='Arrival Delay in Minutes')

plt.style.use('seaborn-dark-palette')
plt.figure(figsize=(10,5))
importances = pd.Series(model.feature_importances_, index=feature_frame.columns)
# Sort first, then keep the last ten entries, so the chart shows the ten most
# important features rather than whichever ten columns happen to come first.
importances.sort_values()[-10:].plot(kind='barh')

In [None]:
#Base Model: 
# Self-contained cell: rebuilds the data, fits a decision tree, reports its test
# metrics, then grid-searches the tree's hyper-parameters.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")
import random
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

train = pd.read_csv("../input/airline-passenger-satisfaction/train.csv")
test = pd.read_csv("../input/airline-passenger-satisfaction/test.csv")

# Binary targets: 1 where the label equals 'satisfied', 0 otherwise.
ytrain=(train['satisfaction']=='satisfied').astype(int) #encoding ys
ytest=(test['satisfaction']=='satisfied').astype(int)
train = train.drop(columns=['satisfaction','Unnamed: 0','id'])
test = test.drop(columns=['satisfaction','Unnamed: 0','id'])

logs=['Departure Delay in Minutes','Arrival Delay in Minutes'] #log transform due to distribution shape
for l in logs:
    train[l] = np.log(1+train[l])
    test[l] = np.log(1+test[l])
train=pd.get_dummies(train,drop_first=True) #encoding categorical variables
test =pd.get_dummies(test,drop_first=True)
train = train.drop(columns='Arrival Delay in Minutes')
test = test.drop(columns='Arrival Delay in Minutes')

# Standardise with statistics learned from the training split only, then apply the
# same transformation to the test split.
scaler = StandardScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

x_train=train
x_test=test
y_train=ytrain
y_test=ytest

# Seeding Python's `random` module does not seed scikit-learn, so the tree gets an
# explicit random_state to make its result repeatable.
model = DecisionTreeClassifier(random_state=40)
model.fit(x_train, y_train)
predictions = model.predict(x_test)

# Evaluation on the held-out test split (printed once).
print("Classification report:")
print(classification_report(y_test, predictions))
accuracy = accuracy_score(y_test,predictions)
print("Accuracy: ", accuracy)
print("Confusion matrix:")
print(confusion_matrix(y_test, predictions))


#Tuning: 
random.seed(40)
params = {'criterion': ['gini', 'entropy'], 
          'splitter': ['best', 'random'], 
          'max_depth': [10, 20 ,50, 200, None]}

gcv = GridSearchCV(estimator = DecisionTreeClassifier(random_state=40),param_grid = params, scoring = 'f1', verbose = 3 )
# Hyper-parameters are selected by cross-validation on the TRAINING split. Searching
# on the test split would tune the model to the data that is later used to score it.
gcv.fit(x_train, y_train)



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Baseline random forest: fit on the training split and report on the test split.
random.seed(42)
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(train, ytrain)
y_pred = rf_classifier.predict(test)
print(classification_report(ytest, y_pred))

# Create the parameter grid based on the results of random search 
param_grid = dict(
    max_depth=[80, 90, 100],
    n_estimators=[100, 200, 300],
    max_features=[2,3],
)

random.seed(42)
grid_search = GridSearchCV(rf_classifier, param_grid, scoring='f1', verbose=3)
grid_search.fit(train, ytrain)
est_grid = grid_search.best_params_
print(est_grid)

# Refit with fixed hyper-parameters (hard-coded, not read from `est_grid`) and
# report on the test split again.
rf_classifier = RandomForestClassifier(
    random_state=42,
    max_depth=80,
    n_estimators=300,
    max_features=3,
)
rf_classifier.fit(train, ytrain)
y_pred = rf_classifier.predict(test)
print(classification_report(ytest, y_pred))


In [None]:
# MODEL
# Self-contained cell: rebuilds the data, grid-searches a logistic regression,
# scores the tuned model on the test split and plots its largest coefficients.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import random

warnings.filterwarnings("ignore")
train = pd.read_csv("../input/airline-passenger-satisfaction/train.csv")
test = pd.read_csv("../input/airline-passenger-satisfaction/test.csv")

# Binary targets: 1 where the label equals 'satisfied', 0 otherwise.
ytrain=(train['satisfaction']=='satisfied').astype(int) #encoding ys
ytest=(test['satisfaction']=='satisfied').astype(int)
train = train.drop(columns=['satisfaction','Unnamed: 0','id'])
test = test.drop(columns=['satisfaction','Unnamed: 0','id'])

logs=['Departure Delay in Minutes','Arrival Delay in Minutes'] #log transform due to distribution shape
for l in logs:
    train[l] = np.log(1+train[l])
    test[l] = np.log(1+test[l])
train=pd.get_dummies(train,drop_first=True) #encoding categorical variables
test =pd.get_dummies(test,drop_first=True)
train = train.drop(columns='Arrival Delay in Minutes')
test = test.drop(columns='Arrival Delay in Minutes')

# Keep the column names before scaling turns the frame into a plain array; they
# label the coefficient plot at the end of the cell.
feature_names = train.columns

# Standardise with statistics learned from the training split only, then apply the
# same transformation to the test split.
scaler = StandardScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

# NOTE(review): 'elasticnet' normally needs l1_ratio to be set; without it those
# candidates presumably fail to fit and are scored as NaN - confirm before relying on them.
parameters = {'penalty':['l1','l2','elasticnet'],
          'C':[0.01,0.1,0.2,0.3,0.5,0.7,1,1.2,1.5,2,3,5,10],
             'solver':['saga']}

grid_search = GridSearchCV(estimator = LogisticRegression(),  
                       param_grid = parameters,
                       scoring = 'f1',
                       verbose = 3)

grid_search.fit(train, ytrain)
print(grid_search.best_params_)

# FINAL F1 SCORE: 0.8516424340333874
# (value recorded by the author from an earlier run; not re-verified here)

# Score the tuned logistic regression itself. The previous code scored a
# `predictions` array left over from the decision-tree cell.
y_pred = grid_search.predict(test)
f = f1_score(ytest, y_pred)
print("Score", f)

# PARAMETERS USING MODEL COEFF FOR LOGREG

# Coefficients come from the refitted best estimator of this search, not from the
# global `model`, which at this point is a model fitted in an earlier cell.
coefficients = pd.Series(grid_search.best_estimator_.coef_[0], index = feature_names)
# Keep the ten coefficients with the largest absolute value, then order them by
# signed value for the horizontal bar chart.
strongest = coefficients.abs().sort_values()[-10:].index
plt.style.use('seaborn-dark-palette')
plt.figure(figsize = (10,5))
coefficients.loc[strongest].sort_values().plot(kind = 'barh')



In [None]:
#KNN
import random
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

random.seed(42)
# Grid over neighbourhood size and neighbour weighting, scored by F1.
params = dict(
    n_neighbors=[5,10,20,50,100,200],
    weights=['uniform','distance'],
)
clf = GridSearchCV(KNeighborsClassifier(), params, scoring='f1', verbose=3)
clf.fit(train,ytrain)
clf.best_params_

In [None]:
from sklearn.metrics import classification_report
# Fit KNN with fixed hyper-parameters and print the test-split classification report.
model = KNeighborsClassifier(weights='distance', n_neighbors=10)
model.fit(train, ytrain)
y_pred = model.predict(test)
print(classification_report(ytest, y_pred))

In [None]:
from sklearn.metrics import classification_report
# Fit an L1-penalised logistic regression (saga solver, C=0.01) and print the
# test-split classification report.
model = LogisticRegression(penalty='l1', C=0.01, solver='saga')
model.fit(train, ytrain)
y_pred = model.predict(test)
print(classification_report(ytest, y_pred))

In [None]:
#condensed preprocessing
# Rebuilds `train`, `test`, `ytrain` and `ytest` from the raw CSV files in one cell.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

train = pd.read_csv("../input/airline-passenger-satisfaction/train.csv")
test = pd.read_csv("../input/airline-passenger-satisfaction/test.csv")

# Binary targets: 1 where the label equals 'satisfied', 0 otherwise.
ytrain=(train['satisfaction']=='satisfied').astype(int) #encoding ys
ytest=(test['satisfaction']=='satisfied').astype(int)

# Remove the label plus the 'Unnamed: 0' and 'id' columns from the features.
train = train.drop(columns=['satisfaction','Unnamed: 0','id'])
test = test.drop(columns=['satisfaction','Unnamed: 0','id'])

logs=['Departure Delay in Minutes','Arrival Delay in Minutes'] #log transform due to distribution shape
for l in logs:
    train[l] = np.log(1+train[l])
    test[l] = np.log(1+test[l])

train=pd.get_dummies(train,drop_first=True) #encoding categorical variables
test =pd.get_dummies(test,drop_first=True)
train = train.drop(columns='Arrival Delay in Minutes')
test = test.drop(columns='Arrival Delay in Minutes')

# Fit the scaler on the training features only and reuse its mean/variance for the
# test features. Fitting a second scaler on the test split would standardise the two
# splits with different statistics and let test-set information shape the inputs.
scaler = StandardScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import metrics
#https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
import random
random.seed(42)
names = [
    "LR",
    "KNN",
    "DTree",
    "RF",
    "ADA"
]

# Models with fixed hyper-parameters, in the same order as `names`.
# Seeding Python's `random` module does not seed scikit-learn estimators, so every
# estimator that uses randomness is given an explicit random_state.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=10, weights='distance'),
    DecisionTreeClassifier(criterion = 'entropy', max_depth = 20, splitter = 'random', random_state=42),
    RandomForestClassifier(random_state=42, max_depth= 80, n_estimators=300, max_features=3),
    AdaBoostClassifier(base_estimator=RandomForestClassifier(random_state=42),learning_rate=0.5,n_estimators=200,random_state=42)
]

# `train` and `test` are used as they are: the preprocessing cell above has already
# standardised them. Fitting another scaler on `test` here would standardise the test
# split with its own statistics rather than the training ones.

In [None]:
import time
# Fit each classifier, print how long fit + predict_proba took, and draw its ROC
# curve on its own figure next to the y = x diagonal.
for idx, classifier in enumerate(classifiers): #roc curves from https://www.statology.org/plot-roc-curve-python/
    started = time.time()
    classifier.fit(train,ytrain)
    # Second column of predict_proba for every test row.
    y_pred_proba = classifier.predict_proba(test)[:, 1]
    print(names[idx], time.time()-started)
    fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred_proba)
    line = np.linspace(0,1,100)
    plt.figure()
    #create ROC curve
    plt.plot(fpr,tpr)
    plt.plot(line,line)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid()
    plt.title(names[idx])
    plt.show() #0,0,37,17