## 1. Prerequisites on libraries

In [None]:
## Library import
import os, sys
import datetime

## Libraries for basic data-processing & visualization
import numpy as np
import pandas as pd
!pip install -U seaborn
import seaborn as sns

## Libraries for decision-tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from IPython.display import Image
import graphviz
from sklearn.externals.six import StringIO
os.environ["PATH"] += os.pathsep + 'C:\Program Files (x86)\Graphviz2.38/bin/' # for adding to PATH
try:
    import pydotplus
except:
    !pip install pydotplus
    import pydotplus
    
## Libraries for logistic-regression
from sklearn.linear_model import LogisticRegression

## Library for evaluation on classification
from sklearn.metrics import confusion_matrix, f1_score

## Libraries for Keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

## Random seed for reproducibility, especially for Keras
from numpy.random import seed
seed(0)
try:
    from tensorflow import set_random_seed
except ImportError:
    ## TensorFlow 2.x exposes the global seed setter as tf.random.set_seed
    import tensorflow as tf
    set_random_seed = tf.random.set_seed
set_random_seed(0)

import matplotlib.pyplot as plt

import timeit

## import opencv for python
try:
    import cv2
except:
    !pip install opencv-python
    import cv2

from scipy import ndimage
try:
    from joblib import Parallel, delayed
except:
    !pip install joblib
    from joblib import Parallel, delayed    
    
%matplotlib inline

In [None]:
WKDIR = '../data/'
os.chdir(WKDIR)

## 2. Load data & its definition

In this chapter, we load the raw data from a CSV file. The original data comes from [this Kaggle dataset](https://www.kaggle.com/johndddddd/customer-satisfaction).

We also review the basic functions of pandas & numpy by comparing them with the equivalent SQL syntax. Please note in advance that this review does not cover all functions.

In [None]:
## Data import
SATISFACTION_FILE = 'satisfaction.csv'
df = pd.read_csv(SATISFACTION_FILE)

In [None]:
## Size of DataFrame (row numbers, column numbers)
df.shape

In [None]:
## Confirm data columns
df.columns

### Definition of each column


|Column|Description|Data examples|
|-----|:-----:|:-----|
|satisfaction_v2|Airline satisfaction level|satisfied, neutral or dissatisfied|
|Gender|Gender of the passengers |Female, Male|
|Customer Type|The customer type|Loyal customer, disloyal customer|
|Age|The actual age of the passengers||
|Type of Travel|Purpose of the flight of the passengers|Personal Travel, Business Travel|
|Class|Travel class in the plane of the passengers|Business, Eco, Eco Plus|
|Flight Distance|The flight distance of this journey||
|Seat comfort|Satisfaction level of Seat comfort||
|Departure/Arrival time convenient|Satisfaction level of Departure/Arrival time convenient||
|Food and drink|Satisfaction level of Food and drink||
|Gate location|Satisfaction level of Gate location||
|Inflight wifi service|Satisfaction level of the inflight wifi service|0:Not Applicable;1-5|
|Inflight entertainment|Satisfaction level of inflight entertainment||
|Online support|Satisfaction level of online support||
|Ease of Online booking|Satisfaction level of online booking||
|On-board service|Satisfaction level of On-board service||
|Leg room service|Satisfaction level of Leg room service||
|Baggage handling|Satisfaction level of baggage handling||
|Checkin service|Satisfaction level of Check-in service||
|Cleanliness|Satisfaction level of Cleanliness||
|Online boarding|Satisfaction level of online boarding||
|Departure Delay in Minutes|Minutes delayed when departure||
|Arrival Delay in Minutes|Minutes delayed when Arrival||



Ref. https://www.kaggle.com/johndddddd/customer-satisfaction/home

### 2.0.1 Projection

If SQL, ...
```SQL
SELECT
    TOP(10) [Flight Distance]
FROM
    TABLE_SATISFACTION
```

In [None]:
df['Flight Distance'].head(10)

### 2.0.2 Projection with multiple columns
If SQL, ...
```SQL
SELECT
    TOP(5) [Flight Distance], [Type of Travel]
FROM
    TABLE_SATISFACTION
```

In [None]:
df[['Flight Distance', 'Type of Travel']].head()

### 2.0.3 Select distinct

If SQL, ...

```SQL
SELECT
    DISTINCT [Flight Distance]
FROM
    TABLE_SATISFACTION
```

In [None]:
df['Flight Distance'].unique()

### 2.0.4 filter

If SQL, ...

```SQL
SELECT
    TOP(10) *
FROM
    TABLE_SATISFACTION
WHERE
    Gender = 'Female'
```

In [None]:
df.query("Gender == 'Female'").head(10)

### 2.0.5 order by

If SQL, ...
```SQL
SELECT
    *
FROM
    TABLE_SATISFACTION
ORDER BY
    [Flight Distance]
```

In [None]:
df.sort_values('Flight Distance').head(10)

### 2.0.6 group by & aggregate function

If SQL, ...
```SQL
SELECT
    AVG([Flight Distance])
    ,[Type of Travel]
FROM
    TABLE_SATISFACTION
GROUP BY
    [Type of Travel]
```

In [None]:
df[['Flight Distance', 'Type of Travel']].groupby('Type of Travel').mean()

### 2.0.7 Case statement

If SQL, ...
```SQL
SELECT
    CASE [satisfaction_v2]
        WHEN 'satisfied' THEN 1
        ELSE 0
    END AS [target]
FROM
    TABLE_SATISFACTION
```

In [None]:
## Vectorized CASE expression: 1 where the label is 'satisfied', 0 for any other value
df['target'] = (df['satisfaction_v2'] == 'satisfied').astype('int64')

### 2.0.8 group by & count

If SQL, ...
```SQL
SELECT
    COUNT(1)
    ,[Type of Travel]
FROM
    TABLE_SATISFACTION
GROUP BY
    [Type of Travel]
```

In [None]:
df['Type of Travel'].value_counts()

### 2.0.9 Populate dummy variables

If SQL, ...
```SQL
SELECT
    CASE [Type of Travel]
        WHEN 'Business travel' THEN 1
        ELSE 0
    END AS [Business travel]
    ,CASE [Type of Travel]
        WHEN 'Personal Travel' THEN 1
        ELSE 0
    END AS [Personal Travel]
FROM
    TABLE_SATISFACTION
```

In [None]:
pd.get_dummies(df['Type of Travel']).head()

### 2.0.10 pivot table

In [None]:
pd.pivot_table(df, values=['Flight Distance', 'Seat comfort', 'Online boarding'], index='Type of Travel', aggfunc='mean')

## 2.1 Slice on dataframe

In [None]:
## Extract first 10 rows
## Attention!! index of python starts from 0.
df[:10]

In [None]:
## Extract the rows at positions 15 to 19 (the end of a slice is exclusive)
df[15:20]

In [None]:
## Use iloc, If we focus on specific columns
df.iloc[15:20, [4]]

In [None]:
## Or, slice after projection by pandas
df['Age'][15:20]

In [None]:
## The above 2 objects are different, though...
print(type(df.iloc[15:20, [4]]))
print(type(df['Age'][15:20]))

## 2.2 Basic summary

### 2.2.1 Basic statistics for each column

In [None]:
df.describe().T

### 2.2.2 Check 'NA'(Not Applicable) value for each column

In [None]:
## Count False (value present) / True (value missing) per column.
## Builtin `int` replaces the `np.int` alias, which NumPy 1.24 removed.
df.isnull().apply(lambda col: col.value_counts(), axis=0).fillna(0).astype(int).T

## 2.3 Visualize & understand data

### 2.3.1 Check distribution on single column

In [None]:
## Visualize Flight distance
## (`distplot` is deprecated in seaborn; `histplot` on a density scale draws
##  the same kind of figure: normalized histogram plus KDE curve)
sns.histplot(df['Flight Distance'], kde=True, stat='density')

In [None]:
## To-Do: Put another column name into `col_single` to plot a different column
col_single = 'Food and drink'
## (`distplot` is deprecated in seaborn; `histplot` draws the plain count histogram)
sns.histplot(df[col_single], kde=False)

### 2.3.2 Understand relation of several columns

In [None]:
## Calculate correlation coefficient
np.corrcoef(df['Seat comfort'], df['Food and drink'])[1,0]

In [None]:
## To-Do: Put other column names into the two variables below
col1_corr = 'Inflight wifi service'
col2_corr = 'Inflight entertainment'

np.corrcoef(df[col1_corr], df[col2_corr])[1,0]

In [None]:
## Correlation matrix for some variables

col_corr = ['Flight Distance'
            ,'Seat comfort'
            ,'Food and drink'
            ,'Inflight wifi service'
            ,'Inflight entertainment'
            ,'Online support'
            ,'Cleanliness'
            ,'Ease of Online booking'
            ,'Departure Delay in Minutes']

plt.figure(figsize=(16,10))
sns.heatmap(df[col_corr].corr(), annot=True)

### 2.3.3 Scatter plot on 2 variables

In [None]:
## Try to depict scatter plot, ...
## (x / y are passed as keywords: seaborn >= 0.12 rejects them positionally)
sns.scatterplot(x=df['Food and drink'], y=df['Leg room service']) ## variation is too small ..:(

In [None]:
## with density for each observation
## (x / y are passed as keywords: seaborn >= 0.12 rejects them positionally)
sns.jointplot(x=df['Food and drink'], y=df['Leg room service'], kind="kde")

In [None]:
## Calculate correlational coefficient
np.corrcoef(df['Food and drink'], df['Leg room service'])[1,0]

In [None]:
def depict_corr_matrix(col1, col2):
    """Print the correlation coefficient of two columns of `df` and draw their joint KDE plot.

    Args:
        col1: Name of the `df` column plotted on the x axis.
        col2: Name of the `df` column plotted on the y axis.
    """
    x, y = df[col1], df[col2]
    ## Off-diagonal entry of the 2x2 correlation matrix is the coefficient of the pair
    corr = np.corrcoef(x, y)[1,0]
    print('Correlational coefficient is {}'.format(corr))
    ## Depict joint plot; seaborn >= 0.12 only accepts the data vectors as keywords
    sns.jointplot(x=x, y=y, kind="kde")

In [None]:
## In summary, ...
depict_corr_matrix('Food and drink', 'Leg room service')

In [None]:
## Try another combination
depict_corr_matrix('Inflight wifi service', 'Inflight entertainment')

In [None]:
## Try another combination
depict_corr_matrix('Food and drink', 'Seat comfort')

## 3. Machine Learning

In this chapter, we build statistical models that predict passengers' satisfaction from the other variables.

- Goal of this chapter
  - Select some given variables and confirm which variable impacts the satisfaction as a whole.
    - Decision Tree Classifier
    - Logistic Classifier

- Notes:
  - In order to move forward, we put `target` as target variable: 1: `satisfied`, 0: `neutral or dissatisfied`
  - Later (section 3.3), we pursue predictive accuracy with a deep-learning technique and compare its accuracy with that of the logistic classifier.

## 3.1 Decision Tree classifier

In [None]:
col_ml= ['Flight Distance'
        ,'Seat comfort'
        ,'Food and drink'
        ,'Inflight wifi service'
        ,'Inflight entertainment'
        ,'Online support'
        ,'Ease of Online booking'
        ,'Departure Delay in Minutes']

In [None]:
## Define decision tree model
clf = DecisionTreeClassifier(random_state=0, max_depth=4)

In [None]:
## Split the whole data into train & test:
## the first `Obs_for_train` rows (in file order) are used for training, the rest for testing
Obs_for_train = 120000
X_train, X_test = df[col_ml].iloc[:Obs_for_train], df[col_ml].iloc[Obs_for_train:]
y_train, y_test = df['target'].iloc[:Obs_for_train], df['target'].iloc[Obs_for_train:]

In [None]:
## Optimize parameters of the model
dt = clf.fit(X_train, y_train)

In [None]:
## Calculate accuracy
accuracy_dt = dt.score(X_test, y_test)
print(accuracy_dt)

In [None]:
## Check feature importance for each variable
dt.feature_importances_

In [None]:
sns.set()
plt.figure(figsize=(15,10))
## One bar per feature; x / y are passed as keywords because seaborn >= 0.12
## rejects them positionally
sns.barplot(x=col_ml, y=dt.feature_importances_)

In [None]:
## F1 score
f1_score(y_test, dt.predict(X_test))

In [None]:
## Depict tree
## Export the fitted tree `dt` as Graphviz DOT text into an in-memory buffer
dot_data = StringIO()
tree.export_graphviz(dt 
                     ,out_file=dot_data
                     ,feature_names=col_ml
                     ,filled=True
                     ,rounded=True
                     ,special_characters=True)
## Parse the DOT text with pydotplus and show the rendered PNG inline
## (presumably needs the Graphviz binaries added to PATH in the import cell -- confirm)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

## 3.2 Logistic regression

In [None]:
Image(url= "https://www.cntk.ai/jup/logistic_neuron.jpg", width=300, height=200)

In [None]:
## Define logistic regression
clf_log = LogisticRegression(random_state=0, solver='lbfgs', max_iter=100)

In [None]:
clf_log.fit(X_train, y_train)

In [None]:
## Calculate accuracies
accuracy_lr = clf_log.score(X_test, y_test)
print(accuracy_lr)

In [None]:
## Each partial regression coefficient
clf_log.coef_

In [None]:
## Compare partial regression coefficients
sns.set()
plt.figure(figsize=(15,10))
## One bar per feature; x / y are passed as keywords because seaborn >= 0.12
## rejects them positionally
sns.barplot(x=col_ml, y=clf_log.coef_[0])
plt.title('Partial regression coefficient on logistic regression')

In [None]:
## Confusion matrix
confusion_matrix(y_test, clf_log.predict(X_test))

In [None]:
## F1 score
f1_score(y_test, clf_log.predict(X_test))

## 3.3 Deep Learning

- In this section, we pursue predictive accuracy with a deep-learning technique, specifically an MLP (Multi-Layer Perceptron). Please treat this content as an introduction to deep learning.

### 3.3.1 Equivalent to logistic regression

In [None]:
model1 = Sequential()
model1.add(Dense(1, input_dim=X_train.shape[1], activation='sigmoid'))

model1.compile(loss='binary_crossentropy'
             ,optimizer='adam'
             ,metrics=['accuracy'])

In [None]:
history1 = model1.fit(X_train, y_train, epochs=100, batch_size=1001, verbose=1, shuffle=False)

In [None]:
plt.plot(history1.history['loss'], label='train')
plt.legend()
plt.xlabel('epoch number')
plt.ylabel('loss')
plt.show()

In [None]:
loss1, accuracy_dl1 = model1.evaluate(X_test, y_test)
print('loss: {}, accuracy: {}'.format(loss1, accuracy_dl1))

In [None]:
model1.summary()

### 3.3.2 MLP -- Adding layers

In [None]:
model2 = Sequential()
model2.add(Dense(5, input_dim=X_train.shape[1], activation='relu'))
model2.add(Dense(5, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy'
             ,optimizer='adam'
             ,metrics=['accuracy'])

In [None]:
history2 = model2.fit(X_train, y_train, epochs=100, batch_size=1000, verbose=1, shuffle=False)

In [None]:
plt.plot(history2.history['loss'], label='train')
plt.legend()
plt.xlabel('epoch number')
plt.ylabel('loss')
plt.show()

In [None]:
loss2, accuracy_dl2 = model2.evaluate(X_test, y_test)
print('loss: {}, accuracy: {}'.format(loss2, accuracy_dl2))

In [None]:
model2.summary()

## 3.4 Compare accuracies for all models

In [None]:
model_name = ['Decision Tree', 'Logistic regression', 'equivalent to Log-reg', 'MLP']
sns.set()
plt.figure(figsize=(15,10))
## One bar per model, in the same order as `model_name`; x / y are passed as
## keywords because seaborn >= 0.12 rejects them positionally
sns.barplot(x=model_name, y=[accuracy_dt, accuracy_lr, accuracy_dl1, accuracy_dl2])

## 4.0 Tips on execution

### 4.1 Save used memory

In [None]:
## Confirm current data sets
sys.getsizeof(X_train) # 7680104 bytes

In [None]:
## Convert sparse format
## (`DataFrame.to_sparse` was removed in pandas 1.0; casting every column to a
##  SparseDtype with fill value 0 keeps each column's dtype and stores only non-zero entries)
X_train_sparse = pd.get_dummies(X_train).pipe(
    lambda dense: dense.astype(
        {col: pd.SparseDtype(dense[col].dtype, 0) for col in dense.columns}))

In [None]:
## Confirm converted data sets
sys.getsizeof(X_train_sparse) # 7028848 bytes

In [None]:
def ck_accuracy(X_train, y_train, X_test, y_test):
    """Fit the notebook-level classifier `clf` and print its score on the test data.

    Args:
        X_train: Training features passed to `clf.fit`.
        y_train: Training labels passed to `clf.fit`.
        X_test: Test features passed to `score`.
        y_test: Test labels passed to `score`.
    """
    ## Train first, then evaluate the fitted estimator on the held-out data
    fitted = clf.fit(X_train, y_train)
    print(fitted.score(X_test, y_test))

In [None]:
## Original data sets
ck_accuracy(X_train, y_train, X_test, y_test)

## Converted data sets
ck_accuracy(X_train_sparse, y_train, X_test, y_test)

### 4.2 Check process time

In [None]:
## Define sample function
def test(n):
    """Sum the integers 0 .. n-1 and discard the result.

    The return value is always None; the function only serves as a
    workload for the timing cells that call it.
    """
    sum(range(n))

In [None]:
## Primitive method - difference between start & end timestamps

## Size of the workload handed to test()
n=1000000

## Timestamp taken right before the call ...
start_time = datetime.datetime.now()

test(n)

## ... and right after it; the difference is a datetime.timedelta
end_time = datetime.datetime.now()
process_time = end_time - start_time

print('Total process time is {}'.format(process_time))

In [None]:
%%timeit  #Magic function for iPython Notebook(=Jupyter Notebook)
test(n)

### 4.3 Parallel process

In [None]:
#### Definition of rotation
rIntr = 30
## Start angle
rs = 30
## Final angle
re = 330

## input images
Image_DIR = '../data/'
## output images
output_DIR = '../data/output/'

In [None]:
def getFilesInDirectory(directory, postfix = ""):
    """Return the names of all entries in `directory` that are not directories.

    Args:
        directory: Path of the directory to list.
        postfix: Optional file-name ending such as ".jpg". Matching is
            case-insensitive. An empty value disables the filter.

    Returns:
        A list of bare names (without the directory part), in the order
        produced by `os.listdir`.
    """
    fileNames = [s for s in os.listdir(directory)
                 if not os.path.isdir(os.path.join(directory, s))]
    if not postfix:
        return fileNames
    ## Lower-case the postfix as well, so that ".JPG" matches just like ".jpg"
    postfix = postfix.lower()
    return [s for s in fileNames if s.lower().endswith(postfix)]

## mirror inversion
def generate_flip(I):
    """Return a flipped copy of image `I` (cv2.flip with flip code 0).

    The flip is applied to a copy, so the array passed in is left untouched.
    """
    flipped = cv2.flip(I.copy(), 0)
    return flipped

## Generate(rotate) images & store for each image
def generate_rot(imgFilename, rs, re, rIntr, output_DIR, image_type):
    """Write rotated copies of one image into `output_DIR`.

    One file is written per angle in `range(rs, re+1, rIntr)`, named
    `<file name>_<image_type>_<angle as 3 digits>.jpg`.

    Args:
        imgFilename: Path of the input image, read with `cv2.imread`.
        rs: First rotation angle.
        re: Upper bound of the rotation angle (included when the step hits it).
        rIntr: Step between two successive angles.
        output_DIR: Directory that receives the generated files.
        image_type: Label embedded in the output file names; the value
            'Reverse' additionally flips the image before it is rotated.

    Raises:
        ValueError: If the input image cannot be read.
    """
    I = cv2.imread(imgFilename)
    ## cv2.imread reports failure by returning None instead of raising
    if I is None:
        raise ValueError('Could not read image: '+str(imgFilename))
    ## mirror-inversion, if needed
    if image_type == 'Reverse':
        I = generate_flip(I)
    ## Name the outputs after the bare file name so that they always land
    ## directly in output_DIR, even when imgFilename contains a directory part
    baseName = os.path.basename(imgFilename)
    for r in range(rs, re+1, rIntr):
        ## Rotation of each image (reshape=False keeps the input array shape)
        Irot = ndimage.rotate(I, r, reshape=False)
        ## Store images
        format_r = '{0:03d}'.format(r)
        FILENAME = baseName+'_'+str(image_type)+'_'+str(format_r)+'.jpg'
        ## Report success only after the file has really been written
        if cv2.imwrite(os.path.join(output_DIR,FILENAME), Irot):
            print('Completed for '+str(FILENAME))
        else:
            print('Failed to write '+str(FILENAME))

## Present all images in a folder
def show_images(imgFilenames, path, image_Num=10):
    """Print the name of, and display inline, at most `image_Num` images.

    Args:
        imgFilenames: Iterable of image file names located in `path`.
        path: Directory that contains the images.
        image_Num: Maximum number of files to process.
    """
    for ind, img in enumerate(imgFilenames):
        ## Stop after exactly image_Num files (a `<=` test would show one extra)
        if ind >= image_Num:
            break
        print(img)
        ## os.path.join also works when `path` has no trailing separator
        image = cv2.imread(os.path.join(path, img))
        ## Unreadable files come back as None; skip them instead of failing in imencode
        if image is None:
            print('Could not read image: '+str(img))
            continue
        ## Re-encode as JPEG bytes so the notebook can render the image inline
        decoded_bytes = cv2.imencode('.jpg', image)[1].tobytes()
        display(Image(data=decoded_bytes, width=300))

In [None]:
## Get file names of Input images
os.chdir(Image_DIR)
imgFilenames = getFilesInDirectory(Image_DIR, ".jpg")

## Make directory for output; an already existing directory is accepted,
## any other failure (e.g. missing permission) is raised instead of being swallowed
os.makedirs(output_DIR, exist_ok=True)

In [None]:
## Confirm the imported images
show_images(imgFilenames, Image_DIR)

In [None]:
## Check time lapse in rotating images with single processing
start = datetime.datetime.now()

## Baseline: rotate the images one after another in a plain loop
for imgFilename in imgFilenames:
    generate_rot(imgFilename=imgFilename, 
                                    rs=rs, 
                                    re=re, 
                                    rIntr=rIntr, 
                                    output_DIR=output_DIR, 
                                    image_type='Normal')

## Elapsed time of the whole loop as a datetime.timedelta
process_time = datetime.datetime.now() - start
print('Process time with single process: {}'.format(process_time))

In [None]:
## Check time lapse in rotating images with parallel processing
start = datetime.datetime.now()
## One delayed generate_rot call per image, dispatched through joblib with the
## "threading" backend; n_jobs=-1 presumably means "use all CPUs" -- confirm in joblib docs
Parallel(n_jobs=-1, verbose=3,
    backend="threading"
    )([delayed(generate_rot)(imgFilename=imgFilename, 
                                    rs=rs, 
                                    re=re, 
                                    rIntr=rIntr, 
                                    output_DIR=output_DIR,
                                    image_type='Normal') 
#                                    image_type='Reverse') ## if mirror inversion
                                   for imgFilename in imgFilenames])
## Elapsed time of the whole parallel run as a datetime.timedelta
process_time = datetime.datetime.now() - start
print('Process time with parallel process: {}'.format(process_time))

In [None]:
## Confirm the generated images
out_imgFilenames = getFilesInDirectory(output_DIR, ".jpg")
show_images(out_imgFilenames, output_DIR)

## 5. References

- pandas: https://pandas.pydata.org/
- seaborn: https://seaborn.pydata.org/
- scikit-learn
    - decision-tree: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
    - logistic regression: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    - metrics for classification: http://scikit-learn.org/stable/modules/classes.html#classification-metrics
- keras: https://keras.io/
- Magic function in iPython Notebook: https://ipython.readthedocs.io/en/stable/interactive/tutorial.html#magic-functions
- joblib: https://pypi.org/project/joblib/
- cv2(opencv for python): https://pypi.org/project/opencv-python/