# Airline model development using scikit-learn

### Read data from data asset
- Please provide the name of the CSV file you created in the pre-processing step.

In [11]:
import warnings
# Suppress every warning for the rest of the session. This is installed before
# pandas is imported, so warnings raised during that import are hidden too.
# NOTE(review): this also hides any warnings scikit-learn emits later on.
warnings.filterwarnings('ignore')
import pandas as pd

# Load the training data written by the pre-processing step.
df = pd.read_csv('/project_data/data_asset/train_flights_jan_2015.csv') # change the file name
# Preview the first rows to confirm the expected columns were loaded.
df.head()

Unnamed: 0,DAY,DAY_OF_WEEK,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DEPARTURE_DELAY,TAXI_OUT,DISTANCE,DELAYED
0,23,5,MSP,PHX,-2.0,13.0,1276,0
1,2,5,RAP,SLC,-6.0,32.0,508,0
2,8,4,STX,MIA,-5.0,9.0,1139,0
3,16,5,CLT,ATL,6.0,14.0,226,0
4,11,7,ATL,BDL,0.0,18.0,859,0


### Separate features and label

In [12]:
# Label: the DELAYED column on its own.
y = df['DELAYED']
# Features: every remaining column (df itself is left untouched).
X = df.drop(columns=['DELAYED'])

#### Separate categorical and numerical columns

In [13]:
# Columns handled as categories. DAY and DAY_OF_WEEK hold integers in the
# data, but listing them here means they are encoded as discrete labels.
cat = [
    "DAY",
    "DAY_OF_WEEK",
    "ORIGIN_AIRPORT",
    "DESTINATION_AIRPORT",
]

# Columns handled as continuous numeric values.
numeric = [
    "DEPARTURE_DELAY",
    "TAXI_OUT",
    "DISTANCE",
]

### Create preprocessor for categorical and numerical columns

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Numeric columns are standardised.
numeric_step = ('num', StandardScaler(), numeric)

# Categorical columns are one-hot encoded. handle_unknown='ignore' lets the
# encoder accept category values at predict time that were absent when fitting.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat)

# Apply each transformer to its own column group, numeric output first.
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])



## Create a pipeline with the preprocessor and an estimator

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# LogisticRegression is the active estimator; GradientBoostingClassifier
# (imported above) is an alternative that can be swapped in on this line.
clf = LogisticRegression()

# Chain preprocessing and the classifier so both are fitted and applied together.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', clf),
])

### K-fold cross validation

In [16]:
# For Bixi as Model Developer
from sklearn.model_selection import cross_val_score

# Score the whole pipeline with 5-fold cross-validation, so preprocessing is
# re-fitted inside each fold, then report the average of the fold scores.
scores = cross_val_score(estimator=pipe, X=X, y=y, cv=5)
scores.mean()

0.9133772530075503

### Fit the model 

In [17]:
# Train the pipeline on the full dataset. `fit` returns the pipeline itself,
# so `model` and `pipe` refer to the same fitted object.
model=pipe.fit(X,y);

## Test prediction

In [18]:
# Two hand-written flights for a quick sanity check. Values must be given in
# the same order as the training feature columns, since the frame reuses X.columns.
sample_rows = [
    [11, 7, "MSP", "PHX", 2, 11, 570],
    [10, 4, "MSP", "PHX", 20, 50, 600],
]
sdf = pd.DataFrame(sample_rows, columns=X.columns)
sdf

Unnamed: 0,DAY,DAY_OF_WEEK,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DEPARTURE_DELAY,TAXI_OUT,DISTANCE
0,11,7,MSP,PHX,2,11,570
1,10,4,MSP,PHX,20,50,600


In [19]:
# Run the fitted pipeline on the sample rows; returns one predicted DELAYED label per row.
model.predict(sdf)

array([0, 1])

## Store model to data asset

In [20]:
import pickle

# Persist the fitted pipeline as a single pickle artifact. The context manager
# guarantees the file is flushed and closed even if pickling raises, so the
# stored file is complete before anything else reads it.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
with open('/project_data/data_asset/airline-sklearn.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)