In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
from jupyterthemes import jtplot
jtplot.style(theme='grade3')

In [102]:
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle
import xgboost as xgb

In [97]:
class Prepare_Data(object):
    def __init__(self, filename="flight_delays_data.csv"):
        self.cnt_var = ['Week', 'std_hour', 'delay_time', 'is_claim']
        self.bin_var = ['Arrival', 'Airline']
        self.drop_var = ['Departure', 'flight_no']
        self.filename = filename

    def normalize_df(self, df):
        scaler = MinMaxScaler(feature_range=(0, 1))
        values = scaler.fit_transform(df.values)
        return pd.DataFrame(values, columns=df.columns.tolist(), index=df.index)      
    
    def cvt_bin(self, df):
        for _var in self.bin_var:
            print "converting bin var:", _var
            for item in df[_var].unique():
                try:
                    field_name = _var + "_" + item
                    df[field_name] = df[[_var]] == item
                except Exception as e:
                    print e
                    print "value name:", item
        return df

    def cvt_delay_time(self, df):
        times = []
        for t in df['delay_time']:
            if t == "Cancelled":
                times.append(100)
            else:
                times.append(float(t))
        df['delay_time'] = times
        self.delay_min = min(times)
        self.delay_max = max(times)
        return df

    def load_raw_data(self, additional_kwargs, time_series):
        kwargs = {}
        if time_series:
            kwargs.update({
                'parse_dates': {"dt": ['flight_date']},
                'infer_datetime_format': True,
                'index_col': 'dt'
            })
        else:
            kwargs.update({'index_col': 'flight_id'})
            
        kwargs.update(additional_kwargs)
        df = pd.read_csv(
            self.filename, 
            na_values=['NaN', '?','nan'], 
            **kwargs)
        return df
    
    def clean_df(self, df):
        df.drop(self.drop_var + self.bin_var, axis=1, inplace=True)
        df = self.cvt_delay_time(df)
        return df
        
    def load_data(self, additional_kwargs={}, time_series=False):
        df = self.load_raw_data(additional_kwargs, time_series)
        df = self.cvt_bin(df)
        df = self.clean_df(df)
        df = self.cvt_datetime(df)
        df = self.normalize_df(df)
        return df
    
    def cvt_datetime(self, df, dt_label="flight_date"):
        flight_dates = [datetime.datetime.strptime(str_dt, '%Y-%m-%d').date() for str_dt in df['flight_date'].values]
        df['flight_year'] = [dt.year for dt in flight_dates]
        df['flight_month'] = [dt.month for dt in flight_dates]
        df['flight_day'] = [dt.day for dt in flight_dates]
        df.drop(dt_label, axis=1, inplace=True)
        return df
    
    def build_train(self, df, label="delay_time"):
        return df.drop('delay_time', axis=1), df['delay_time']

In [90]:
ppd = Prepare_Data()

In [91]:
df = ppd.load_data(additional_kwargs={}, time_series=False)

converting bin var: Arrival
converting bin var: Airline
cannot concatenate 'str' and 'float' objects
value name: nan




In [169]:
x, y = ppd.build_train(df)

In [98]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=33)
for x in [train_x, test_x, train_y, test_y]:
    print x.shape

In [103]:
rf = RandomForestRegressor()
rf.fit(train_x, train_y)
results = rf.predict(test_x)

ma_r = mean_absolute_error(results, test_y.values) 
ms_r = mean_squared_error(results, test_y.values) 
print "mean absolution error:", ma_r
print "mean_squared_error:", ms_r

In [170]:
from sklearn.model_selection import cross_val_score
# clf = RandomForestRegressor(kernel='linear', C=1)
scores = cross_val_score(rf, x, y, cv=5)

In [174]:
scores.mean()

0.8738308174287444

In [178]:
pickle.dump(x, open('x.pkl', 'wb'))

In [179]:
pickle.dump(y, open('y.pkl', 'wb'))

In [112]:
with open('model_{}.pkl'.format(datetime.datetime.now().strftime("%H%M"), 'wb') as f:
    pickle.dump(rf, f)