In [1]:
import pandas as pd
import numpy as np
import glob
import requests
import json
import gc
import pickle
import concurrent.futures
RND_STATE = 100412

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

## Configuration

In [3]:
# weather links
# ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt

In [4]:
WEATHER_API_KEY = 'hyOjPAbXyAQlOtisZXUEdVLrfigDamkG'

In [5]:
STARTING_DATE = '2017-03-01'
END_DATE = '2018-03-01'

In [6]:
DATA_FOLDER = '../data/historical_data'
DICT_FOLDER = '../data/dictionaries'
WEATHER_FOLDER = '../data/weather_data'

In [7]:
DATA_FILE = '../data/merged_data.csv'

In [8]:
DATA_PICKLE = '../data/merged_data.data'

## Merging and loading data

### Loading flights data

In [54]:
def get_file_list(folder_name, prefix=''):
    return glob.glob(folder_name + '/*' + prefix + '.csv')

In [55]:
def read_csv(file_name):
    file_data = pd.read_csv(file_name, dtype={'CANCELLATION_CODE': str}, parse_dates=True)
    file_data['FL_DATE'] = pd.to_datetime(file_data.FL_DATE)
    return file_data

In [56]:
def read_csv_backup(file_name):
    file_data = pd.read_csv(file_name, dtype={'cancellation_code': str}, parse_dates=True)
    file_data['fl_date'] = pd.to_datetime(file_data.fl_date)
    file_data = file_data.drop(['Unnamed: 0'], axis = 1)
    return file_data

In [57]:
def gather_data(folder_name):
    file_list = get_file_list(folder_name)
    
    files_data = read_csv(file_list[0])
    for file in log_progress(file_list[1:], every=1):
        tmp = read_csv(file)
        files_data = pd.concat([files_data, tmp])
        del tmp
        
    files_data = files_data.reindex()
    files_data.columns = map(str.lower, files_data.columns)
    return files_data

In [58]:
def append_city_names(files_data, dict_folder_name):
    files_data_df = files_data.copy()
    city_info = pd.read_csv(dict_folder_name + '/city_codes_info.csv')
    city_info['Description'] =  city_info['Description'].str.replace(',.*|\/.*| City', '')
    
    city_info.columns = ['origin_city_market_id', 'origin_city_name']
    files_data_df = pd.merge(files_data_df, city_info, on='origin_city_market_id')
    
    city_info.columns = ['dest_city_market_id', 'dest_city_name']
    files_data_df = pd.merge(files_data_df, city_info, on='dest_city_market_id')
    
    del city_info
    
    return files_data_df

In [76]:
def load_data(folder_name, dict_folder_name):
    files_data = gather_data(folder_name)
    files_data = files_data.sort_values(by='fl_date')
    files_data = files_data.dropna(thresh=9)
    files_data = append_city_names(files_data, dict_folder_name)
    return files_data.reindex()

In [156]:
data = load_data(DATA_FOLDER, DICT_FOLDER)

VBox(children=(HTML(value=u''), IntProgress(value=0, max=0)))

In [157]:
data.to_csv(DATA_FILE)

### Loading weather data

In [158]:
def get_data_city_names(data_df):
    return list(set(list(data['origin_city_name'].values) + list(data['dest_city_name'].values)))

In [159]:
def load_city_codes(dict_folder_name):
    tmp = pd.read_csv(dict_folder_name + '/weather_city_codes_info.csv')
    tmp['name'] = tmp['name'].str.lower()
    tmp['name'] = tmp['name'].str.replace(',.*|\/.*| city', '')
    tmp['name'] = tmp['name'].str.strip()
    return tmp

In [160]:
def get_weather_city_codes(data_df, dict_folder_name):
    weather_cities_codes = load_city_codes(dict_folder_name)
    data_city_names = get_data_city_names(data_df)
    
    processed_cities = []
    failed_cities = []
    for city in data_city_names:
        city_data = weather_cities_codes[weather_cities_codes['name'].str.contains(city.lower())]
        if len(city_data) == 0:
            failed_cities.append(city)
        else:
            processed_cities.append({'name': city, 'weather_id': city_data['id'].values[0]})
    return processed_cities, failed_cities

In [161]:
def get_weather_for_city(city_info, start_date, end_date, api_key, limit=1000, offset=0):
    req_url = 'http://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&locationid={0}&startdate={1}&enddate={2}&limit={3}&offset={4}'
    req_url = req_url.format(city_info['weather_id'], start_date, end_date, limit, offset)
    
    print(city_info['weather_id'])
    result_json = requests.get(req_url, headers={'token': api_key}, timeout=20)
    result_json = json.loads(result_json.content)
    result_data = pd.DataFrame(result_json['results'])
    
    if result_json['metadata']['resultset']['count'] > offset + limit:
        return pd.concat([result_data, get_weather_for_city(city_info, start_date, end_date, api_key, limit, offset + limit)])
    else:
        return result_data

In [162]:
def save_weather_data(data_df, dict_folder_name, save_folder, start_date, end_date, api_key):
    weather_city_codes, error_cities = get_weather_city_codes(data_df, dict_folder_name)
    
    for city in log_progress(weather_city_codes, every=1):
        try:
            city_weather = get_weather_for_city(city, start_date, end_date, api_key)
            city_weather.to_csv(save_folder + '/' + city['name'] + '.csv')
        except Exception as e:
            error_cities.append(city)
            print(e)
    return weather_city_codes, error_cities

In [44]:
ok, err = save_weather_data(data, DICT_FOLDER, WEATHER_FOLDER, STARTING_DATE, END_DATE, WEATHER_API_KEY)

VBox(children=(HTML(value=u''), IntProgress(value=0, max=223)))

CITY:US370018
CITY:US370018
CITY:US370018
CITY:US370018
CITY:US370018
HTTPSConnectionPool(host='www.ncdc.noaa.gov', port=443): Read timed out. (read timeout=20)
CITY:US130001
CITY:US130001
CITY:US130001
CITY:US130001
CITY:US130001
CITY:US130001
HTTPSConnectionPool(host='www.ncdc.noaa.gov', port=443): Read timed out. (read timeout=20)
CITY:US260004
CITY:US260004
CITY:US260004
CITY:US260004
CITY:US260004
CITY:US260004
CITY:US010015
CITY:US010015
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US010015&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=1000 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd3b19750>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
CITY:US480001
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US480001&startdate

HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US120025&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd41f6210>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US210008
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US210008&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd41f6950>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US130005
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US130005&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Cau

CITY:US300006
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US300006&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd3af8910>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US420001
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US420001&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd3af8d10>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US470010
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US470010&startdate=2017-03-01&enddate=2018-03-01&limit=1000

HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US320004&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd3af8f10>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US260014
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US260014&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd4127090>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US420007
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US420007&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Cau

HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US300001&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd2f97750>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US490002
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US490002&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd2f978d0>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US160009
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US160009&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Cau

HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US220016&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd41234d0>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US010004
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US010004&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd41233d0>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US490007
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US490007&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Cau

CITY:US010012
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US010012&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd3b19710>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US470016
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US470016&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd41f6e90>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US480002
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US480002&startdate=2017-03-01&enddate=2018-03-01&limit=1000

HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US350009&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd3af8910>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US320006
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US320006&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8fd4123bd0>: Failed to establish a new connection: [Errno -2] Name or service not known',))
CITY:US360009
HTTPConnectionPool(host='www.ncdc.noaa.gov', port=80): Max retries exceeded with url: /cdo-web/api/v2/data?datasetid=GHCND&locationid=CITY:US360009&startdate=2017-03-01&enddate=2018-03-01&limit=1000&offset=0 (Cau

In [163]:
def filter_data_by_available_weather(data_df, weather_data_folder, prefix=''):
    city_list = get_file_list(weather_data_folder, prefix)
    fixed_city_list = []
    for city in city_list:
        fixed_city_list.append(city.replace(weather_data_folder + '/', '').replace('.csv', '').replace(prefix, ''))
    
    tmp_df = data_df[(data_df['origin_city_name'].isin(fixed_city_list)) | (data_df['dest_city_name'].isin(fixed_city_list))]
    return tmp_df, city_list 

In [164]:
def read_weather_file(weather_data_folder, weather_file_path, prefix=''):
    weather_data = pd.read_csv(weather_file_path)
    weather_data['date'] = pd.to_datetime(weather_data.date)
    weather_data = weather_data.drop(['Unnamed: 0', 'attributes', 'station'], axis=1)
    weather_data = weather_data.rename(columns={'date': 'fl_date'})
    weather_data = weather_data.drop_duplicates(["fl_date", "datatype"])
    weather_data = weather_data.pivot_table(weather_data, index='fl_date', columns='datatype', aggfunc=sum, fill_value=0)
    weather_data = weather_data.reset_index()
    weather_data_value = weather_data['value'].copy()
    weather_data_value['fl_date'] = weather_data['fl_date']
    del weather_data
    selected_cols = ['fl_date', 'AWND', 'SNOW', 'SNWD', 'WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT07', 'WT08', 'WT09', 'WT10', 'WT11']
    available_cols = list(set(weather_data_value.columns).intersection(selected_cols))
    na_cols = list(set(selected_cols) - set(available_cols))
    weather_data_value = weather_data_value[available_cols]
    
    for na_col in na_cols:
        weather_data_value[na_col] = None
    
    weather_data_value = weather_data_value.rename({'SNOW': 'snowfall_mm', 'SNWD': 'snow_depth_mm', 'AWND': 'avg_wind_ms', 'WT08': 'smoke', 'WT01': 'fog', 'WT03': 'thunder'})
    weather_data_value['city_name'] = weather_file_path.replace(weather_data_folder + '/', '').replace('.csv', '').replace(prefix, '')
    
    return weather_data_value

In [165]:
def merge_data_with_weather(data_df, weather_data_folder, prefix=''):
    filtered_df, city_list = filter_data_by_available_weather(data_df, weather_data_folder, prefix=prefix)
    dfs = []
    
    for city in log_progress(city_list, every=1):
        tmp = read_weather_file(weather_data_folder, city, prefix=prefix)
        tmp = tmp.rename(columns={'city_name': 'origin_city_name'})
        dfs.append(pd.merge(filtered_df, tmp, on=['fl_date', 'origin_city_name']))
        tmp = tmp.rename(columns={'origin_city_name': 'dest_city_name'})
        dfs.append(pd.merge(filtered_df, tmp, on=['fl_date', 'dest_city_name']))
        del tmp
        gc.collect()
        
    final_df = pd.concat(dfs)
    final_df = final_df.sort_values(by=['fl_date', 'origin_city_name', 'dest_city_name'])
    return final_df

In [166]:
data2016 = merge_data_with_weather(data, WEATHER_FOLDER, prefix='2016')

VBox(children=(HTML(value=u''), IntProgress(value=0, max=224)))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [167]:
data2017 = merge_data_with_weather(data, WEATHER_FOLDER, prefix='2017')

VBox(children=(HTML(value=u''), IntProgress(value=0, max=222)))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [168]:
data_merged = pd.concat([data2016, data2017])

In [169]:
data_merged = data_merged.sort_values(by=['fl_date', 'origin_city_name', 'dest_city_name'])

In [170]:
data = data_merged

In [171]:
data = data.drop_duplicates(subset=['origin', 'dest', 'op_carrier_fl_num', 'op_unique_carrier', 'year', 'day_of_month', 'day_of_week', 'month'])

In [172]:
len(data)

15315

In [173]:
data.to_csv(DATA_FILE)

## Preprocessing data 

In [174]:
data = read_csv_backup(DATA_FILE)

In [175]:
def get_dependent_variable_value(cancelled, cancelled_code, dep_delay_new):
    if cancelled == 1:
        return 'cancelled_flight'
    if dep_delay_new > 30:
        return 'delay'
    return 'no_delay'

In [179]:
def preprocess_data(file_data):
    data_df = file_data.copy()
    data_df[['WT01', 'WT02', 'WT04', 'WT05', 'WT10', 'WT11']] = data_df[['WT01', 'WT02', 'WT04', 'WT05', 'WT10', 'WT11']].fillna(value=0)
    data_df = data_df.drop(['origin_airport_id', 'origin_airport_seq_id', 'origin_city_market_id', 'dest_airport_id', 'dest_airport_seq_id', 'dest_city_market_id', 'year',  'dep_time'], axis=1)
    data_df = data_df.fillna(value={'cancellation_code': 'E'})
    data_df['fog'] = list(map(int , (data_df['WT01'] + data_df['WT02']).values > 0))
    data_df['hail'] = list(map(int , (data_df['WT04'] + data_df['WT05']).values > 0))
    data_df['damaging_wind'] = list(map(int , (data_df['WT10'] + data_df['WT11']).values > 0))
    data_df = data_df.drop(['WT01', 'WT02', 'WT04', 'WT05', 'WT06', 'WT10', 'WT11'], axis=1)
    data_df = data_df.rename(columns={'SNOW': 'snowfall', 'SNWD': 'snow_depth', 'AWND': 'average_wind_speed','WT03': 'thunder', 'WT07': 'dust', 'WT08': 'haze', 'WT09': 'snow'})
    data_df['cancellation_code'] = list(map(str, data_df['cancellation_code']))
    data_df['op_unique_carrier'] = list(map(str, data_df['op_unique_carrier']))
    data_df['dest'] = list(map(str, data_df['dest']))
    data_df['dest_city_name'] = list(map(str, data_df['dest_city_name']))
    data_df['origin'] = list(map(str, data_df['origin']))
    data_df['origin_city_name'] = list(map(str, data_df['origin_city_name']))
    data_df['cancelled'] = list(map(int, data_df['cancelled']))
    data_df['diverted'] = list(map(int, data_df['diverted']))
    
    with concurrent.futures.ProcessPoolExecutor(16) as pool:
        data_df['status'] = list(pool.map(get_dependent_variable_value, data_df['cancelled'], data_df['cancellation_code'], data_df['dep_delay'], chunksize=1000))
    return data_df

In [180]:
def save_file(file_name, data_to_save):
    n_bytes = 2**31
    max_bytes = 2**31 - 1
    bytes_out = pickle.dumps(data_to_save)
    with open(file_name, 'w+b') as f_out:
        for idx in range(0, n_bytes, max_bytes):
            f_out.write(bytes_out[idx:idx+max_bytes])

In [181]:
data = preprocess_data(data)

Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
  

    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/mul

    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/mul

  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Bro

IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/q

    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/mul

    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/mul

    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
    send(obj)
IOError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/aaditya/anaconda2/lib/python2.7/mul

In [182]:
save_file(DATA_PICKLE, data)