## Data munging starts: 
### Importing packages and data

In [210]:
import datetime
import numpy as np
from numpy import double
import pandas as pd

In [211]:
# Read the train and test sets provided by Rossmann from the local input/ folder.
print("Loading data start...")
train, test = [
    pd.read_csv(csv_path)
    for csv_path in ("input/train.csv", "input/test.csv")
]
print("Complete!")

Loading data start...
Complete!


In [212]:
# External data: store metadata, the per-store state table and economic data.
# NOTE(review): `store` is not referenced again in the visible cells; step 25
# reads input/store.csv a second time into `store_data`.
print("Loading data start...")
store, states, eco_data = [
    pd.read_csv(csv_path)
    for csv_path in ("input/store.csv", "input/store_states.csv", "input/economic_data.csv")
]
print("Complete!")

Loading data start...
Complete!


## Data munging part I:
### Clean up, merges and dates features

In [213]:
# 1: Impute Open = 1 for store 622 in test data.
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the column selection: the in-place form acts on an intermediate object and
# is not guaranteed to update `test` itself (pandas copy-on-write).
test['Open'] = test['Open'].fillna(1)
print('Step 1 Complete.')

Step 1 Complete.


In [214]:
# 2: Tag every row with the set it came from, then stack train and test into a
#    single frame so each feature only has to be implemented once.
train['type'], test['type'] = 'train', 'test'
all_data = pd.concat([train, test])
print('Step 2 Complete')

Step 2 Complete


In [215]:
# 3: Split the 'YYYY-MM-DD' date strings into integer year / month / day columns.
date_pieces = all_data['Date'].str.split('-')
for position, part_name in enumerate(('year', 'month', 'day')):
    # position 0 -> year, 1 -> month, 2 -> day
    all_data[part_name] = date_pieces.str[position].astype(int)
print('Step 3 Complete.')

Step 3 Complete.


In [216]:
# Spot-check the first row: the year, month and day columns should now be present.
all_data.head(1)

Unnamed: 0,Customers,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,type,year,month,day
0,555,2015-07-31,5,,1,1,5263,1,0,1,train,2015,7,31


In [217]:
# 4: Derive calendar features from the date.
#    'Date' is converted to datetime first so the .dt accessor is available.
all_data['Date'] = pd.to_datetime(all_data['Date'])
date_accessor = all_data['Date'].dt
all_data['day_of_year'] = date_accessor.dayofyear
all_data['quarter'] = date_accessor.quarter
# Boolean calendar flags are stored as 0/1 integers.
for flag_name in ('is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end'):
    all_data[flag_name] = getattr(date_accessor, flag_name).astype(int)
# Week number counted in blocks of 7 days from January 1st (not the ISO week).
all_data['weeknum'] = np.ceil(all_data['day_of_year'] / 7.)
print('Step 4 Complete.')

Step 4 Complete.


In [218]:
# Spot-check the first row after adding the calendar features (day_of_year ... weeknum).
all_data.head(1)

Unnamed: 0,Customers,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,...,year,month,day,day_of_year,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,weeknum
0,555,2015-07-31,5,,1,1,5263,1,0,1,...,2015,7,31,212,3,0,1,0,0,31


In [219]:
# 5: Standardize StateHoliday column: entries equal to the number 0 are
#    rewritten as the string '0'.
# .loc replaces the deprecated .ix indexer; with a boolean mask both select the
# same rows.
is_numeric_zero = all_data['StateHoliday'] == 0
all_data.loc[is_numeric_zero, 'StateHoliday'] = '0'
print('Step 5 Complete.')

Step 5 Complete.


In [220]:
# 6: Setting 'Open' to 0 if Sales are 0.
# .loc replaces the deprecated .ix indexer; with a boolean mask both select the
# same rows.
no_sales = all_data['Sales'] == 0
all_data.loc[no_sales, 'Open'] = 0
print('Step 6 Complete.')

Step 6 Complete.


In [221]:
# 7: Flag the first day of each promo run (PromoFirstDate = 1): the row has
#    Promo == 1 while the previous row of the same store has Promo == 0.
# Rows are sorted by store, then date, so shift(1) refers to the preceding row
# of that store. sort_values replaces the removed DataFrame.sort(columns=...).
all_data = all_data.sort_values(by=['Store', 'Date'], ascending=True, na_position='last')
previous_store = all_data['Store'].shift(1)
previous_promo = all_data['Promo'].shift(1)
promo_starts = (
    (all_data['Store'] == previous_store)
    & (all_data['Promo'] == 1)
    & (previous_promo == 0)
)
all_data['PromoFirstDate'] = 0
# .loc replaces the deprecated .ix indexer.
all_data.loc[promo_starts, 'PromoFirstDate'] = 1
print('Step 7 Complete.')

Step 7 Complete.


In [222]:
# 8: Attach each store's state. The inner join keeps only the rows whose store
#    appears in the states table.
all_data = all_data.merge(states, how='inner', on='Store')
print('Step 8 Complete.')

Step 8 Complete.


In [223]:
# Spot-check the first row after sorting and joining the states (PromoFirstDate, State).
all_data.head(1)

Unnamed: 0,Customers,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,...,day,day_of_year,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,weeknum,PromoFirstDate,State
0,0,2013-01-01,2,,0,0,0,1,a,1,...,1,1,1,1,0,1,0,1,0,HE


## Data munging part II: 
### Adding some useful functions:

In [224]:
# f1: Creating a function to summarize our data
def rstr(df):
    """Print a quick summary of a DataFrame.

    Prints, in order: the shape, a 72-character '=' rule, the unique values of
    every column, a second rule, and one flag per column telling whether that
    column contains any null value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.

    Returns
    -------
    None
        The summary is only printed.
    """
    separator = '=' * 72
    print(df.shape)
    print(separator)
    # Each column's unique values are wrapped in a one-element list, presumably
    # so that apply keeps them together as a single entry per column.
    print(df.apply(lambda x: [x.unique()]))
    print(separator)
    print(pd.isnull(df).any())
print('function 1: rstr added')

function 1: rstr added


In [225]:
# f2: Creating a function to shift columns and making sure to distinguish between stores
def shift_col(df, col_name, n):
    """Add a shifted copy of a column that never crosses a store boundary.

    A new column named ``col_name + str(n)`` is written into ``df`` in place.
    It holds the value of ``col_name`` found ``n`` rows earlier (``n > 0``) or
    ``-n`` rows later (``n < 0``). Wherever the row that far away belongs to a
    different 'Store' (or does not exist), the new value is NaN.

    The shift is positional, so ``df`` is expected to be ordered by store.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name`` and a 'Store' column; modified in place.
    col_name : str
        Name of the column to shift.
    n : int
        Number of rows to shift by; the sign selects the direction.

    Returns
    -------
    None
    """
    shifted_name = col_name + str(n)
    df[shifted_name] = df[col_name].shift(n)
    # Rows whose shifted neighbour belongs to another store (or is missing).
    crosses_store = df['Store'] != df['Store'].shift(n)
    # .loc replaces the deprecated .ix indexer.
    df.loc[crosses_store, shifted_name] = float('NaN')
print('function 2: shif_col added')

function 2: shif_col added


In [226]:
# f2.1: Demonstrating what shift_col does:
# temp = pd.DataFrame({
#         'Sales': [1,2,3,4,5,1,2,3,4,5],
#         'Store': [1,1,1,1,1,2,2,2,2,2]
#     })
# shift_col(temp, 'Sales', 2)
# shift_col(temp, 'Sales', -1)
# temp

## Data munging part III: 
### Adding schedule features, but no historic sales; instead we add average sales statistics

In [227]:
# 9: now we can add previous sales for the past 21 days:
# for i in range(1,22):
#     shift_col(all_data, 'Sales', i)
# print 'Step 9 Complete.'

In [228]:
# 9: Instead of lagged sales, add average-sales statistics per store:
#    overall, per month, per day of week and per week number.
# Test rows have no Sales value (NaN); mean() skips them, so the averages come
# from the rows that do have sales and are then attached to every row.
sales_statistics = [
    (['Store'], 'Sales_all_avg'),
    (['Store', 'month'], 'Sales_month_avg'),
    (['Store', 'DayOfWeek'], 'Sales_day_avg'),
    (['Store', 'weeknum'], 'Sales_week_avg'),
]
for group_keys, feature_name in sales_statistics:
    # DataFrame.groupby replaces the removed top-level pd.groupby function.
    group_means = (
        all_data.groupby(group_keys)['Sales']
        .mean()
        .to_frame(name=feature_name)
        .reset_index()
    )
    # One row per key combination on the right, so the left join keeps the row count.
    all_data = pd.merge(all_data, group_means, on=group_keys, how='left')

print('Step 9 Complete.')

Step 9 Complete.


In [229]:
# Spot-check the last row: the four Sales_*_avg columns should now be present.
all_data.tail(1)

Unnamed: 0,Customers,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,...,is_month_end,is_quarter_start,is_quarter_end,weeknum,PromoFirstDate,State,Sales_all_avg,Sales_month_avg,Sales_day_avg,Sales_week_avg
1058296,,2015-09-17,4,856,1,1,,0,0,1115,...,0,0,0,38,0,HE,5225.296178,4931.616667,5396.607407,4468.071429


In [230]:
# all_data[all_data['Store']==1][['Store','month','day','weeknum','Sales_all_avg','Sales_month_avg','Sales_day_avg','Sales_week_avg']]

In [231]:
# 10: Add the 'Open' flag of each of the 14 previous rows of the same store
#     (columns Open1 ... Open14).
for rows_back in range(14):
    shift_col(all_data, 'Open', rows_back + 1)
print('Step 10 Complete.')

Step 10 Complete.


In [232]:
# 11: Add the 'Open' flag of each of the 14 next rows of the same store
#     (columns Open-1 ... Open-14).
for lead in range(-1, -15, -1):
    shift_col(all_data, 'Open', lead)
print('Step 11 Complete.')

Step 11 Complete.


In [233]:
# Spot-check the first row after adding the shifted 'Open' columns.
all_data.head(1)

Unnamed: 0,Customers,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,...,Open-5,Open-6,Open-7,Open-8,Open-9,Open-10,Open-11,Open-12,Open-13,Open-14
0,0,2013-01-01,2,,0,0,0,1,a,1,...,0,1,1,1,1,1,1,0,1,1


## Data munging part IV: 
### Adding all weather data, with historical and forecast values

In [234]:
# 12: Load the weather file of every state present in all_data into one
#     DataFrame called weather.
# The frames are collected in a list and concatenated once, instead of growing
# `weather` with pd.concat on every iteration starting from an empty frame.
state_frames = []
for state in all_data['State'].unique():
    # The combined state 'HB,NI' is read from the file NI.csv but keeps its
    # 'HB,NI' label so that it still matches all_data['State'] when merging.
    file_stem = 'NI' if state == 'HB,NI' else state
    state_weather = pd.read_csv("data_for_features/Data_Weather/" + str(file_stem) + ".csv", sep=";")
    state_weather['State'] = state
    state_frames.append(state_weather)

weather = pd.concat(state_frames)
# Same dtype as all_data['Date'], which is required for merging on the date.
weather['Date'] = pd.to_datetime(weather['Date'])
print('Step 12 Complete.')

Step 12 Complete.


In [235]:
# 12.1: Checking that step 12 was ok especially for state 'HB,NI'
# weather[weather['State']=='HB,NI']

In [236]:
# Spot-check the first row of the combined weather table.
weather.head(1)

Unnamed: 0,Date,Max_TemperatureC,Mean_TemperatureC,Min_TemperatureC,Dew_PointC,MeanDew_PointC,Min_DewpointC,Max_Humidity,Mean_Humidity,Min_Humidity,...,Mean_VisibilityKm,Min_VisibilitykM,Max_Wind_SpeedKm_h,Mean_Wind_SpeedKm_h,Max_Gust_SpeedKm_h,Precipitationmm,CloudCover,Events,WindDirDegrees,State
0,2013-01-01,8,6,3,6,3,1,93,80,59,...,12,10,23,14,39,2.03,6,Rain,206,HE


In [237]:
# 13: Attach the weather of each row's state and date. The left join keeps
#     every row of all_data even when no weather record matches.
all_data = all_data.merge(weather, how='left', on=['State', 'Date'])
print('Step 13 Complete.')

Step 13 Complete.


In [238]:
# Spot-check the first row after merging the weather columns.
all_data.head(1)

Unnamed: 0,Customers,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,...,Max_VisibilityKm,Mean_VisibilityKm,Min_VisibilitykM,Max_Wind_SpeedKm_h,Mean_Wind_SpeedKm_h,Max_Gust_SpeedKm_h,Precipitationmm,CloudCover,Events,WindDirDegrees
0,0,2013-01-01,2,,0,0,0,1,a,1,...,31,12,10,23,14,39,2.03,6,Rain,206


In [239]:
# 14: Weather history and forecast for the mean temperature:
#     the 14 previous rows (offsets 1..14) followed by the 3 next rows
#     (offsets -1..-3) of the same store.
temperature_offsets = list(range(1, 15)) + [-1, -2, -3]
for offset in temperature_offsets:
    shift_col(all_data, 'Mean_TemperatureC', offset)

print('Step 14 Complete.')

Step 14 Complete.


In [240]:
# 15: Weather history and forecast for the max wind speed:
#     the 7 previous rows (offsets 1..7) followed by the 3 next rows
#     (offsets -1..-3) of the same store.
# NOTE(review): the original comment said "last 14 days" while the loop covers
# 7; confirm which one was intended.
wind_offsets = list(range(1, 8)) + [-1, -2, -3]
for offset in wind_offsets:
    shift_col(all_data, 'Max_Wind_SpeedKm_h', offset)

print('Step 15 Complete.')

Step 15 Complete.


In [241]:
# 16: Weather history and forecast for the precipitation:
#     the 7 previous rows (offsets 1..7) followed by the 3 next rows
#     (offsets -1..-3) of the same store.
# NOTE(review): the original comment said "last 14 days" while the loop covers
# 7; confirm which one was intended.
precipitation_offsets = list(range(1, 8)) + [-1, -2, -3]
for offset in precipitation_offsets:
    shift_col(all_data, 'Precipitationmm', offset)

print('Step 16 Complete.')

Step 16 Complete.


In [242]:
# 17: Weather history and forecast for the weather events:
#     the 14 previous rows (offsets 1..14) followed by the 3 next rows
#     (offsets -1..-3) of the same store.
event_offsets = list(range(1, 15)) + [-1, -2, -3]
for offset in event_offsets:
    shift_col(all_data, 'Events', offset)

print('Step 17 Complete.')

Step 17 Complete.


In [243]:
# Spot-check the first row after adding the shifted weather columns.
all_data.head(1)

Unnamed: 0,Customers,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,...,Events8,Events9,Events10,Events11,Events12,Events13,Events14,Events-1,Events-2,Events-3
0,0,2013-01-01,2,,0,0,0,1,a,1,...,,,,,,,,,Rain,Rain


## Data munging part V: 
### Adding all economic data

In [244]:
# 18: Convert the economic data's 'Date' to datetime so it has the same type
#     as all_data['Date']; otherwise the merge crashes.
parsed_eco_dates = pd.to_datetime(eco_data['Date'])
eco_data['Date'] = parsed_eco_dates
print('Step 18 Complete')

Step 18 Complete


In [245]:
# Spot-check the first row of the economic data.
eco_data.head(1)

Unnamed: 0,Date,DAX,Merck,MSCI,CLeadIndic,BusConf,ConsConf,SP_POP_TOTL,SP_POP_GROW,EN_POP_DNST,...,IT_NET_USER_P2,TG_VAL_TOTL_GD_ZS,BX_TRF_PWKR_CD_DT,BX_KLT_DINV_CD_WD,HHDEBT,HHFA,HHDI,HHFT,HHSAVFORECAST,HHSAV
0,2013-01-01,7612.390137,99.83,24.4,99.22213,99.83537,99.86818,80425823,-1.691349,230.750625,...,82.349998,72.69033,16433900000.0,54659880000.0,95.47638,39.5527,0.623393,8.310776,9.13566,9.260378


In [246]:
# 19: Attach the economic data of each row's date. The left join keeps every
#     row of all_data even when no economic record matches.
all_data = all_data.merge(eco_data, how='left', on=['Date'])
print('Step 19 Complete')

Step 19 Complete


In [247]:
# Spot-check the first row after merging the economic columns.
all_data.head(1)

Unnamed: 0,Customers,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,...,IT_NET_USER_P2,TG_VAL_TOTL_GD_ZS,BX_TRF_PWKR_CD_DT,BX_KLT_DINV_CD_WD,HHDEBT,HHFA,HHDI,HHFT,HHSAVFORECAST,HHSAV
0,0,2013-01-01,2,,0,0,0,1,a,1,...,82.349998,72.69033,16433900000.0,54659880000.0,95.47638,39.5527,0.623393,8.310776,9.13566,9.260378


In [248]:
# 20: Load the Google Trends file of every state present in all_data into one
#     DataFrame called google_trend, then merge it on state and date.
# The frames are collected in a list and concatenated once, instead of growing
# `google_trend` with pd.concat on every iteration starting from an empty frame.
trend_frames = []
for state in all_data['State'].unique():
    # The combined state 'HB,NI' is read from the file NI.csv but keeps its
    # 'HB,NI' label so that it still matches all_data['State'] when merging.
    file_stem = 'NI' if state == 'HB,NI' else state
    state_trend = pd.read_csv("data_for_features/Data_google_trends/" + str(file_stem) + ".csv", sep=";")
    state_trend['State'] = state
    trend_frames.append(state_trend)

google_trend = pd.concat(trend_frames)
# Same dtype as all_data['Date'], which is required for merging on the date.
google_trend['Date'] = pd.to_datetime(google_trend['Date'])
all_data = pd.merge(all_data, google_trend, on=['State', 'Date'], how='left')
print('Step 20 Complete.')

Step 20 Complete.


In [249]:
# 21: Add the global Google Trends series as well, joined on the date only.
google_trend_all = pd.read_csv("data_for_features/Data_google_trends/ALL.csv", sep=";")
parsed_trend_dates = pd.to_datetime(google_trend_all['Date'])
google_trend_all['Date'] = parsed_trend_dates
all_data = all_data.merge(google_trend_all, how='left', on=['Date'])
print('Step 21 Complete.')

Step 21 Complete.


In [250]:
# Spot-check the first row after adding Goog_trend and Goog_trend_ALL.
all_data.head(1)

Unnamed: 0,Customers,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,...,BX_TRF_PWKR_CD_DT,BX_KLT_DINV_CD_WD,HHDEBT,HHFA,HHDI,HHFT,HHSAVFORECAST,HHSAV,Goog_trend,Goog_trend_ALL
0,0,2013-01-01,2,,0,0,0,1,a,1,...,16433900000.0,54659880000.0,95.47638,39.5527,0.623393,8.310776,9.13566,9.260378,61,61


## Data munging part VI: 
### Adding more features

In [251]:
# 22: Flag the second day of each promo run (PromoSecondDate = 1): the row has
#     Promo == 1, the previous row is a promo first day, and the row two
#     positions back still belongs to the same store.
all_data['PromoSecondDate'] = 0
same_store_two_back = all_data['Store'] == all_data['Store'].shift(2)
follows_promo_start = all_data['PromoFirstDate'].shift(1) == 1
second_promo_day = same_store_two_back & (all_data['Promo'] == 1) & follows_promo_start
# .loc replaces the deprecated .ix indexer.
all_data.loc[second_promo_day, 'PromoSecondDate'] = 1
print('Step 22 Complete.')

Step 22 Complete.


In [252]:
# 23: Adding refurbishment features.
#     DayBeforeRefurb = 1: the store is open on this row and closed on each of
#                          the 5 following rows of the same store.
#     DayAfterRefurb  = 1: the store is open on this row and closed on each of
#                          the 5 preceding rows of the same store.
closed_run_length = 5
for flag_name, direction in (('DayBeforeRefurb', -1), ('DayAfterRefurb', 1)):
    # direction -1 looks at the following rows, +1 at the preceding rows.
    is_refurb_edge = all_data['Open'] == 1
    for step in range(1, closed_run_length + 1):
        offset = direction * step
        same_store = all_data['Store'] == all_data['Store'].shift(offset)
        closed = all_data['Open'].shift(offset) == 0
        is_refurb_edge = is_refurb_edge & same_store & closed
    all_data[flag_name] = 0
    # .loc replaces the deprecated .ix indexer.
    all_data.loc[is_refurb_edge, flag_name] = 1
print('Step 23 Complete.')

Step 23 Complete.


In [253]:
# 24: Adding more refurbishment features.
#     DaysBeforeRefurb = k + 1 when the row k positions later (k = 0..14) of
#                        the same store is flagged DayBeforeRefurb.
#     DaysAfterRefurb  = k + 1 when the row k positions earlier (k = 0..14) of
#                        the same store is flagged DayAfterRefurb.
#     Both stay 0 when no flagged row lies within that window.
refurb_window = 15
for counter_name, flag_name, direction in (
        ('DaysBeforeRefurb', 'DayBeforeRefurb', -1),
        ('DaysAfterRefurb', 'DayAfterRefurb', 1)):
    all_data[counter_name] = 0
    # Distances are applied in increasing order, so when two flagged rows fall
    # inside the window the larger distance overwrites the smaller one.
    for distance in range(refurb_window):
        offset = direction * distance
        # Only count a flag that belongs to the same store: without this guard
        # the rows at the edge of one store would inherit the refurbishment of
        # the neighbouring store in the sorted frame.
        same_store = all_data['Store'] == all_data['Store'].shift(offset)
        flagged = all_data[flag_name].shift(offset) == 1
        # .loc replaces the deprecated .ix indexer.
        all_data.loc[same_store & flagged, counter_name] = distance + 1
print('Step 24 Complete.')

Step 24 Complete.


In [254]:
# Spot-check the first row after adding the promo and refurbishment features.
all_data.head(1)

Unnamed: 0,Customers,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,...,HHFT,HHSAVFORECAST,HHSAV,Goog_trend,Goog_trend_ALL,PromoSecondDate,DayBeforeRefurb,DayAfterRefurb,DaysBeforeRefurb,DaysAfterRefurb
0,0,2013-01-01,2,,0,0,0,1,a,1,...,8.310776,9.13566,9.260378,61,61,0,0,0,0,0


## Data munging part VII: 
### Merging with Store data and creating sub training for ML that requires no NaN's

In [255]:
# 25: Load the store data and attach it to every row of the matching store.
# NOTE(review): input/store.csv was already read into `store` in the
# data-loading cell; this reads the same file a second time.
store_data = pd.read_csv("input/store.csv")
all_data = all_data.merge(store_data, how='left', on=['Store'])
print('Step 25 Complete')

Step 25 Complete


In [256]:
# Spot-check the first row after merging the store columns (StoreType ... PromoInterval).
all_data.head(1)

Unnamed: 0,Customers,Date,DayOfWeek,Id,Open,Promo,Sales,SchoolHoliday,StateHoliday,Store,...,DaysAfterRefurb,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,0,2013-01-01,2,,0,0,0,1,a,1,...,0,c,a,1270,9,2008,0,,,


In [257]:
# 26: Drop the earliest observations so that the features relying on
#     historical data (schedule and weather shifts) can be kept.
#     The filter removes every row dated 1-15 January 2013.
# NOTE(review): the original comment spoke of the first 14 observations of
# each store, while the condition drops 15 calendar days; confirm the intent.
in_first_days = (
    (all_data['year'] == 2013)
    & (all_data['month'] == 1)
    & (all_data['day'] <= 15)
)
all_data_no_NA = all_data[~in_first_days]
print('Step 26 Complete')

Step 26 Complete


In [258]:
# 27: Build the list of column names that contain no NaN at all, in column order.
column_has_nan = all_data_no_NA.isnull().any()
no_NaN = column_has_nan[~column_has_nan].index.tolist()
print('Step 27 Complete')

Step 27 Complete


In [259]:
# 28: Keep only the NaN-free columns, plus 'Id', which is needed for the test set.
# NOTE(review): 'Sales' is NaN on the test rows, so it is not part of no_NaN
# and is therefore absent from all_data_no_NA (train rows included); confirm
# that the target is meant to be left out here.
kept_columns = no_NaN + ['Id']
all_data_no_NA = all_data_no_NA.loc[:, kept_columns]
print('Step 28 Complete')

Step 28 Complete


## Saving our files:

In [260]:
# 29: Recap of the two frames produced by this notebook.
summary = '''
We have 2 data frames:
all_data       : has everything we have engineered so far, and will be used for trees mainly
all_data_no_NA : has only rows and columns that have no NA's at all
'''
print(summary)


We have 2 data frames:
all_data       : has everything we have engineered so far, and will be used for trees mainly
all_data_no_NA : has only rows and columns that have no NA's at all



In [261]:
# Write the train and test parts of both frames to ../data.
# Prerequisite: create a folder 'data' right outside the folder 'KaggleProject'.
outputs = (
    (all_data, 'train', "../data/train.csv"),
    (all_data, 'test', "../data/test.csv"),
    (all_data_no_NA, 'train', "../data/train_no_NA.csv"),
    (all_data_no_NA, 'test', "../data/test_no_NA.csv"),
)
for frame, subset, csv_path in outputs:
    # The 'type' column added in step 2 tells the train and test rows apart.
    frame[frame['type'] == subset].to_csv(csv_path, index=False)

print('Files are saved and ready')

Files are saved and ready
