In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression

In [15]:
def _add_one_hot(data, features, column):
    """One-hot encode ``data[column]`` and register the new columns in ``features``.

    Returns a new frame: ``data`` joined (on the index) with one indicator
    column per distinct value, named ``<column>_<value>``.
    """
    # Pin an integer dtype: recent pandas versions default to bool indicators,
    # which would be serialised as "True"/"False" instead of 1/0.
    dummies = pd.get_dummies(data[column], prefix=column, dtype='uint8')
    features.extend(dummies.columns)
    return data.join(dummies)


def build_features(features, data):
    """Derive the model input columns from a merged sales/store frame.

    Parameters
    ----------
    features : list
        Extended in place with the names of the columns to feed the model,
        in the order they are created.
    data : pandas.DataFrame
        Must contain ``Open``, ``CompetitionDistance``, ``Promo``, ``Promo2``,
        ``SchoolHoliday``, ``StoreType``, ``Assortment``, ``StateHoliday``,
        ``Date`` (datetime64), ``CompetitionOpenSinceYear``,
        ``CompetitionOpenSinceMonth``, ``Promo2SinceYear``,
        ``Promo2SinceWeek`` and ``PromoInterval``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the engineered ones.
        The frame passed in is left untouched.
    """
    # Column-specific defaults have to be applied *before* the blanket fill,
    # otherwise they can never match: a missing ``Open`` means 1, a missing
    # ``PromoInterval`` means "no interval". Everything else defaults to 0.
    data = data.assign(
        Open=data['Open'].fillna(1),
        PromoInterval=data['PromoInterval'].fillna(''),
    ).fillna(0)

    # Use some properties directly
    features.extend(['CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label encode some features, then one-hot encode the codes.
    # Values without an entry in ``mappings`` are kept as they are.
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        data[column] = data[column].map(lambda value: mappings.get(value, value))
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        data = _add_one_hot(data, features, column)

    # Calendar parts of the date. ``DayOfWeek`` is 0-based (Monday == 0) and
    # overwrites any column of the same name already present in ``data``.
    dates = data['Date'].dt
    data['Year'] = dates.year
    data['Month'] = dates.month
    data['Day'] = dates.day
    data['DayOfWeek'] = dates.dayofweek
    # ``dt.weekofyear`` was removed from pandas; ``isocalendar().week`` is the
    # replacement and yields the same ISO week number.
    data['WeekOfYear'] = dates.isocalendar().week.astype(int)
    for column in ('Year', 'Month', 'Day', 'DayOfWeek', 'WeekOfYear'):
        data = _add_one_hot(data, features, column)

    # Months elapsed since the competitor opened.
    # NOTE(review): when CompetitionOpenSinceYear was missing (now 0) this is
    # a very large number; unlike PromoOpen it is not reset -- confirm intent.
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
        (data.Month - data.CompetitionOpenSinceMonth)

    # Months elapsed since Promo2 started: floored at 0, and 0 when the store
    # has no Promo2 start year.
    features.append('PromoOpen')
    promo_open = 12 * (data.Year - data.Promo2SinceYear) + \
        (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['PromoOpen'] = promo_open.where(promo_open > 0, 0)
    data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0

    # Flag rows whose month is listed in the store's PromoInterval.
    features.append('IsPromoMonth')
    # Abbreviations must match the spelling used inside PromoInterval strings.
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    # Callers may already have replaced missing intervals by 0.
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    # Iterate over the distinct interval strings rather than over every row.
    for interval in data.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1
    return data

                
# Column dtypes requested from the CSV parser.
types = {
    'CompetitionOpenSinceYear': np.dtype(int),
    'CompetitionOpenSinceMonth': np.dtype(int),
    'StateHoliday': np.dtype(str),
    'Promo2SinceWeek': np.dtype(int),
    'SchoolHoliday': np.dtype(float),
    'PromoInterval': np.dtype(str),
}

# Read the three raw tables; `parse_dates` picks the column to parse as
# dates by position, which differs between the train and test files.
train = pd.read_csv("../data/train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("../data/test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("../data/store.csv")

# Missing values in the sales tables are replaced by 1.
train = train.fillna(1)
test = test.fillna(1)

# Keep only training rows with Open != 0 and Sales > 0.
train = train[(train["Open"] != 0) & (train["Sales"] > 0)]

# Attach the per-store attributes to every sales row.
train = train.merge(store, on='Store')
test = test.merge(store, on='Store')

# The feature list is collected from the training frame only; the list
# produced for the test frame is discarded.
features = []
train = build_features(features, train)
test = build_features([], test)

# Every feature column that exists for train but not for test is added to
# test as an all-zero column.
for feature in features:
    if feature in test.columns:
        continue
    test[feature] = 0

In [16]:
# Display the names of the columns of `train` selected by `features`.
list(train[features].columns)

['CompetitionDistance',
 'Promo',
 'Promo2',
 'SchoolHoliday',
 'StoreType_1',
 'StoreType_2',
 'StoreType_3',
 'StoreType_4',
 'Assortment_1',
 'Assortment_2',
 'Assortment_3',
 'StateHoliday_0',
 'StateHoliday_1',
 'StateHoliday_2',
 'StateHoliday_3',
 'Year_2013',
 'Year_2014',
 'Year_2015',
 'Month_1',
 'Month_2',
 'Month_3',
 'Month_4',
 'Month_5',
 'Month_6',
 'Month_7',
 'Month_8',
 'Month_9',
 'Month_10',
 'Month_11',
 'Month_12',
 'Day_1',
 'Day_2',
 'Day_3',
 'Day_4',
 'Day_5',
 'Day_6',
 'Day_7',
 'Day_8',
 'Day_9',
 'Day_10',
 'Day_11',
 'Day_12',
 'Day_13',
 'Day_14',
 'Day_15',
 'Day_16',
 'Day_17',
 'Day_18',
 'Day_19',
 'Day_20',
 'Day_21',
 'Day_22',
 'Day_23',
 'Day_24',
 'Day_25',
 'Day_26',
 'Day_27',
 'Day_28',
 'Day_29',
 'Day_30',
 'Day_31',
 'DayOfWeek_0',
 'DayOfWeek_1',
 'DayOfWeek_2',
 'DayOfWeek_3',
 'DayOfWeek_4',
 'DayOfWeek_5',
 'DayOfWeek_6',
 'WeekOfYear_1',
 'WeekOfYear_2',
 'WeekOfYear_3',
 'WeekOfYear_4',
 'WeekOfYear_5',
 'WeekOfYear_6',
 'WeekOfYea

In [22]:
def toVWFile(data, features, target, fileName):
    """Write rows of ``data`` to ``fileName``, one text line per row.

    Each line has the form ``<label> | name:value name:value ... `` followed
    by a newline; every ``name:value`` pair is followed by a single space.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows.
    features : iterable of str
        Names of the columns to write, in output order.
    target : mapping indexed by the labels of ``data.index``, or None
        Label written in front of `` | ``; omitted when ``None``.
    fileName : str
        Path of the file to create or overwrite.
    """
    features = list(features)
    # Binary mode keeps the line endings byte-exact on every platform, so
    # each line is encoded explicitly before being written. ``with`` makes
    # sure the handle is flushed and closed even if a row fails.
    with open(fileName, 'wb') as f:
        # itertuples keeps each column's own dtype and avoids building a
        # Series per row; element 0 of every tuple is the index label.
        for row in data[features].itertuples(index=True, name=None):
            label = '' if target is None else str(target[row[0]])
            pairs = ''.join(
                feature + ':' + str(value) + ' '
                for feature, value in zip(features, row[1:])
            )
            f.write((label + ' | ' + pairs + '\n').encode('utf-8'))
        
# Export of the training rows (labelled with `train.Sales`) is currently
# disabled; uncomment the next line to regenerate "../data/vw_train".
# toVWFile(train, features, train.Sales, "../data/vw_train")
# Write the test rows without a label (target=None).
toVWFile(test, features, None, "../data/vw_test")