In [1]:
import numpy as np
import pandas as pd
from geopy.distance import vincenty

In [2]:
# Load the raw train/test tables. DataFrame.from_csv was removed from pandas;
# read_csv with index_col=0 and parse_dates=True is its documented equivalent
# (first CSV column becomes the index, index parsed as dates where possible).
data = pd.read_csv("../train.csv", index_col=0, parse_dates=True)
test = pd.read_csv("../test.csv", index_col=0, parse_dates=True)

# Feature frame: every column of `data` except the last one
# (presumably the target column -- TODO confirm against train.csv).
# .copy() gives X its own data, so adding columns to it later does not
# trigger SettingWithCopyWarning or depend on `data`.
X = data[data.columns[:-1]].copy()

# Number of training rows; used later to split the combined frame back apart.
train_len = len(X)

In [3]:
# Reference points as [latitude, longitude]; city_id() returns the position
# of the nearest one in this list.
# NOTE(review): the trailing "#N" tags are kept from the original; they do not
# match the list positions, and their meaning is not visible here -- confirm.
# City names are inferred from the coordinates -- confirm.
centers = [
    [55.754216, 37.61343],   # 0 (presumably Moscow)
    [55.798551, 49.106324],  # 3 (presumably Kazan)
    [51.661535, 39.200287],  # 1 (presumably Voronezh)
    [56.326887, 44.005986],  # 2 (presumably Nizhny Novgorod)
    [59.939095, 30.315868],  # 4 (presumably Saint Petersburg)
]

# Starting "best distance so far" for the nearest-center search in city_id();
# far larger than any distance the search can produce in km.
minminmin = 10 ** 15

def city_id(coords):
    """Return the position in ``centers`` of the center closest to ``coords``.

    Parameters
    ----------
    coords : sequence
        A point in the form accepted by ``vincenty`` (the callers pass a
        two-element ``[a, b]`` list, like the entries of ``centers``).

    Returns
    -------
    int
        Index into ``centers`` of the nearest center, or ``-1`` when even the
        nearest center is more than 1000 km away.
    """
    # NOTE(review): geopy's `vincenty` may be unavailable in newer geopy
    # releases (`geodesic` is the usual replacement) -- confirm the pinned
    # geopy version before upgrading.
    nearest_idx = 0
    nearest_km = minminmin
    for idx, center in enumerate(centers):
        km = vincenty(coords, center).km
        if km < nearest_km:
            nearest_idx, nearest_km = idx, km

    # Points farther than 1000 km from every center belong to no known city.
    return -1 if nearest_km > 10**3 else nearest_idx

In [4]:
# Label every training row with the index of its nearest known city.
# NOTE(review): positions 5 and 6 of a raw training row are presumably the
# lat/lon columns -- confirm against the column order of train.csv.
XV = X.values
X['city_id'] = [city_id(list(row[5:7])) for row in XV]

In [5]:
# Label every test row with the index of its nearest known city.
# NOTE(review): positions 3 and 4 of a raw test row are presumably the
# lat/lon columns -- confirm against the column order of test.csv.
testV = test.values
test['city_id'] = [city_id(list(row[3:5])) for row in testV]

In [6]:
# Columns shared by train and test that feed the preprocessing step.
x_col = [
    'dist', 'due', 'lat', 'lon',
    'f_class', 's_class', 't_class',
    'city_id',
]

# Stack train on top of test so both go through the same preprocessing;
# the first train_len rows of the result are the training rows.
X_and_test = pd.concat([X.loc[:, x_col], test.loc[:, x_col]], axis=0)

In [7]:
# Holiday data taken from http://eduscan.net/help/calendar2014 and http://newslab.ru/article/559455

# (day, month) pairs parsed from "day.month" strings.
# Built as a concrete list: on Python 3 a bare map() is a one-shot iterator,
# so repeated `in holidays` checks would exhaust it and silently return False.
holidays = [
    tuple(int(part) for part in date.split('.'))
    for date in (
        "1.01,2.01,3.01,4.01,5.01,6.01,7.01,8.01,"
        "23.02,8.03,9.03,10.03,1.05,2.05,3.05,4.05,9.05,10.05,11.05,12.06,13.06"
    ).split(',')
]

# Extractor for categorical features: turns dict records into indicator columns.
from sklearn.feature_extraction import DictVectorizer
# Builtin bool is used because the np.bool alias was deprecated in NumPy 1.20
# and removed in 1.24; it is the same type the alias pointed to.
vectorizer = DictVectorizer(sparse=False, dtype=bool)

# dateutil's parser is used to read the date/time strings
from dateutil import parser

def preprocess_data(X_raw):
    """Turn a raw orders frame into a numeric feature table.

    Parameters
    ----------
    X_raw : pandas.DataFrame
        Must contain the columns ``due`` (date/time strings parseable by
        ``dateutil.parser.parse``), ``city_id``, ``f_class``, ``s_class``,
        ``t_class``, ``dist``, ``lat`` and ``lon``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with a fresh default index
        (the index of ``X_raw`` is not carried over). Columns are ``dist``,
        ``lat``, ``lon``, the columns produced by the module-level
        ``vectorizer`` for city / class / day-of-week, then
        ``time_of_day_rel`` and ``is_holiday``.

    Notes
    -----
    Calls ``vectorizer.fit_transform``, so the shared module-level vectorizer
    is re-fitted on every call.
    """
    # Parse the due timestamps.
    datetimes = list(X_raw.due.apply(parser.parse))

    # Time of day as a fraction of the day (0 at midnight, approaching 1).
    # Sequences are materialised explicitly: on Python 3 a bare map() is a
    # lazy iterator that np.array(), zip() and column assignment cannot use.
    rel_times = np.array(
        [(dt.hour * 60 + dt.minute) / (24. * 60) for dt in datetimes]
    )

    # ISO day of week: 1 = Monday ... 7 = Sunday.
    week_days = [dt.isoweekday() for dt in datetimes]

    # A local set gives repeatable, constant-time membership checks whatever
    # kind of iterable the module-level `holidays` is.
    holiday_lookup = set(holidays)
    is_holiday = [(dt.day, dt.month) in holiday_lookup for dt in datetimes]

    dow_names = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

    # One record per row for the DictVectorizer. The class levels are
    # hierarchical, so each lower level is prefixed with its parents.
    # Missing s_class / t_class values are passed through unchanged instead
    # of being turned into strings.
    class_columns = ['city_id', 'f_class', 's_class', 't_class']
    records = [
        {'city_id': str(city),
         'f_class': f,
         's_class': (s if pd.isnull(s) else (str(f) + str(s))),
         't_class': (t if pd.isnull(t) else (str(f) + str(s) + str(t))),
         'day_of_week': dow_names[day_of_week - 1]}
        for (city, f, s, t), day_of_week
        in zip(X_raw[class_columns].values, week_days)
    ]

    Xcat = vectorizer.fit_transform(records)

    other_features = ["dist", "lat", "lon"]
    Xreal = X_raw[other_features].values

    # Real-valued columns first, then the vectorizer's columns.
    Xfull = np.concatenate([Xreal, Xcat], axis=1)
    Xfull = pd.DataFrame(
        Xfull,
        columns=other_features + list(vectorizer.feature_names_),
    )

    # Both are plain positional sequences of length len(X_raw), so they line
    # up with the fresh default index of Xfull.
    Xfull["time_of_day_rel"] = rel_times
    Xfull["is_holiday"] = is_holiday

    return Xfull

In [8]:
# Replace the combined raw frame with its preprocessed feature table.
# Not idempotent: the result has no `due` column, so re-running this cell
# without re-running the concat cell will fail inside preprocess_data.
X_and_test = preprocess_data(X_raw=X_and_test)

In [11]:
# Split the combined feature table back apart by position:
# the first train_len rows are training rows, the remainder are test rows.
X = X_and_test.iloc[:train_len]
test = X_and_test.iloc[train_len:]

In [12]:
# Persist the preprocessed train and test feature tables (index included)
# to the current working directory.
X.to_csv(path_or_buf='X_preprocessed.csv')
test.to_csv(path_or_buf='test_preprocessed.csv')