In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, KBinsDiscretizer
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error
import numpy as np
from pprint import pprint
from tqdm.auto import tqdm
from tqdm import tqdm_notebook
from scipy.sparse import hstack
from collections import defaultdict, Counter
from ds_tools.ds_tools import CategoricalTransformer
import pickle
tqdm.pandas()

In [2]:
df_description = pd.read_csv('./data/data_definition.txt', sep='\t')

df_train_genba = pd.read_csv('./data/train_genba.tsv', sep='\t')
df_train_goto = pd.read_csv('./data/train_goto.tsv', sep='\t')

df_train = df_train_goto.merge(df_train_genba, on='pj_no', how='left')

df_test_genba = pd.read_csv('./data/test_genba.tsv', sep='\t')
df_test_goto = pd.read_csv('./data/test_goto.tsv', sep='\t')

df_test = df_test_goto.merge(df_test_genba, on='pj_no', how='left')
test_surface = df_test['tc_mseki']

df_train.drop(['id', 'kaoku_um', 'shu_sogi'], axis=1, inplace=True)
df_test.drop(['id', 'kaoku_um', 'shu_sogi'], axis=1, inplace=True)

continue_features = list(df_description[(df_description['データ型'] == '数値') & (df_description['項目名'] != 'pj_no')]['項目名'])

## Preprocessing

In [3]:
def combine(row, combine_list, tup):
    l = set()
    for col in tup:
        if pd.notnull(row[col]):
            l.add(row[col])
    combine_list.append(','.join(l))


combine_cols = [('yoto', 100), ('road_hk', 100), ('road_sb', 1), ('toshikuiki', 10), ('hokakisei', 10), ('kobetsu', 10)]
for i, tup in enumerate([['yoto1', 'yoto2'], ['road1_hk', 'road2_hk', 'road3_hk', 'road4_hk'], 
                         ['road1_sb', 'road2_sb', 'road3_sb', 'road4_sb'], ['toshikuiki1', 'toshikuiki2'], 
                        ['hokakisei1', 'hokakisei2', 'hokakisei3', 'hokakisei4'],
                         ['kobetsu1', 'kobetsu2', 'kobetsu3', 'kobetsu4']]):
    combine_train = []
    combine_test = []
    
    combine_col_name = combine_cols[i][0]
    _ = df_train.apply(lambda row: combine(row, combine_train, tup), axis=1)
    _ = df_test.apply(lambda row: combine(row, combine_test, tup), axis=1)

    count_vectorizer = CountVectorizer(min_df=combine_cols[i][1])
    combine_train_matrix = count_vectorizer.fit_transform(combine_train).todense()
    combine_test_matrix = count_vectorizer.transform(combine_test).todense()
    for i in range(combine_train_matrix.shape[1]):
        df_train['%s_%d' % (combine_col_name, i)] = combine_train_matrix[:, i]
        df_test['%s_%d' % (combine_col_name, i)] = combine_test_matrix[:, i]
    for col in tup:
        if col not in ['toshikuiki1']:
            df_train.drop(col, axis=1, inplace=True)
            df_test.drop(col, axis=1, inplace=True)

In [4]:
coordinates = pickle.load(open('./data/coordinates.bin', 'rb'))

for df in [df_train, df_test]:
    df['lat'] = df['jukyo'].apply(lambda j: coordinates[j]['results'][0]['geometry']['location']['lat'] if coordinates[j]['results'] else np.nan)
    df['lng'] = df['jukyo'].apply(lambda j: coordinates[j]['results'][0]['geometry']['location']['lng'] if coordinates[j]['results'] else np.nan)
    
def fill_city_name(name):
    if '市' not in name and '郡' not in name:
        name = '市' + name
    return name

def split_address(df):
    df['jukyo'] = df['jukyo'].str.replace(r'[ヶｹ]', 'ケ')
    df['jukyo'] = df['jukyo'].apply(fill_city_name)
    city_split = df['jukyo'].str.split(r'[市郡]', n=1, expand=True)
    df['city'] = city_split[0]
    street_split = city_split[1].str.split(r'[町区]', n=1, expand=True)
    df['street'] = street_split[0]
    df['address_detail'] = street_split[1].str.strip().str.replace('大字', '').replace('', None)
    return df

df_train = split_address(df_train)
df_test = split_address(df_test)

for df in [df_train, df_test]:
    df['station_name_prefix'] = df['rosen_nm1'].str.slice(stop=2)
    df['city_toshikuiki1'] = df['city'] + ' ' + df['toshikuiki1']
    df.drop('toshikuiki1', axis=1, inplace=True)
    
    for col in ['mseki_rd_hb', 'road3_fi', 'rosenka_hb', 'kempei2', 'road2_mg', 'kaoku_hb', 'bus_hon']:
        df[col].replace(0.0, np.nan, inplace=True)
    
for col in ['chiseki_kb_hb', 'magutchi']:
    col_mean = pd.concat([df_train[col], df_test[col]]).mean()
    df_train[col] = df_train[col].fillna(col_mean)
    df_test[col] = df_test[col].fillna(col_mean)

for col in ['chiseki_js_hb', 'tc_mseki_min_hb', 'chiseki_kb_hb', 'tc_mseki', 'magutchi']:
    binning = KBinsDiscretizer(n_bins=30, encode='ordinal', strategy='uniform')
    df_train[col+'_bins'] = binning.fit_transform(df_train[col].values.reshape(-1, 1))
    df_test[col+'_bins'] = binning.transform(df_test[col].values.reshape(-1, 1))

for df in [df_train, df_test]:
    df['tc_mseki_bins_road_st'] = df['tc_mseki_bins'].astype(str) + ' ' + df['road_st']
    
def make_percentage_features(numeric_feature, categorical_feature):
    global continue_features
    df_combined = pd.concat([df_train, df_test], sort=True).reset_index(drop=True)
    mean_label = '%s_to_mean_%s' % (numeric_feature, categorical_feature)
    df_combined[mean_label] = df_combined[numeric_feature] / df_combined.groupby([categorical_feature])[numeric_feature].transform('mean')
    df_train[mean_label] = df_combined[mean_label].iloc[:len(df_train)]
    df_test[mean_label] = df_combined[mean_label].iloc[len(df_train):].reset_index(drop=True)
    continue_features += [mean_label]
    
make_percentage_features('magutchi', 'eki_nm1')
make_percentage_features('tt_mseki', 'eki_nm1')
make_percentage_features('tc_mseki', 'eki_nm1')
make_percentage_features('niwasaki', 'eki_nm1')
make_percentage_features('rosenka_hb', 'eki_nm1')
make_percentage_features('magutchi', 'city')
make_percentage_features('tt_mseki', 'city')
make_percentage_features('tc_mseki', 'city')
make_percentage_features('niwasaki', 'city')
make_percentage_features('rosenka_hb', 'city')

In [5]:
splitter = KFold(n_splits=5, shuffle=True, random_state=28)
price_stats = []
for train_idx, valid_idx in splitter.split(df_train):
    price_stats_by_city = defaultdict(dict)
    for city, group in df_train.iloc[train_idx].groupby('city'):
        price_list = group['keiyaku_pr']/group['tc_mseki']
        price_stats_by_city[city]['price_by_city_mean'] = price_list.mean()
        price_stats_by_city[city]['price_by_city_median'] = price_list.median()
        price_stats_by_city[city]['price_by_city_min'] = price_list.min()
        price_stats_by_city[city]['price_by_city_max'] = price_list.max()
        price_stats_by_city[city]['price_by_city_std'] = price_list.std()
        price_stats_by_city[city]['price_by_city_count'] = len(price_list)
    for i, city in enumerate(df_train.iloc[valid_idx]['city']):
        price_stats.append((valid_idx[i], price_stats_by_city[city]))

price_stats_test = [] 
price_stats_by_city = defaultdict(dict)
for city, group in df_train.groupby('city'):
    price_list = group['keiyaku_pr']/group['tc_mseki']
    price_stats_by_city[city]['price_by_city_mean'] = price_list.mean()
    price_stats_by_city[city]['price_by_city_median'] = price_list.median()
    price_stats_by_city[city]['price_by_city_min'] = price_list.min()
    price_stats_by_city[city]['price_by_city_max'] = price_list.max()
    price_stats_by_city[city]['price_by_city_std'] = price_list.std()
    price_stats_by_city[city]['price_by_city_count'] = len(price_list)
for city in df_test['city']:
    price_stats_test.append(price_stats_by_city[city])
    
df_price_stats = pd.DataFrame([x[1] for x in sorted(price_stats, key=lambda x: x[0])])
df_price_stats_test = pd.DataFrame(price_stats_test)

df_train = pd.concat([df_train, df_price_stats], axis=1)
df_test = pd.concat([df_test, df_price_stats_test], axis=1)

## Train

In [6]:
continue_features += ['price_by_city_mean', 'price_by_city_median', 'price_by_city_min', 'price_by_city_max', 
                      'price_by_city_std', 'price_by_city_count', 'lng', 'lat']
objective = 'keiyaku_pr'
categorical_features = list(df_train)

for f in continue_features+[objective]:
    if f in categorical_features:
        categorical_features.remove(f)
        
for col in categorical_features:
    if col not in ['pj_no']:
        ct = CategoricalTransformer(min_freq=3)
        df_train[col] = ct.fit_transform(df_train[col])
        df_test[col] = ct.transform(df_test[col])
        
for col in continue_features:
    if col != 'keiyaku_pr':
        scaler = MinMaxScaler()
        df_train[col] = scaler.fit_transform(df_train[col].values.reshape(-1, 1))
        df_test[col] = scaler.transform(df_test[col].values.reshape(-1, 1))
        df_train[col] = df_train[col].fillna(df_train[col].mean())
        df_test[col] = df_test[col].fillna(df_test[col].mean())
    
df_test['keiyaku_pr'] = 0
continue_features.remove('keiyaku_pr')

## Ridge

In [7]:
from sklearn import linear_model

splitter = KFold(n_splits=5, shuffle=True, random_state=28)
prediction_list = []
best_scores = []
df_train.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)
for train_idx, valid_idx in splitter.split(df_train):
    train, valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    test = df_test[df_train.drop(objective, axis=1).columns]
    encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    train_cat_one_hot = encoder.fit_transform(train[categorical_features])
    valid_cat_one_hot = encoder.transform(valid[categorical_features])
    test_cat_one_hot = encoder.transform(test[categorical_features])
    X_train, y_train = hstack([train[continue_features].values, train_cat_one_hot]), np.log(train['keiyaku_pr']+1)
    X_valid, y_valid = hstack([valid[continue_features].values, valid_cat_one_hot]), np.log(valid['keiyaku_pr']+1)
    regressor = linear_model.Ridge(alpha=3, tol=0.0001, random_state=28)
    regressor.fit(X_train, y_train)
    pred_val = regressor.predict(X_valid)
    prediction_list.append(regressor.predict(hstack([test[continue_features].values, test_cat_one_hot])))
    best_scores.append(mean_squared_error(y_valid, pred_val))
    
print("5-fold cv mean l2 %.8f" % np.mean(best_scores))

df_submission = pd.read_csv('./data/sample_submit.tsv', sep='\t', names=['id', 'pred'])

df_submission['pred'] = np.exp(np.mean(prediction_list, axis=0))-1
df_submission.to_csv('submission_ridge.tsv', sep='\t', header=None, index=False)

# 0.01266222

5-fold cv mean l2 0.01266222


## HuberRegressor

In [8]:
splitter = KFold(n_splits=5, shuffle=True, random_state=28)
prediction_list = []
best_scores = []
for train_idx, valid_idx in splitter.split(df_train):
    train, valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    test = df_test[df_train.drop(objective, axis=1).columns]
    encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    train_cat_one_hot = encoder.fit_transform(train[categorical_features])
    valid_cat_one_hot = encoder.transform(valid[categorical_features])
    test_cat_one_hot = encoder.transform(test[categorical_features])
    X_train, y_train = hstack([train[continue_features].values, train_cat_one_hot]), np.log(train['keiyaku_pr']+1)
    X_valid, y_valid = hstack([valid[continue_features].values, valid_cat_one_hot]), np.log(valid['keiyaku_pr']+1)
    regressor = linear_model.HuberRegressor(max_iter=500, epsilon=1.2, alpha=0.00001)
    regressor.fit(X_train, y_train)
    pred_val = regressor.predict(X_valid)
    prediction_list.append(regressor.predict(hstack([test[continue_features].values, test_cat_one_hot])))
    best_scores.append(mean_squared_error(y_valid, pred_val))
    
print("5-fold cv mean l2 %.8f" % np.mean(best_scores))

df_submission = pd.read_csv('./data/sample_submit.tsv', sep='\t', names=['id', 'pred'])

df_submission['pred'] = np.exp(np.mean(prediction_list, axis=0))-1
df_submission.to_csv('submission_huber.tsv', sep='\t', header=None, index=False)

# 0.01252956

5-fold cv mean l2 0.01252956


## Linear svr

In [9]:
from sklearn import svm

splitter = KFold(n_splits=5, shuffle=True, random_state=28)
prediction_list = []
best_scores = []
for train_idx, valid_idx in splitter.split(df_train):
    train, valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    test = df_test[df_train.drop(objective, axis=1).columns]
    encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    train_cat_one_hot = encoder.fit_transform(train[categorical_features])
    valid_cat_one_hot = encoder.transform(valid[categorical_features])
    test_cat_one_hot = encoder.transform(test[categorical_features])
    X_train, y_train = hstack([train[continue_features].values, train_cat_one_hot]), np.log(train['keiyaku_pr']+1)
    X_valid, y_valid = hstack([valid[continue_features].values, valid_cat_one_hot]), np.log(valid['keiyaku_pr']+1)
    regressor = svm.LinearSVR(C=0.1, epsilon=0.01, intercept_scaling=1.5, random_state=28)
    regressor.fit(X_train, y_train)
    pred_val = regressor.predict(X_valid)
    prediction_list.append(regressor.predict(hstack([test[continue_features].values, test_cat_one_hot])))
    best_scores.append(mean_squared_error(y_valid, pred_val))
    
print("5-fold cv mean l2 %.8f" % np.mean(best_scores))

df_submission = pd.read_csv('./data/sample_submit.tsv', sep='\t', names=['id', 'pred'])

df_submission['pred'] = np.exp(np.mean(prediction_list, axis=0))-1
df_submission.to_csv('submission_linear_svr.tsv', sep='\t', header=None, index=False)

# 0.01226565



5-fold cv mean l2 0.01226565




## Merge

In [10]:
df_1 = pd.read_csv('./submission.tsv', sep='\t', names=['id', 'pred'])
df_2 = pd.read_csv('./submission_ridge.tsv', sep='\t', names=['id', 'pred'])
df_3 = pd.read_csv('./submission_huber.tsv', sep='\t', names=['id', 'pred'])
df_4 = pd.read_csv('./submission_linear_svr.tsv', sep='\t', names=['id', 'pred'])

df_1['pred'] = df_1['pred']*0.6 + ((df_2['pred']*0.25 + df_3['pred']*0.75)*0.4 + df_4['pred']*0.6)*0.4
df_1.to_csv('submission_merge.tsv', sep='\t', index=False, header=None) 