In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Read the training CSV (its first column becomes the index), then separate
# the last column (prediction target) from every column before it (features).
df = pd.read_csv('train.csv', index_col=0)
raw_data, target = df.iloc[:, :-1], df.iloc[:, -1]

In [3]:
def preprocess_data(data):
    """Turn a raw listings DataFrame into a numeric feature table.

    The input frame is copied first, so the caller's DataFrame is never
    modified and the function can safely be called again on the same input.

    Steps, in order:
      * map ``cancellation_policy`` and ``room_type`` to integer codes,
      * collapse rare ``property_type`` values into ``'Other'``,
      * one-hot encode ``bed_type``, ``city`` and ``property_type``,
      * convert ``host_response_rate`` from ``'NN%'`` strings to a 0-1 float
        (missing values become 0),
      * replace ``'t'`` / ``'f'`` with 1 / 0 across the whole frame,
      * fill remaining missing values (column mean, 0, or 50, see below),
      * drop the columns that are not used as features.

    data: DataFrame containing the columns referenced in the body
    returns: a new, preprocessed DataFrame
    """
    # Work on a copy: the column assignments below would otherwise overwrite
    # the caller's frame, and a second call would then map the already-encoded
    # integers to NaN.
    data = data.copy()

    # --- String-valued columns ---
    # Encode categorical strings as integer codes.
    data['cancellation_policy'] = data['cancellation_policy'].map(
        {'flexible': 0, 'moderate': 1, 'strict': 2, 'super_strict_30': 3, 'super_strict_60': 4})
    data['room_type'] = data['room_type'].map({'Shared room': 0, 'Private room': 1, 'Entire home/apt': 2})
    # Note: 'Lighthouse' appears only in test.csv.
    other_list = ['Cabin','Guest suite','Guesthouse', 'Other', 'Bungalow', 'Villa', 'Bed & Breakfast',
                  'Dorm', 'Timeshare', 'Camper/RV', 'Cave', 'Hostel', 'Earth House', 'In-law',
                  'Serviced apartment', 'Boat', 'Tent', 'Castle', 'Boutique hotel',
                  'Vacation home', 'Hut', 'Treehouse', 'Yurt', 'Chalet', 'Island', 'Tipi', 'Train',
                  'Parking Space', 'Casa particular', 'Lighthouse']
    data['property_type'] = data['property_type'].replace(other_list, 'Other')
    # One-hot encode the columns that are not given integer codes.
    data = pd.get_dummies(data, columns=['bed_type' , 'city', 'property_type'])
    # Convert host_response_rate ('NN%') to a float fraction; missing -> 0.
    data['host_response_rate'] = data['host_response_rate'].str.rstrip('%')
    data['host_response_rate'] = data['host_response_rate'].astype(float) / 100
    data['host_response_rate'] = data['host_response_rate'].replace(np.nan, 0)
    # Convert the true/false flag columns ('t' / 'f') to 1 / 0.
    # This replacement is applied to every column of the frame.
    data = data.replace({'t': 1, 'f': 0})

    # --- Missing values ---
    # bathrooms, bedrooms and beds are filled with the column mean of this frame.
    for column in ['bathrooms', 'bedrooms', 'beds']:
        data[column] = data[column].fillna(data[column].mean())
    # host_has_profile_pic and host_identity_verified are filled with 0.
    data['host_has_profile_pic'] = data['host_has_profile_pic'].fillna(0)
    data['host_identity_verified'] = data['host_identity_verified'].fillna(0)
    # Listings without a rating get a default score of 50 for now.
    data['review_scores_rating'] = data['review_scores_rating'].fillna(50)

    # Drop columns that look unrelated to the target. Amenities probably
    # matter, but are dropped for this first estimate.
    data = data.drop(['amenities', 'description', 'first_review', 'host_since', 'last_review', 'latitude',
                  'longitude', 'name', 'neighbourhood','thumbnail_url', 'zipcode'], axis=1)
    return data

In [4]:
# Build the numeric feature table from the raw training features and preview it.
data = preprocess_data(data=raw_data)
data.head()

Unnamed: 0_level_0,accommodates,bathrooms,bedrooms,beds,cancellation_policy,cleaning_fee,host_has_profile_pic,host_identity_verified,host_response_rate,instant_bookable,...,city_DC,city_LA,city_NYC,city_SF,property_type_Apartment,property_type_Condominium,property_type_House,property_type_Loft,property_type_Other,property_type_Townhouse
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,6,2.0,1.0,4.0,0,1,1.0,0.0,0.0,0,...,0,1,0,0,1,0,0,0,0,0
1,2,1.0,1.0,1.0,2,1,1.0,1.0,1.0,0,...,1,0,0,0,0,0,1,0,0,0
2,2,2.0,1.0,1.0,2,1,1.0,0.0,1.0,1,...,0,0,1,0,1,0,0,0,0,0
3,2,1.0,1.0,1.0,2,1,1.0,1.0,1.0,1,...,0,0,0,1,1,0,0,0,0,0
4,2,1.0,1.0,1.0,2,1,1.0,1.0,1.0,0,...,0,0,1,0,1,0,0,0,0,0


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Hold out part of the preprocessed training data for validation.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=0)

forest = RandomForestRegressor(n_estimators=100, random_state=0, max_depth=10)
gbrt = GradientBoostingRegressor(random_state=0, max_depth=5)

# Fit each ensemble in turn, then print its score() on both splits and the
# root of the mean squared error on the held-out split.
for label, estimator in (("RandomForest", forest), ("GradientBoosting", gbrt)):
    estimator.fit(X_train, y_train)
    print(label)
    print("Train score {}".format(estimator.score(X_train, y_train)))
    print("Test score {}".format(estimator.score(X_test, y_test)))
    holdout_mse = mean_squared_error(y_test, estimator.predict(X_test))
    print("RSME score: {}".format(np.sqrt(holdout_mse)))

RandomForest
Train score 0.6511476472313165
Test score 0.5303686600026316
RSME score: 117.6154675983588
GradientBoosting
Train score 0.6025970242272962
Test score 0.5360045866774883
RSME score: 116.9076007481279


In [7]:
# Load the file to predict on (its first column becomes the index) and preview it.
test_df = pd.read_csv(filepath_or_buffer='test.csv', index_col=0)
test_df.head()

Unnamed: 0_level_0,accommodates,amenities,bathrooms,bed_type,bedrooms,beds,cancellation_policy,city,cleaning_fee,description,...,latitude,longitude,name,neighbourhood,number_of_reviews,property_type,review_scores_rating,room_type,thumbnail_url,zipcode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,6,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",2.0,Real Bed,2.0,2.0,strict,Boston,t,Feel free to book INSTANTLY. You can check-in ...,...,42.359278,-71.069962,Gorgeous 2BR/2BA Duplex in Beacon Hill,Beacon Hill,58,House,90.0,Entire home/apt,https://a0.muscache.com/im/pictures/7e4808b4-5...,2114.0
1,3,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.0,Real Bed,1.0,1.0,moderate,LA,t,The guest house is close to: Equinox West Holl...,...,34.084747,-118.367355,Luxury 1 Bedroom West Hollywood City Center,West Hollywood,4,Guesthouse,100.0,Entire home/apt,https://a0.muscache.com/im/pictures/5392fbd6-6...,90046.0
2,2,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",1.0,Real Bed,0.0,1.0,flexible,NYC,f,Private room in a three bedroom apartment in N...,...,40.720541,-73.959192,Bedroom with Patio in Prime Williamsburg Locat...,Williamsburg,0,Apartment,,Private room,https://a0.muscache.com/im/pictures/544d3b89-d...,11249.0
3,4,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.0,Real Bed,1.0,2.0,strict,NYC,f,The apartment is located in historic Bed Stuy ...,...,40.681117,-73.944091,Cozy apartment in Brooklyn,Bedford-Stuyvesant,0,Apartment,,Entire home/apt,https://a0.muscache.com/im/pictures/26baf7ba-0...,11216.0
4,3,"{TV,Internet,""Wireless Internet"",""Air conditio...",1.5,Real Bed,1.0,2.0,strict,LA,t,"Our cozy, pet friendly one bedroom apartment/l...",...,34.150995,-118.409359,"Cozy, sunny, pet friendly loft/apt",,6,Loft,92.0,Entire home/apt,https://a0.muscache.com/im/pictures/86107545/9...,91604.0


In [8]:
# Run the test rows through the same preprocessing as the training rows and preview.
test_data = preprocess_data(data=test_df)
test_data.head()

Unnamed: 0_level_0,accommodates,bathrooms,bedrooms,beds,cancellation_policy,cleaning_fee,host_has_profile_pic,host_identity_verified,host_response_rate,instant_bookable,...,city_DC,city_LA,city_NYC,city_SF,property_type_Apartment,property_type_Condominium,property_type_House,property_type_Loft,property_type_Other,property_type_Townhouse
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,6,2.0,2.0,2.0,2,1,1.0,0.0,1.0,1,...,0,0,0,0,0,0,1,0,0,0
1,3,1.0,1.0,1.0,1,1,1.0,1.0,1.0,0,...,0,1,0,0,0,0,0,0,1,0
2,2,1.0,0.0,1.0,0,0,1.0,1.0,1.0,0,...,0,0,1,0,1,0,0,0,0,0
3,4,1.0,1.0,2.0,2,0,1.0,1.0,0.0,0,...,0,0,1,0,1,0,0,0,0,0
4,3,1.5,1.0,2.0,2,1,1.0,1.0,1.0,0,...,0,1,0,0,0,0,0,1,0,0


In [11]:
def predict_data(model, test, file_name):
    """Predict with a fitted model and write the result to a CSV file.

    Each output row is ``<index label>,<prediction>``; no header row is written.

    model: fitted ML model exposing ``predict``
    test: DataFrame of features; its index supplies the first CSV column
    file_name: file name for the submission
    """
    prediction = model.predict(test)
    # Index the predictions by the test index instead of stacking ids and
    # predictions into one transposed frame. Stacking coerces both to a common
    # dtype, so integer ids can end up written as floats (e.g. "10.0").
    predict_series = pd.Series(prediction, index=test.index)
    predict_series.to_csv(file_name, header=False)

In [12]:
# predict_data(forest, test_data, 'submit.csv')
# Provisional score: 158.75681667763246

In [13]:
# predict_data(gbrt, test_data, 'submit1.csv')
# Provisional score: 157.43192011676345

In [14]:
# Standardization
# NOTE(review): the scaler is fitted on every row of `data`, i.e. before the
# train/validation split, so held-out rows contribute to the scaling statistics.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(data)
scaled_data = scaler.transform(data)

In [15]:
# Split the standardized features; this rebinds X_train / X_test / y_train / y_test.
scaled_split = train_test_split(scaled_data, target, random_state=0)
X_train, X_test, y_train, y_test = scaled_split

In [16]:
# Try a neural network on the standardized features.
from sklearn.neural_network import MLPRegressor
# random_state is fixed, as for the other models and the splits in this
# notebook, so that weight initialisation and the reported scores are
# reproducible across runs.
mlp = MLPRegressor(max_iter=10000,
                   hidden_layer_sizes=(100,),
                   activation='relu',
                   solver='adam',
                   learning_rate_init=0.001,
                   random_state=0)
mlp.fit(X_train, y_train)
print("MLPRegressor")
print("Train score {}".format(mlp.score(X_train, y_train)))
print("Test score {}".format(mlp.score(X_test, y_test)))
print("RSME score: {}".format(np.sqrt(mean_squared_error(y_test, mlp.predict(X_test)))))

MLPRegressor
Train score 0.5714910658764536
Test score 0.5329005935535371
RSME score: 117.29798771009828


In [17]:
# Apply the scaler that was fitted on the training features to the test
# features, keeping the test frame's index on the result.
scaled_test_values = scaler.transform(test_data)
scaled_test_data = pd.DataFrame(scaled_test_values, index=test_data.index)

In [18]:
# Write the MLP predictions for the scaled test set to the submission file.
predict_data(model=mlp, test=scaled_test_data, file_name='submit3.csv')
# Provisional score: 156.94397553555822