In [33]:
from __future__ import division
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction import FeatureHasher

In [108]:
df_train = pd.read_csv('./train.csv')
y_train = df_train['revenue']
df_train.drop('revenue', axis=1, inplace=True)

In [76]:
hasher = FeatureHasher(n_features=100, input_type="string")
city_hash = hasher.transform(df_train.City)
def n_nonzero_columns(X):
    """Returns the number of non-zero columns in a CSR matrix X."""
    return len(np.unique(X.nonzero()[1]))
n_nonzero_columns(city_hash)

30

In [112]:
def extract_feature(df):
    """
    TODO Open Dateの正規化
    """
    df_ret = df.drop(['Id', 'City', 'City Group', 'Type', 'Open Date'], axis=1)
    open_date = df['Open Date'].apply(lambda m: int(m[6:] + m[0:2]))
    hasher = FeatureHasher(n_features=100, input_type="string")
    city_hash = hasher.transform(df.City)
    df_city_hash = pd.DataFrame(
        data=city_hash.todense(),
        index=df.index,
        columns=['city_' + str(c) for c in range(city_hash.shape[1])]
    )
    city_group_bin = pd.get_dummies(df['City Group'])
    type_bin = pd.get_dummies(df['Type'])
    df_ret = df_ret.join(df_city_hash)
    df_ret = df_ret.join(city_group_bin)
    df_ret = df_ret.join(type_bin)
    df_ret = df_ret.join(open_date)
    if 'MB' in df_ret.keys():
        df_ret.drop('MB', axis=1, inplace=True)
    return df_ret

In [113]:
X_train = extract_feature(df_train)

In [114]:
def cross_val(X, y, K, random_state=0, clf=None):
    """
    TODO Leave-one-outためす
    """
    cv = KFold(len(y), K, shuffle=True, random_state=random_state)
    mse = cross_val_score(clf, X=X, y=y, cv=cv, n_jobs=1, scoring='mean_squared_error')
    rmse = np.sqrt(-1 * mse)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(rmse.mean(), rmse.std()))
    return rmse

In [80]:
def grid_search_logi(X_train, y_train):
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.80, random_state=19)
    
    test_parameters = [
        {'penalty': ['l1'], 'C': [1000]},
        {'penalty': ['l2'], 'C': [1, 10, 100, 1000]}

    ]
    cv = KFold(len(y_train), 20, shuffle=True, random_state=19)
    clf = GridSearchCV(
        LogisticRegression(),
        test_parameters,
        cv=cv,
        scoring='mean_squared_error',
        n_jobs=-1
    )
    clf.fit(X_train, y_train)
    print clf.best_estimator_
    
    print"\n+ トレーニングデータでCVした時の平均スコア:\n"
    for params, mean_score, all_scores in clf.grid_scores_:
        rmse = np.sqrt(-1 * all_scores)
        print "{:.3f} (+/- {:.3f}) for {}".format(rmse.mean(), rmse.std() / 2, params)

    print "\n+ テストデータでの識別結果:\n"
    y_pred = clf.predict(X_val)
    print calc_rmse(y_val, y_pred)
    return clf

In [81]:
def grid_search_svc(X_train, y_train):
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.80, random_state=19)
    
    test_parameters = [
        {'kernel': ['rbf'],    'C': [1000], 'gamma': [1e-3, 1e-4]},
        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
    ]
    cv = KFold(len(y_train), 20, shuffle=True, random_state=19)
    clf = GridSearchCV(
        SVC(C=1),
        test_parameters,
        cv=cv,
        scoring='mean_squared_error',
        n_jobs=-1
    )
    clf.fit(X_train, y_train)
    print clf.best_estimator_
    
    print"\n+ トレーニングデータでCVした時の平均スコア:\n"
    for params, mean_score, all_scores in clf.grid_scores_:
        rmse = np.sqrt(-1 * all_scores)
        print "{:.3f} (+/- {:.3f}) for {}".format(rmse.mean(), rmse.std() / 2, params)

    print "\n+ テストデータでの識別結果:\n"
    y_pred = clf.predict(X_val)
    print calc_rmse(y_val, y_pred)
    return clf

In [82]:
def calc_rmse(predicted, test):
    return np.sqrt(np.sum((predicted - test)**2)/len(predicted))

In [10]:
def grid_search_random_forest(X_train, y_train):
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.80, random_state=19)

    test_parameters = {
        'n_estimators': [100, 1000, 10000],
#        'max_depth': [5, 6, 7, 8],
#        'min_samples_leaf': [1, 2, 3],
#        'max_features': np.rint(np.linspace(sqrtfeat, sqrtfeat+2, 3)).astype(int),
#        'min_samples_split': np.rint(np.linspace(X_train.shape[0]*.01, X_train.shape[0]*.05, 3)).astype(int)
    }
    cv = KFold(len(y_train), 20, shuffle=True, random_state=19)
    clf = GridSearchCV(
        RandomForestClassifier(oob_score=True, n_estimators=10000, n_jobs=-1),
        test_parameters,
        cv=cv,
        scoring='mean_squared_error',
        n_jobs=-1
    )
    clf.fit(X_train, y_train)
    print clf.best_estimator_
    
    print"\n+ トレーニングデータでCVした時の平均スコア:\n"
    for params, mean_score, all_scores in clf.grid_scores_:
        rmse = np.sqrt(-1 * all_scores)
        print "{:.3f} (+/- {:.3f}) for {}".format(rmse.mean(), rmse.std() / 2, params)

    print "\n+ テストデータでの識別結果:\n"
    y_pred = clf.predict(X_val)
    print calc_rmse(y_val, y_pred)
    return clf

In [11]:
#best_random_forest_clf = grid_search_random_forest(X_train, y_train)
#print "============================================"
#best_svc_clf = grid_search_svc(X_train, y_train)
#print "============================================"
#best_logistic_clf = grid_search_logi(X_train, y_train)

In [12]:
print "Logistic Regression"
cross_val(X_train, y_train, 20, clf=LogisticRegression(penalty='l2', tol=0.01))
print "Random forest"
cross_val(X_train, y_train, 20, clf=RandomForestClassifier())
print "Random forest Best"
cross_val(X_train, y_train, 20, clf=RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=6, max_features=8,
            max_leaf_nodes=None, min_density=None, min_samples_leaf=3,
            min_samples_split=1, n_estimators=1000, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0))

Logistic Regression
Mean Score: 3420556.002 (+/-1447058.116)
Random forest
Mean Score: 3213026.912 (+/-1659911.946)
Random forest Best
Mean Score: 3272630.771 (+/-1559315.043)


array([ 5069466.83271694,  2794968.79422682,  1753282.97382453,
        4800957.09391391,  1918856.02287371,  5256764.10361441,
        3517289.80343251,  1269551.45283527,  2279513.66873588,
         868597.9598495 ,  2617921.47582055,  6342910.72941198,
        5092976.38704662,  2924595.48113325,  4231564.41111749,
        1571435.33261674,  1968512.45069124,  2759480.96006402,
        2960548.76548071,  5453420.71523357])

In [115]:
df_test = pd.read_csv('./test.csv')

In [116]:
X_test = extract_feature(df_test)

In [117]:
clf = RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=6, max_features=8,
            max_leaf_nodes=None, min_density=None, min_samples_leaf=3,
            min_samples_split=1, n_estimators=1000,
            oob_score=True, random_state=None, verbose=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=6, max_features=8,
            max_leaf_nodes=None, min_density=None, min_samples_leaf=3,
            min_samples_split=1, n_estimators=1000, n_jobs=1,
            oob_score=True, random_state=None, verbose=0)

In [119]:
y_test = clf.predict(X_test)

In [133]:
df_test['Prediction'] = y_test

In [134]:
df_submit = df_test[['Id', 'Prediction']]

In [135]:
df_submit.to_csv('./submit_20150505_01.csv', index=False)