# Add grid search

素性

- course_id
- period (enrollしてからコースの終了までの日数)
- access回数
- discussion回数
- navigate回数
- page_close回数

In [1]:
from datetime import datetime

from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
courses = pd.read_csv('./data/date.csv', index_col='course_id')
courses['to_date'] = courses['to'].apply(lambda d:datetime.strptime(d, '%Y-%m-%d'))
courses['from_date'] = courses['from'].apply(lambda d:datetime.strptime(d, '%Y-%m-%d'))

In [3]:
modules = pd.read_csv('./data/object.csv')

In [4]:
train_enroll_df = pd.read_csv('./data/train/enrollment_train.csv', index_col='enrollment_id')
train_log_df = pd.read_csv('./data/train/log_train.csv')

In [5]:
#modules.groupby(['course_id', 'category']).category.count()

In [6]:
train_y = pd.read_csv('./data/train/truth_train.csv', header=None, names=('enrollment_id', 'dropout'), index_col='enrollment_id')

In [7]:
def extract_features(enroll_df, log_df):
    # enroll毎のlogの最初の時刻をenroll日時とする
    enroll_date = log_df.groupby('enrollment_id').time.first().apply(lambda d:datetime.strptime(d, '%Y-%m-%dT%H:%M:%S'))
    enroll_df['enroll_date'] = enroll_date
    
    # enrollしてからコースの終了までの期間
    tmp = enroll_df.join(courses['to_date'], on='course_id')
    enroll_df['period'] = (tmp['to_date'] - tmp['enroll_date']).apply(lambda d:d/np.timedelta64(1, 'D'))
    
    tmp = log_df.groupby(['enrollment_id', 'event']).event.count()
    events = pd.DataFrame(tmp).unstack().event.fillna(0)
    
    x = enroll_df.join(events)
    x = x[['period', 'access', 'discussion', 'navigate', 'page_close', 'problem', 'video', 'wiki']]
    d = pd.get_dummies(enroll_df.course_id)
    return x.join(d)

In [8]:
X_train = extract_features(train_enroll_df, train_log_df)
y_train = train_y.dropout

In [9]:
def cross_val(X, y, K, random_state=0, clf=None):
    cv = KFold(len(y), K, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=cv)
    print('Scores:', scores)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))
    return scores

In [10]:
def grid_search_logi(X_train, y_train):
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.83, random_state=19)
    
    test_parameters = [
        {'penalty': ['l1', 'l2'], 'C': [1, 0.1, 0.01]},
    ]
    clf = GridSearchCV(
        LogisticRegression(),
        test_parameters,
        cv=20,
        scoring='accuracy',
        n_jobs=-1
    )
    clf.fit(X_train, y_train)
    print clf.best_estimator_
    
    print"\n+ トレーニングデータでCVした時の平均スコア:\n"
    for params, mean_score, all_scores in clf.grid_scores_:
        print "{:.3f} (+/- {:.3f}) for {}".format(mean_score, all_scores.std() / 2, params)

    print "\n+ テストデータでの識別結果:\n"
    y_true, y_pred = y_val, clf.predict(X_val)
    print classification_report(y_true, y_pred)
    return clf

In [11]:
def calc_classifier(enroll_df, log_df, y_train, clf=None):
    X_train = extract_features(enroll_df, log_df)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.9, random_state=19)
    print('Num of Training Samples: {}'.format(len(X_train)))
    print('Num of Validation Samples: {}'.format(len(X_val)))
    
    if clf is None:
        clf = get_classifier()
    clf.fit(X_train, y_train)
    y_train_pred = clf.predict(X_train)
    y_val_pred = clf.predict(X_val)
    print('Accuracy on Training Set: {:.3f}'.format(accuracy_score(y_train, y_train_pred)))
    print('Accuracy on Validation Set: {:.3f}'.format(accuracy_score(y_val, y_val_pred)))
    cm = confusion_matrix(y_val, y_val_pred)
    return clf

In [None]:
clf = grid_search_logi(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l1', random_state=None, tol=0.0001)

+ トレーニングデータでCVした時の平均スコア:

0.855 (+/- 0.002) for {'penalty': 'l1', 'C': 1}
0.855 (+/- 0.002) for {'penalty': 'l2', 'C': 1}
0.855 (+/- 0.002) for {'penalty': 'l1', 'C': 0.1}
0.855 (+/- 0.002) for {'penalty': 'l2', 'C': 0.1}
0.853 (+/- 0.002) for {'penalty': 'l1', 'C': 0.01}
0.854 (+/- 0.002) for {'penalty': 'l2', 'C': 0.01}

+ テストデータでの識別結果:

             precision    recall  f1-score   support

          0       0.79      0.43      0.56      4310
          1       0.86      0.97      0.91     16183

avg / total       0.85      0.86      0.84     20493



In [None]:
cross_val(X_train, y_train, 5, clf=clf)

In [None]:
#clf = calc_classifier(train_enroll_df, train_log_df, y_train, clf=LogisticRegression(C=0.5, penalty='l2', tol=0.01))

In [14]:
test_enroll_df = pd.read_csv('./data/test/enrollment_test.csv', index_col='enrollment_id')
test_log_df = pd.read_csv('./data/test/log_test.csv')

In [15]:
X_test = extract_features(test_enroll_df, test_log_df)

In [16]:
X_test.head()

Unnamed: 0_level_0,period,access,discussion,navigate,page_close,problem,video,wiki,1pvLqtotBsKv7QSOsLicJDQMHx3lui6d,3VkHkmOtom3jM2wCu94xgzzu1d6Dn7or,...,bWdj2GDclj5ofokWjzoa5jAwMkxCykd6,fbPkOYLVPtPgIt0MxizjfFJov3JbHyAi,gvEwgd64UX4t3K7ftZwXiMkFuxFUAqQE,mTmmr5zd8l4wXhwiULwjSmSbi9ktcFmV,nSfGxfEtzw5G72fVbfaowxsV46Pg1xIc,q6A6QG7qMpyNcznyT2XaIxnfNGkZRxXl,shM3Yy9vxHn2aqjSYfQXOcwGo0hWh3MI,tXbz2ZYaRyb2ZsWUBPoYzAmisOhHQrYl,xMd9DzNyUCTLRPVbwWVzf4vq06oqrTT1,ykoe1cCWK134BJmfbNoPEenJOIWdtQOZ
enrollment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,27.588322,239,324,87,143,20,50,12,0,0,...,0,0,0,0,0,0,0,0,0,0
8,26.552627,127,7,20,90,50,58,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10,8.997975,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,27.959595,164,27,60,20,21,4,2,0,0,...,0,0,0,0,0,0,0,0,0,0
15,13.934248,30,0,11,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
y_test = clf.predict_proba(X_test)

In [26]:
y_test[:, 1]

array([  1.62787968e-04,   8.48508844e-02,   8.80126002e-01, ...,
         9.18909102e-01,   9.18836347e-01,   9.11338853e-01])

In [27]:
test_enroll_df['predicted'] = y_test[:, 1]

In [28]:
test_enroll_df['predicted'].to_csv('./submit_20150702_01_hagino3000.csv', index=True)