# Recommendation Training Set Example

### 1. 使用pandas讀user_log.csv

In [1]:
import pandas as pd

In [2]:
# Read the pipe-delimited user log; uid and jobNo are kept as strings so they
# are handled as identifiers rather than parsed as integers.
id_column_types = {'uid': str, 'jobNo': str}
train_df = pd.read_csv(
    'user_log.csv',
    sep='|',
    dtype=id_column_types,
)
train_df.head()

Unnamed: 0,uid,action,jobNo,source
0,1245540518330,viewJob,8338295,app
1,1245540518330,viewJob,8338295,app
2,1245540518330,applyJob,8338295,app
3,1245540518330,viewJob,5117491,app
4,1245540518330,applyJob,5117491,app


### 2. 濾掉viewCust的action，並計算每組（uid, jobNo, action, source）的數目

In [3]:
# Keep every event except viewCust, then count how many rows share the same
# (uid, jobNo, action, source) combination.
is_job_event = train_df['action'] != 'viewCust'
grouped_df = (
    train_df[is_job_event]
    .groupby(['uid', 'jobNo', 'action', 'source'])
    .size()
    .reset_index(name='count')
)
grouped_df.head()

Unnamed: 0,uid,jobNo,action,source,count
0,10,4518869,viewJob,app,2
1,10,4716652,viewJob,app,5
2,10,6160395,viewJob,app,1
3,10,6335818,viewJob,app,1
4,10,6338116,saveJob,app,1


### 3. 將action及source中的viewJob, saveJob, applyJob, app, mobileWeb, web轉成獨立的column

In [4]:
# One row per (uid, jobNo) and one column per (action, source) holding the
# event count; combinations that never occurred for a pair become 0.
pivot_df = grouped_df.pivot_table(index=['uid', 'jobNo'], columns=['action', 'source'], values='count')

# Pin the column layout explicitly. pivot_table only emits (action, source)
# combinations that actually occur in the data, and DataFrame.reindex_axis has
# been removed from pandas, so reindex against the full expected grid instead.
# Missing combinations are added as all-zero columns, which guarantees the
# nine-column applyJob / saveJob / viewJob x app / web / mobileWeb layout.
expected_columns = pd.MultiIndex.from_product(
    [['applyJob', 'saveJob', 'viewJob'], ['app', 'web', 'mobileWeb']],
    names=['action', 'source'],
)
pivot_df = pivot_df.reindex(columns=expected_columns).fillna(0)
pivot_df.head()

Unnamed: 0_level_0,action,applyJob,applyJob,applyJob,saveJob,saveJob,saveJob,viewJob,viewJob,viewJob
Unnamed: 0_level_1,source,app,web,mobileWeb,app,web,mobileWeb,app,web,mobileWeb
uid,jobNo,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
10,4518869,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
10,4716652,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
10,6160395,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10,6335818,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10,6338116,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### 4. 將dataframe轉換成classification需要的X和y（X：saveJob、viewJob在各source的次數；y：是否曾applyJob）

In [5]:
# Features: every count column except the applyJob ones. Dropping by the
# outer (action) column level instead of slicing by position keeps X correct
# even if the column order of pivot_df changes.
X = pivot_df.drop('applyJob', axis=1, level=0).values

# Label: 1.0 when the (uid, jobNo) pair has at least one applyJob event on any
# source, 0.0 otherwise. A missing applyJob group raises KeyError here rather
# than silently mislabelling rows.
apply_counts = pivot_df['applyJob'].values
y = (apply_counts.sum(axis=1) >= 1).astype(float)

print('X:', X)
print('y:', y)

X: [[ 0.  0.  0.  2.  0.  0.]
 [ 0.  0.  0.  5.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.]
 ..., 
 [ 0.  0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  1.  0.]
 [ 0.  0.  0.  1.  0.  0.]]
y: [ 0.  0.  0. ...,  0.  0.  0.]


### 5. 使用stratified sampling分割training set，試跑sklearn的logistic regression的f1 score

In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Three stratified folds: each split is built from the label vector y.
skf = StratifiedKFold(n_splits=3)
for fold_train, fold_test in skf.split(X, y):
    # Fit a fresh default logistic regression on this fold's training rows.
    model = LogisticRegression()
    model.fit(X[fold_train], y[fold_train])
    # Report the F1 score on the rows held out for this fold.
    fold_f1 = f1_score(y[fold_test], model.predict(X[fold_test]))
    print(fold_f1)

0.304428379139
0.295545122722
0.304806104072
