# 104hackathon Recommendation Example

### 1. 使用pandas讀user_log.csv

In [1]:
import pandas as pd

In [2]:
# Load the '|'-separated interaction log. uid and jobNo are requested as
# strings so pandas does not parse the identifiers as numbers.
train_df = pd.read_csv(
    'user_log.csv',
    sep='|',
    dtype=dict(uid=str, jobNo=str),
)
train_df.head()

Unnamed: 0,uid,action,jobNo,invoice,dateTime,source,url,deviceType
0,240518173237,viewJob,7582959.0,69568009,2017-01-04 12:03:32,app,,1.0
1,1219770712231,viewCust,,15458455,2016-11-23 18:57:48,mobileWeb,m.104.com.tw/cust/73nkc7s,
2,712964574295,viewJob,6646654.0,52264848,2016-12-09 18:57:45,app,,1.0
3,292057776827,viewJob,7858416.0,86427715,2016-04-13 12:53:35,mobileWeb,m.104.com.tw/job/4oflc,
4,1039382087550,viewCust,,59504506,2017-02-20 22:30:27,web,www.104.com.tw/jobbank/custjob/index.php,


### 2. 濾掉viewCust的action，並計算每對（uid, jobNo, action）的數目

In [3]:
# Discard the 'viewCust' rows, then count how many times each
# (uid, jobNo, action) combination appears in the remaining log.
grouped_df = (
    train_df.loc[train_df['action'] != 'viewCust']
    .groupby(['uid', 'jobNo', 'action'])
    .size()
    .reset_index(name='count')
)
grouped_df.head()

Unnamed: 0,uid,jobNo,action,count
0,1005022347289,1003741,viewJob,1
1,1005022347289,1021902,applyJob,1
2,1005022347289,1021902,viewJob,1
3,1005022347289,1195525,applyJob,1
4,1005022347289,1195525,viewJob,2


### 3. 將action中的viewJob, saveJob, applyJob轉成獨立的column

In [4]:
# Spread the per-action counts into one column per action, indexed by
# (uid, jobNo). Pairs that have no rows for an action get a count of 0.
pivot_df = grouped_df.pivot_table(index=['uid', 'jobNo'], columns='action', values='count').fillna(0)
# Pin the column order to viewJob, saveJob, applyJob so later positional and
# by-name access is stable. reindex(columns=...) replaces DataFrame.reindex_axis,
# which was removed in pandas 1.0. fill_value=0.0 makes an action that never
# occurs in the log show up as a zero count instead of a NaN column.
pivot_df = pivot_df.reindex(columns=['viewJob', 'saveJob', 'applyJob'], fill_value=0.0)
pivot_df.head()

Unnamed: 0_level_0,action,viewJob,saveJob,applyJob
uid,jobNo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1005022347289,1003741,1.0,0.0,0.0
1005022347289,1021902,1.0,0.0,1.0
1005022347289,1195525,2.0,0.0,1.0
1005022347289,1599564,1.0,0.0,2.0
1005022347289,3182736,1.0,0.0,1.0


### 4. 將dataframe轉換成classification需要的X和y

In [5]:
# Features: the viewJob and saveJob counts of every (uid, jobNo) pair.
X = pivot_df[['viewJob', 'saveJob']].values
# Label: 1.0 when the pair has at least one applyJob, otherwise 0.0.
# A new array is built instead of assigning into a slice of pivot_df.values:
# that slice can be a view of the frame's data, so in-place assignment may
# overwrite the applyJob counts in pivot_df, and it fails on the read-only
# array returned when pandas Copy-on-Write is enabled.
y = (pivot_df['applyJob'].values >= 1).astype(float)
print('X:', X)
print('y:', y)

X: [[ 1.  0.]
 [ 1.  0.]
 [ 2.  0.]
 ..., 
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]
y: [ 0.  1.  1. ...,  0.  0.  0.]


### 5. 使用sklearn的logistic regression

In [6]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default hyper-parameters,
# trained to predict the apply label from the view/save counts.
model = LogisticRegression()
model.fit(X=X, y=y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### 6. 使用比賽當天拿到的testset，重複剛剛training set的動作，得到要預測的x_test

In [7]:
# Repeat the training-set preparation on the test log: read it, drop the
# 'viewCust' rows, count each (uid, jobNo, action) and pivot the actions
# into columns.
test_df = pd.read_csv('user_log_testset.csv', sep='|', dtype={'uid': str, 'jobNo': str})
grouped_test_df = test_df[test_df.action != 'viewCust'].groupby(['uid', 'jobNo', 'action']).size().reset_index(name='count')
pivot_test_df = grouped_test_df.pivot_table(index=['uid', 'jobNo'], columns='action', values='count').fillna(0)
# Keep only the two feature columns, in the same order the model was trained
# on. reindex(columns=...) replaces DataFrame.reindex_axis, which was removed
# in pandas 1.0. fill_value=0.0 covers a test log with no rows for one of the
# actions, which would otherwise put NaN into x_test and break predict().
pivot_test_df = pivot_test_df.reindex(columns=['viewJob', 'saveJob'], fill_value=0.0)
x_test = pivot_test_df.values

### 7. 使用model預測結果

In [8]:
# Predict the apply label for every row of the test feature matrix.
preds = model.predict(X=x_test)

### 8. 將預測結果與testset的dataframe合併得到結果

In [9]:
# Attach the predicted labels to the test frame as an 'applyJob' column.
pivot_test_df = pivot_test_df.assign(applyJob=preds)

In [10]:
# Build the submission frame: remove the two feature columns and store the
# predictions as integers in 'applyJob', keeping the (uid, jobNo) index.
preds_df = (
    pivot_test_df
    .drop(labels=['viewJob', 'saveJob'], axis=1)
    .assign(applyJob=preds.astype(int))
)

In [11]:
# Preview the first five predictions.
preds_df.head(n=5)

Unnamed: 0_level_0,action,applyJob
uid,jobNo,Unnamed: 2_level_1
1005022349260,5700211,0
1005022349260,6248819,0
1005022349260,6360974,0
1005022349260,6577740,0
1005022349260,6714349,0


In [12]:
# Write the predictions to preds.csv using '|' as the field separator.
preds_df.to_csv(path_or_buf='preds.csv', sep='|')