Skip to content

Commit

Permalink
Merge pull request #12 from XinyaoWa/main
Browse files Browse the repository at this point in the history
update TwitterRecsys2021 readme and remove hostname/hardcode
  • Loading branch information
Jian-Zhang committed Sep 7, 2022
2 parents a67a7a5 + f5c2ba5 commit 390c399
Show file tree
Hide file tree
Showing 16 changed files with 59 additions and 38 deletions.
53 changes: 37 additions & 16 deletions modelzoo/TwitterRecSys2021/README.md
Original file line number Diff line number Diff line change
@@ -1,24 +1,45 @@
# Intel Democratized Solutions for RecSys Challenge 2021
# Intel Optimized Solutions for RecSys Challenge 2021

# How to Use
# RecSys Challenge 2021
The challenge focuses on a real-world task of tweet engagement prediction in a dynamic environment. Please check the [website](https://recsys.acm.org/recsys21/challenge/) for details on how to get the dataset.

# Quick Start

## Prepare
1. Environment
* Spark 3.0
* Hadoop 3.2
* lightgbm 3.2.1
* XGBoost 1.3.3
* transformers 4.13.0
2. Prepare
* install pyrecdp
```bash
pip install pyrecdp
```
* Put the original dataset (train, valid, test, valid_split_index) at HDFS: /recsys2021/oridata
* Create two folders in HDFS to save data for stage1 and stage2: /recsys2021/datapre_stage1/, /recsys2021/datapre_stage2/
```
export path_to_e2eaiok_dataset=`pwd`/e2eaiok_dataset # put the downloaded dataset here
export path_to_e2eaiok=`pwd`/e2eAIOK
git clone https://github.com/intel/e2eAIOK.git
git submodule update --init --recursive
```

## Environment setup
You can choose to use AIOK docker or prepare the environment by yourself.

### Use AIOK Docker
```
cd ${path_to_e2eaiok}/Dockerfile-ubuntu18.04/
docker build -t e2eaiok-pytorch-spark . -f DockerfilePytorch
cd ${path_to_e2eaiok}
docker run --shm-size=10g -it --privileged --network host -v ${path_to_e2eaiok_dataset}:/home/vmagent/app/dataset -v `pwd`/:/home/vmagent/app/e2eaiok -w /home/vmagent/app/ e2eaiok-pytorch-spark /bin/bash
source /etc/profile.d/spark-env.sh
# Install Hadoop and update the settings
```

### Prepare environment by yourself
Install the following libs:
* Spark 3.0
* Hadoop 3.2
* lightgbm 3.2.1
* XGBoost 1.3.3
* transformers 4.13.0
* pyrecdp


## E2E Train
1. Preprocess train and valid dataset
1. Data Processing
* Put the original dataset (train, valid, test, valid_split_index) at HDFS: /recsys2021/oridata
* Create two folders in HDFS to save data for stage1 and stage2: /recsys2021/datapre_stage1/, /recsys2021/datapre_stage2/
* Preprocess train data:
``` bash
cd data_preprocess
Expand Down
4 changes: 2 additions & 2 deletions modelzoo/TwitterRecSys2021/data_preprocess/datapre.py
Original file line number Diff line number Diff line change
Expand Up @@ -1003,7 +1003,7 @@ def setup_standalone(path_prefix,current_path,dicts_folder):
scala_udf_jars = recdp_path + "/ScalaProcessUtils/target/recdp-scala-extensions-0.1.0-jar-with-dependencies.jar"

t0 = timer()
spark = SparkSession.builder.master(f'spark://vsr119:7077')\
spark = SparkSession.builder.master(f'spark://hostname:7077')\
.appName("Recsys2021_data_process")\
.config("spark.driver.memory", '30g')\
.config("spark.local.dir", "/home/vmagent/app/dataset/spark")\
Expand Down Expand Up @@ -1150,7 +1150,7 @@ def valid_stage2():

def inference_decoder():
############# set up
path_prefix = "hdfs://vsr119:9000/"
path_prefix = "hdfs://hostname:9000/"
current_path = "/recsys2021/datapre_stage1/"
original_folder = "/recsys2021/oridata/test/"
dicts_folder = "recsys_dicts/"
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/lgbm/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
pd.set_option('display.max_rows', 500)
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data"
model_save_path = f"{path}/models" ## model saving path
pred_save_path = f"{path}/result" ## prediction result saving path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
pd.set_option('display.max_rows', 500)
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data"
model_save_path = f"{path}/models" ## model saving path
pred_save_path = f"{path}/result" ## prediction result saving path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
pd.set_option('display.max_rows', 500)
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data"

distributed_nodes = 4
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/lgbm/train_merge12.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import time
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"

if __name__ == "__main__":
df1 = pd.read_parquet(f"{path}/data/stage2_train")
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/lgbm/train_stage1.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def compute_rce_fast(pred, gt):
strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data" ## train and valid data path
model_save_path = f"{path}/models" ## model saving path
pred_save_path = f"{path}/result" ## prediction result saving path
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/lgbm/train_stage2.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def compute_rce_fast(pred, gt):
strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data" ## train and valid data path
model_save_path = f"{path}/models" ## model saving path

Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/xgboost/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
pd.set_option('display.max_rows', 500)
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data"
model_save_path = f"{path}/models" ## model saving path
pred_save_path = f"{path}/result" ## prediction result saving path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
pd.set_option('display.max_rows', 500)
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data"
model_save_path = f"{path}/models" ## model saving path
pred_save_path = f"{path}/result" ## prediction result saving path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
pd.set_option('display.max_rows', 500)
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data"

distributed_nodes = 4
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/xgboost/train_merge12.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import time
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"

if __name__ == "__main__":
df1 = pd.read_parquet(f"{path}/data/stage2_train")
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/xgboost/train_stage1.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def compute_rce_fast(pred, gt):
strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data" ## train and valid data path
model_save_path = f"{path}/models" ## model saving path
pred_save_path = f"{path}/result" ## prediction result saving path
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/xgboost/train_stage2.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def compute_rce_fast(pred, gt):
strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data" ## train and valid data path
model_save_path = f"{path}/models" ## model saving path

Expand Down
6 changes: 3 additions & 3 deletions modelzoo/TwitterRecSys2021/model_e2eaiok/xgboost/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@ def parse_args(args):
help='training label names, can be reply, retweet, retweet_with_comment or like')
parser.add_argument('--train_data_path',
type=str,
default="/home/vmagent/app/dataset/xinyao/TwitterRecSys2021-intel-opt/TwitterRecSys2021Dataset/stage1/train/train",
default="/datapath/stage1/train/train",
help='path to training data')
parser.add_argument('--valid_data_path',
type=str,
default="/home/vmagent/app/dataset/xinyao/TwitterRecSys2021-intel-opt/TwitterRecSys2021Dataset/stage1/valid/valid",
default="/datapath/stage1/valid/valid",
help='path to validation data')
parser.add_argument('--model_save_path',
type=str,
default="/home/vmagent/app/dataset/xinyao/TwitterRecSys2021-intel-opt/models",
default="/datapath/models",
help='path for model and result saving')
parser.add_argument('--max_depth',
type=int,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,19 @@ def parse_args(args):
parser = argparse.ArgumentParser()
parser.add_argument('--data_path',
type=str,
default="/home/vmagent/app/dataset/xinyao/TwitterRecSys2021-intel-opt/TwitterRecSys2021Dataset")
default="")
parser.add_argument('--reply_pred_path',
type=str,
default="/home/vmagent/app/hydro.ai/result/twitter_recsys/20211217_055149/2f9dd8a6fe67fe190b0c1e015c6f60d5/xgboost_pred_stage1_reply.csv")
default="")
parser.add_argument('--retweet_pred_path',
type=str,
default="/home/vmagent/app/hydro.ai/result/twitter_recsys/20211217_055240/d71e647911eadd50fed5693c8e02b436/xgboost_pred_stage1_retweet.csv")
default="")
parser.add_argument('--retweet_with_comment_pred_path',
type=str,
default="/home/vmagent/app/hydro.ai/result/twitter_recsys/20211217_055303/d5aee6594f7c76ec0dc0b3e5ca1aaaa8/xgboost_pred_stage1_retweet_with_comment.csv")
default="")
parser.add_argument('--like_pred_path',
type=str,
default="/home/vmagent/app/hydro.ai/result/twitter_recsys/20211217_055325/0eb3fe9e620acc16459bad6c08c7a7e1/xgboost_pred_stage1_like.csv")
default="")
return parser.parse_args(args)

if __name__ == "__main__":
Expand Down

0 comments on commit 390c399

Please sign in to comment.