Skip to content

Commit

Permalink
Merge pull request #12 from XinyaoWa/main
Browse files Browse the repository at this point in the history
update TwitterRecsys2021 readme and remove hostname/hardcode
  • Loading branch information
Jian-Zhang committed Sep 7, 2022
2 parents a67a7a5 + f5c2ba5 commit 390c399
Show file tree
Hide file tree
Showing 16 changed files with 59 additions and 38 deletions.
53 changes: 37 additions & 16 deletions modelzoo/TwitterRecSys2021/README.md
Original file line number Diff line number Diff line change
@@ -1,24 +1,45 @@
# Intel Democratized Solutions for RecSys Challenge 2021
# Intel Optimized Solutions for RecSys Challenge 2021

# How to Use
# RecSys Challenge 2021
The challenge focuses on a real-world task of tweet engagement prediction in a dynamic environment. Please check the [website](https://recsys.acm.org/recsys21/challenge/) for details on how to get the dataset.

# Quick Start

## Prepare
1. Environment
* Spark 3.0
* Hadoop 3.2
* lightgbm 3.2.1
* XGBoost 1.3.3
* transformers 4.13.0
2. Prepare
* install pyrecdp
```bash
pip install pyrecdp
```
* Put the original dataset (train, valid, test, valid_split_index) at HDFS: /recsys2021/oridata
* Create two folders in HDFS to save data for stage1 and stage2: /recsys2021/datapre_stage1/, /recsys2021/datapre_stage2/
```
export path_to_e2eaiok_dataset=`pwd`/e2eaiok_dataset # put the downloaded dataset here
export path_to_e2eaiok=`pwd`/e2eAIOK
git clone https://github.com/intel/e2eAIOK.git
git submodule update --init --recursive
```

## Environment setup
You can choose to use AIOK docker or prepare the environment by yourself.

### Use AIOK Docker
```
cd ${path_to_e2eaiok}/Dockerfile-ubuntu18.04/
docker build -t e2eaiok-pytorch-spark . -f DockerfilePytorch
cd ${path_to_e2eaiok}
docker run --shm-size=10g -it --privileged --network host -v ${path_to_e2eaiok_dataset}:/home/vmagent/app/dataset -v `pwd`/:/home/vmagent/app/e2eaiok -w /home/vmagent/app/ e2eaiok-pytorch-spark /bin/bash
source /etc/profile.d/spark-env.sh
# Install Hadoop and update the settings
```

### Prepare environment by yourself
Install the following libs:
* Spark 3.0
* Hadoop 3.2
* lightgbm 3.2.1
* XGBoost 1.3.3
* transformers 4.13.0
* pyrecdp


## E2E Train
1. Preprocess train and valid dataset
1. Data Processing
* Put the original dataset (train, valid, test, valid_split_index) at HDFS: /recsys2021/oridata
* Create two folders in HDFS to save data for stage1 and stage2: /recsys2021/datapre_stage1/, /recsys2021/datapre_stage2/
* Preprocess train data:
``` bash
cd data_preprocess
Expand Down
4 changes: 2 additions & 2 deletions modelzoo/TwitterRecSys2021/data_preprocess/datapre.py
Original file line number Diff line number Diff line change
Expand Up @@ -1003,7 +1003,7 @@ def setup_standalone(path_prefix,current_path,dicts_folder):
scala_udf_jars = recdp_path + "/ScalaProcessUtils/target/recdp-scala-extensions-0.1.0-jar-with-dependencies.jar"

t0 = timer()
spark = SparkSession.builder.master(f'spark://vsr119:7077')\
spark = SparkSession.builder.master(f'spark://hostname:7077')\
.appName("Recsys2021_data_process")\
.config("spark.driver.memory", '30g')\
.config("spark.local.dir", "/home/vmagent/app/dataset/spark")\
Expand Down Expand Up @@ -1150,7 +1150,7 @@ def valid_stage2():

def inference_decoder():
############# set up
path_prefix = "hdfs://vsr119:9000/"
path_prefix = "hdfs://hostname:9000/"
current_path = "/recsys2021/datapre_stage1/"
original_folder = "/recsys2021/oridata/test/"
dicts_folder = "recsys_dicts/"
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/lgbm/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
pd.set_option('display.max_rows', 500)
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data"
model_save_path = f"{path}/models" ## model saving path
pred_save_path = f"{path}/result" ## prediction result saving path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
pd.set_option('display.max_rows', 500)
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data"
model_save_path = f"{path}/models" ## model saving path
pred_save_path = f"{path}/result" ## prediction result saving path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
pd.set_option('display.max_rows', 500)
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data"

distributed_nodes = 4
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/lgbm/train_merge12.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import time
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"

if __name__ == "__main__":
df1 = pd.read_parquet(f"{path}/data/stage2_train")
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/lgbm/train_stage1.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def compute_rce_fast(pred, gt):
strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data" ## train and valid data path
model_save_path = f"{path}/models" ## model saving path
pred_save_path = f"{path}/result" ## prediction result saving path
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/lgbm/train_stage2.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def compute_rce_fast(pred, gt):
strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data" ## train and valid data path
model_save_path = f"{path}/models" ## model saving path

Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/xgboost/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
pd.set_option('display.max_rows', 500)
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data"
model_save_path = f"{path}/models" ## model saving path
pred_save_path = f"{path}/result" ## prediction result saving path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
pd.set_option('display.max_rows', 500)
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data"
model_save_path = f"{path}/models" ## model saving path
pred_save_path = f"{path}/result" ## prediction result saving path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
pd.set_option('display.max_rows', 500)
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data"

distributed_nodes = 4
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/xgboost/train_merge12.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import time
very_start = time.time()

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"

if __name__ == "__main__":
df1 = pd.read_parquet(f"{path}/data/stage2_train")
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/xgboost/train_stage1.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def compute_rce_fast(pred, gt):
strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data" ## train and valid data path
model_save_path = f"{path}/models" ## model saving path
pred_save_path = f"{path}/result" ## prediction result saving path
Expand Down
2 changes: 1 addition & 1 deletion modelzoo/TwitterRecSys2021/model/xgboost/train_stage2.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def compute_rce_fast(pred, gt):
strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

path = "/mnt/sdb/xinyao/2optimize/nvidia2021/3mergeall/recsys2021-intel-opt"
path = "/path/to/processed/data"
data_path = f"{path}/data" ## train and valid data path
model_save_path = f"{path}/models" ## model saving path

Expand Down
6 changes: 3 additions & 3 deletions modelzoo/TwitterRecSys2021/model_e2eaiok/xgboost/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@ def parse_args(args):
help='training label names, can be reply, retweet, retweet_with_comment or like')
parser.add_argument('--train_data_path',
type=str,
default="/home/vmagent/app/dataset/xinyao/TwitterRecSys2021-intel-opt/TwitterRecSys2021Dataset/stage1/train/train",
default="/datapath/stage1/train/train",
help='path to training data')
parser.add_argument('--valid_data_path',
type=str,
default="/home/vmagent/app/dataset/xinyao/TwitterRecSys2021-intel-opt/TwitterRecSys2021Dataset/stage1/valid/valid",
default="/datapath/stage1/valid/valid",
help='path to validation data')
parser.add_argument('--model_save_path',
type=str,
default="/home/vmagent/app/dataset/xinyao/TwitterRecSys2021-intel-opt/models",
default="/datapath/models",
help='path for model and result saving')
parser.add_argument('--max_depth',
type=int,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,19 @@ def parse_args(args):
parser = argparse.ArgumentParser()
parser.add_argument('--data_path',
type=str,
default="/home/vmagent/app/dataset/xinyao/TwitterRecSys2021-intel-opt/TwitterRecSys2021Dataset")
default="")
parser.add_argument('--reply_pred_path',
type=str,
default="/home/vmagent/app/hydro.ai/result/twitter_recsys/20211217_055149/2f9dd8a6fe67fe190b0c1e015c6f60d5/xgboost_pred_stage1_reply.csv")
default="")
parser.add_argument('--retweet_pred_path',
type=str,
default="/home/vmagent/app/hydro.ai/result/twitter_recsys/20211217_055240/d71e647911eadd50fed5693c8e02b436/xgboost_pred_stage1_retweet.csv")
default="")
parser.add_argument('--retweet_with_comment_pred_path',
type=str,
default="/home/vmagent/app/hydro.ai/result/twitter_recsys/20211217_055303/d5aee6594f7c76ec0dc0b3e5ca1aaaa8/xgboost_pred_stage1_retweet_with_comment.csv")
default="")
parser.add_argument('--like_pred_path',
type=str,
default="/home/vmagent/app/hydro.ai/result/twitter_recsys/20211217_055325/0eb3fe9e620acc16459bad6c08c7a7e1/xgboost_pred_stage1_like.csv")
default="")
return parser.parse_args(args)

if __name__ == "__main__":
Expand Down

0 comments on commit 390c399

Please sign in to comment.