In [22]:
import pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

from operator import add
import sys,os
from pyspark.sql.types import *


class Config:
    """Connection settings for Elasticsearch and HDFS plus a lazily created SparkSession.

    The Elasticsearch options are kept in a dict keyed by the connector option
    names (``es.nodes``, ``es.port``, ``es.input.json``, ``es.nodes.wan.only``).
    The SparkSession is created on the first call to
    ``initialize_spark_session`` and reused afterwards.
    """

    def __init__(self,
                 elasticsearch_host,
                 elasticsearch_port,
                 elasticsearch_input_json,
                 elasticsearch_nodes_wan_only,
                 hdfs_namenode,
                 spark_master="spark://spark-master:7077",
                 es_hadoop_jar="/elasticsearch-hadoop-7.15.1.jar"
                 ):
        """Store the connection settings; no connection is opened here.

        Input:

        elasticsearch_host: value stored under 'es.nodes'
        elasticsearch_port: value stored under 'es.port'
        elasticsearch_input_json: value stored under 'es.input.json'
        elasticsearch_nodes_wan_only: value stored under 'es.nodes.wan.only'
        hdfs_namenode: base URL of the HDFS namenode
        spark_master: master URL handed to the SparkSession builder
        es_hadoop_jar: path of the elasticsearch-hadoop jar, used for both
            'spark.jars' and 'spark.driver.extraClassPath'
        """
        self.elasticsearch_conf = {
            'es.nodes': elasticsearch_host,
            'es.port': elasticsearch_port,
            "es.input.json": elasticsearch_input_json,
            "es.nodes.wan.only": elasticsearch_nodes_wan_only
        }
        self.hdfs_namenode = hdfs_namenode
        self.spark_master = spark_master
        self.es_hadoop_jar = es_hadoop_jar
        # Created on demand by initialize_spark_session().
        self.spark_app = None

    def get_elasticsearch_conf(self):
        """Return the dict of Elasticsearch connector options."""
        return self.elasticsearch_conf

    def get_hdfs_namenode(self):
        """Return the HDFS namenode base URL given at construction."""
        return self.hdfs_namenode

    def initialize_spark_session(self, appName):
        """Return the SparkSession for this config, creating it on first use.

        Input:

        appName: application name passed to the session builder; it is only
            used by the call that actually builds the session.

        Note that 'es.input.json' is stored in the config dict but is not
        forwarded to the Spark configuration here.
        """
        # Identity check: None is a singleton, and `==` could be overridden.
        if self.spark_app is None:
            self.spark_app = (SparkSession
                        .builder.master(self.spark_master)
                        .appName(appName)
                        .config("spark.jars", self.es_hadoop_jar)
                        .config("spark.driver.extraClassPath", self.es_hadoop_jar)
                        .config("spark.es.nodes", self.elasticsearch_conf["es.nodes"])
                        .config("spark.es.port", self.elasticsearch_conf["es.port"])
                        .config("spark.es.nodes.wan.only", self.elasticsearch_conf["es.nodes.wan.only"])
                        .getOrCreate())
        return self.spark_app

def save_dataframes_to_hdfs(path,config,data_dfs,target_file_names):
    """
        Write each dataframe to HDFS at <namenode>/<path>/<target_file_name>.

        Input:

        path: the directory path (below the namenode root) to store dataframes to
        config: Config object; only get_hdfs_namenode() is used
        data_dfs: list of PySpark DataFrames to write
        target_file_names: list of names to store the dataframes by, paired
            positionally with data_dfs (zip stops at the shorter list)

        Anything already present at a target location is overwritten.
    """

    for data_df,target_file_name in zip(data_dfs,target_file_names):
        print("Processing file: ",target_file_name)
        print("Processing dataframe of type ",type(data_df))
        target_location = config.get_hdfs_namenode()+"/"+path+"/"+target_file_name
        # DataFrameWriter has no `on` attribute; mode() is called directly on
        # the writer returned by DataFrame.write.
        # NOTE(review): save() without format() uses Spark's default data
        # source, regardless of the extension in the target name -- confirm
        # that this is the intended output format.
        data_df.write.mode("overwrite").save(target_location)


# Schema applied when reading the raw recruitment JSON: five nullable string
# fields, declared in the same order as before.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("Mô tả công việc", StringType(), True)
    .add("Yêu cầu ứng viên", StringType(), True)
    .add("Quyền lợi", StringType(), True)
    .add("Cách thức ứng tuyển", StringType(), True)
)

def extract_framework_plattform(mo_ta_cong_viec,yeu_cau_ung_vien):
    """Return the framework/platform patterns that occur in a job posting.

    Input:

    mo_ta_cong_viec: job description text, or None
    yeu_cau_ung_vien: candidate requirements text, or None

    Each entry of ``patterns.framework_plattforms`` is used as a regular
    expression and searched case-insensitively in the two texts joined by a
    space. The matching entries are returned as a list, in the order they
    appear in ``patterns.framework_plattforms``.
    """
    # The source columns are nullable; a missing value is treated as empty
    # text instead of raising TypeError on concatenation.
    posting_text = (mo_ta_cong_viec or "") + " " + (yeu_cau_ung_vien or "")
    return [framework for framework in patterns.framework_plattforms if re.search(framework, posting_text, re.IGNORECASE)]

def extract_language(mo_ta_cong_viec,yeu_cau_ung_vien):
    """Return the language names that occur in a job posting.

    Input:

    mo_ta_cong_viec: job description text, or None
    yeu_cau_ung_vien: candidate requirements text, or None

    Each entry of ``patterns.languages`` is turned into a regular expression
    by escaping ``+``, ``(`` and ``)`` (all other characters keep their regex
    meaning) and searched case-insensitively in the two texts joined by a
    space. The matching entries are returned unescaped, in the order they
    appear in ``patterns.languages``.
    """
    # The source columns are nullable; a missing value is treated as empty
    # text instead of raising TypeError on concatenation.
    posting_text = (mo_ta_cong_viec or "") + " " + (yeu_cau_ung_vien or "")
    # Raw strings keep the backslashes without relying on invalid escape
    # sequences such as "\+" in a normal string literal.
    return [
        language
        for language in patterns.languages
        if re.search(
            language.replace("+", r"\+").replace("(", r"\(").replace(")", r"\)"),
            posting_text,
            re.IGNORECASE,
        )
    ]

def extract_knowledge(mo_ta_cong_viec,yeu_cau_ung_vien):
    """Return the knowledge patterns that occur in a job posting.

    Input:

    mo_ta_cong_viec: job description text, or None
    yeu_cau_ung_vien: candidate requirements text, or None

    Each entry of ``patterns.knowledges`` is used as a regular expression and
    searched case-insensitively in the two texts joined by a space. The
    matching entries are returned as a list, in the order they appear in
    ``patterns.knowledges``.
    """
    # The source columns are nullable; a missing value is treated as empty
    # text instead of raising TypeError on concatenation.
    posting_text = (mo_ta_cong_viec or "") + " " + (yeu_cau_ung_vien or "")
    return [knowledge for knowledge in patterns.knowledges if re.search(knowledge, posting_text, re.IGNORECASE)]

if __name__ == "__main__":
    # Entry point: read the raw recruitment JSON from HDFS, extract the
    # frameworks/platforms, languages and knowledge areas mentioned in each
    # posting, and write the result back to HDFS.

    APP_NAME="PreprocessData"

    app_config = Config(elasticsearch_host="elasticsearch",
                               elasticsearch_port="9200",
                               elasticsearch_input_json="yes",
                               elasticsearch_nodes_wan_only="true",
                               hdfs_namenode="hdfs://namenode:9000"
                               )
    spark = app_config.initialize_spark_session(APP_NAME)
    sc = spark.sparkContext

    # Take the namenode from the config so input and output use the same
    # address instead of repeating the URL here.
    # NOTE(review): the namenode host name must be resolvable from the
    # machine running the driver.
    raw_data_location = app_config.get_hdfs_namenode() + "/data/rawdata/*.json"
    raw_recruit_df = spark.read.schema(schema).option("multiline","true").json(raw_data_location)
    # raw_recruit_df.show(5)

    # The extract_* helpers are plain Python functions working on strings and
    # returning lists of strings; they must be wrapped as UDFs before they can
    # be applied to DataFrame columns and aliased.
    string_list_type = ArrayType(StringType())
    framework_plattform_udf = udf(extract_framework_plattform, string_list_type)
    language_udf = udf(extract_language, string_list_type)
    knowledge_udf = udf(extract_knowledge, string_list_type)

    job_description_col = raw_recruit_df["Mô tả công việc"]
    candidate_requirement_col = raw_recruit_df["Yêu cầu ứng viên"]

    extracted_recruit_df=raw_recruit_df.select(raw_recruit_df["name"].alias("CompanyName"),
          framework_plattform_udf(job_description_col, candidate_requirement_col).alias("FrameworkPlattforms"),
          language_udf(job_description_col, candidate_requirement_col).alias("Languages"),
          knowledge_udf(job_description_col, candidate_requirement_col).alias("Knowledges"),
          )
    extracted_recruit_df.cache()
    # extracted_recruit_df.show(5)

    ##========save extracted_recruit_df to hdfs========================
    df_to_hdfs=(extracted_recruit_df,)
    df_hdfs_name = ("extracted_recruit.json",)
    save_dataframes_to_hdfs("data/extracteddata", app_config, df_to_hdfs, df_hdfs_name)


IllegalArgumentException: java.net.UnknownHostException: namenode