In [None]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark

In [None]:
import os
import findspark
# Export the Java and Spark install locations before findspark.init() runs.
# Both paths must match what the provisioning cell installed/unpacked.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.4.1-bin-hadoop3",
)

findspark.init()

In [None]:
from google.colab import drive
# Mount Google Drive; files under it are then reachable below this directory.
gdrive_mount_point = "/content/gdrive"
drive.mount(gdrive_mount_point)

In [None]:
from pyspark import SparkConf
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Location of the raw access log on the mounted Google Drive.
DATASET_PATH = '/content/gdrive/MyDrive/apache_logs.txt'

# Build the Spark configuration step by step: local master, app name 'lab51'.
config = SparkConf()
config.setMaster('local')
config.setAppName('lab51')

# Reuse an existing session if one is already running, otherwise create it.
session_builder = SparkSession.builder.config(conf=config)
spark = session_builder.getOrCreate()
sc = spark.sparkContext

In [None]:
# Load the raw log as text (one row per line, in a single string column
# named `value`) and print the schema to confirm it.
raw_reader = spark.read
file_df = raw_reader.text(DATASET_PATH)
file_df.printSchema()

root
 |-- value: string (nullable = true)




Phân tích (parse) từng dòng log bằng biểu thức chính quy để chuyển dữ liệu thô thành các cột `ip`, `date`, `request` và `referrer`.

In [None]:
# Pattern for one access-log line. Capture groups, in order:
#   1-3   three whitespace-free tokens at the start of the line
#   4     the bracketed timestamp with its +/-HHMM offset
#   5-7   the three tokens of the quoted request line
#   8, 9  three-digit status code and the following token
#   10    the first quoted field after them (used as the referrer)
#   11    the next quoted field
log_reg = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\S+) "(\S+)" "([^"]*)'

# Capture-group index -> output column name, in the order the columns appear.
field_groups = {1: 'ip', 4: 'date', 6: 'request', 10: 'referrer'}

logs_df = file_df.select(*[
    regexp_extract('value', log_reg, group_idx).alias(column_name)
    for group_idx, column_name in field_groups.items()
])

logs_df.printSchema()
logs_df.show()

Đếm số lần mỗi domain xuất hiện trong trường referrer của file log. Chú ý là bạn phải lọc các trường referrer trống khỏi kết quả và chỉ lấy domain của trang web đó.

In [None]:
# Count log lines per referring site and show up to 100 groups untruncated.
# regexp_extract returns '' for lines the pattern did not match, so both the
# '-' placeholder and the empty string are treated as "no referrer" and dropped;
# otherwise unmatched lines would show up as a blank group in the result.
# substring_index(..., "/", 3) keeps everything before the third "/",
# i.e. "scheme://host" for a value such as http://host/path.
logs_df \
    .where("trim(referrer) NOT IN ('-', '')") \
    .withColumn("referrer", substring_index('referrer', "/", 3)) \
    .groupBy('referrer') \
    .count() \
    .show(100, truncate=False)

+--------------------------------------+-----+
|referrer                              |count|
+--------------------------------------+-----+
|http://ijavascript.cn                 |1    |
|http://www.google.co.tz               |1    |
|http://www.google.ca                  |6    |
|https://www.google.hr                 |2    |
|https://www.google.ch                 |1    |
|http://www.google.ru                  |6    |
|http://www.raspberrypi-spanish.es     |1    |
|http://semicomplete.com               |2001 |
|http://manpages.ubuntu.com            |2    |
|http://kufli.blogspot.fr              |1    |
|http://www.bing.com                   |6    |
|http://rungie.com                     |1    |
|http://www.google.co.th               |2    |
|https://www.google.cz                 |5    |
|http://danceuniverse.ru               |3    |
|http://www.google.co.uk               |14   |
|http://www.google.rs                  |1    |
|http://kufli.blogspot.in              |1    |
|http://t.co 