In [1]:
import glob
import os
import sys

# Environment that PySpark reads at start-up. The paths are hard-wired to the
# lab cluster layout; adjust them here if the notebook runs elsewhere.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'

spark_home = os.environ["SPARK_HOME"]
spark_python_dir = os.path.join(spark_home, 'python')

# Pick up whichever py4j source archive ships with this Spark install instead
# of pinning one version; fall back to the previously hard-coded name when no
# archive is found (e.g. SPARK_HOME does not exist on this machine).
py4j_archives = sorted(
    glob.glob(os.path.join(spark_python_dir, 'lib', 'py4j-*-src.zip'))
)
if py4j_archives:
    py4j_archive = py4j_archives[-1]
else:
    py4j_archive = os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip')

# Same order as before: the py4j archive ends up ahead of the pyspark sources.
sys.path.insert(0, spark_python_dir)
sys.path.insert(0, py4j_archive)

In [2]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (application name "lab_01")
# and keep a handle on its SparkContext.
spark = (
    SparkSession.builder
    .appName("lab_01")
    .getOrCreate()
)
sc = spark.sparkContext

# Bare trailing expression so the notebook renders the context summary.
sc

In [855]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DoubleType, ArrayType, DecimalType
from pyspark.sql.functions import explode, col, udf, count, pow

from urllib.parse import urlparse, unquote
import re

# Обработка логов

In [788]:
# Layout of the tab-separated visit log (no header row): user id, event
# timestamp and the raw URL, which is still percent-encoded at this stage.
schema = StructType(
    [
        StructField("UID", LongType()),
        StructField("Timestamp", DoubleType()),
        StructField("URL", StringType()),
    ]
)

# The schema is supplied explicitly, so schema inference is not requested:
# the two options contradict each other and the explicit schema is the one
# that is meant to apply.
df_logs = (
    spark.read.format('csv')
    .schema(schema)
    .options(header=False, sep='\t')
    .load('/mf-labs/laba01/logs')
)

In [455]:
# Peek at the first three raw log rows.
df_logs.show(n=3)

+------------+----------------+--------------------+
|         UID|       Timestamp|                 URL|
+------------+----------------+--------------------+
|258131083251|1.423724291637E9|http%3A%2F%2Fwww....|
|258131083251|1.423724290637E9|                null|
|182984926014|1.423724291666E9|http%3A%2F%2Fwww....|
+------------+----------------+--------------------+
only showing top 3 rows



In [867]:
def parse_url(url):
    """Return the host of a (possibly percent-encoded) URL without a leading ``www.``.

    The URL is percent-decoded, split with ``urlparse``, and the ``netloc``
    component is returned with one leading ``www.`` prefix removed.

    Returns ``None`` when ``url`` is ``None`` or when ``urlparse`` rejects it,
    so that one malformed record produces a null value instead of an
    exception.
    """
    if url is None:
        return None
    try:
        netloc = urlparse(unquote(url)).netloc
    except ValueError:
        # urlparse raises ValueError for some malformed hosts,
        # e.g. an unclosed "[" in the authority part.
        return None
    # Raw string: a backslash-dot in a plain literal is an invalid escape.
    return re.sub(r'^www\.', '', netloc)

# Expose the plain-Python parser as a Spark UDF that yields a string column.
parse_url_udf = udf(parse_url, StringType())

# Keep rows whose URL starts with "http", add the extracted domain, and drop
# every row that has a null in any column.
df_logs_parsed = (
    df_logs
    .where(col('URL').like('http%'))
    .withColumn('domain', parse_url_udf(col('URL')))
    .dropna()
)

# Mark the parsed frame for caching; as the last expression it also makes the
# notebook print the frame's schema summary.
df_logs_parsed.cache()

DataFrame[UID: bigint, Timestamp: double, URL: string, domain: string]

In [790]:
# Print the leading rows (show's default of 20) of the parsed log.
df_logs_parsed.show(n=20)

+------------+----------------+--------------------+-----------------+
|         UID|       Timestamp|                 URL|           domain|
+------------+----------------+--------------------+-----------------+
|258131083251|1.423724291637E9|http%3A%2F%2Fwww....|         zakon.kz|
|182984926014|1.423724291666E9|http%3A%2F%2Fwww....|       bmwclub.ru|
|182984926014|1.423724290666E9|http%3A%2F%2Fwww....|       bmwclub.ru|
|289380960531|1.423724291723E9|http%3A%2F%2Fwww....|       bmwclub.ru|
|289380960531|1.423724290723E9|http%3A%2F%2Fwww....|       bmwclub.ru|
| 85356233460|1.423724290923E9|http%3A//www.wmma...|        wmmail.ru|
|204109491902|1.423724291935E9|https%3A%2F%2Fvk....|           vk.com|
|204109491902|1.423724290935E9|https%3A%2F%2Fvk....|           vk.com|
|302580370974|1.423724291839E9|http%3A%2F%2Fwww....|  novayagazeta.ru|
|160372190500|1.423724291901E9|http%3A%2F%2Fforu...|forum.krasmama.ru|
|160372190500|1.423724290901E9|http%3A%2F%2Fforu...|forum.krasmama.ru|
|31013

# Обработка пользователей авто

In [791]:
# The JSON document is read as a single field, "autousers", declared as an
# array of strings.
schema = StructType([StructField("autousers", ArrayType(StringType()))])

df_autousers = (
    spark.read.format('json')
    .schema(schema)
    .load('/mf-labs/laba01/autousers.json')
)

In [792]:
# One row per array element, named UID, then cast from string to long so it
# has the same type as the UID column of the logs.
# NOTE: this rebinds df_autousers; after it has run once the "autousers"
# column is gone, so re-running this cell requires re-running the read first.
df_autousers = (
    df_autousers
    .select(explode('autousers').alias('UID'))
    .select(col('UID').cast('long'))
)

# Mark for caching; the trailing expression prints the frame's schema summary.
df_autousers.cache()

DataFrame[UID: bigint]

In [793]:
# Peek at the first three auto-user ids.
df_autousers.show(n=3)

+------------+
|         UID|
+------------+
|100341861572|
|100473724387|
|100528753939|
+------------+
only showing top 3 rows



# Расчет вероятностей

In [794]:
# Number of log rows per domain, over all users.
df_netloc_cnt_all = (
    df_logs_parsed
    .groupby('domain')
    .count()
    .withColumnRenamed('count', 'cnt_all')
)

# Number of log rows per domain restricted to auto users.
# A semi-join is a pure membership test: each log row is kept at most once,
# whereas an inner join would count a row once per occurrence of its UID in
# df_autousers if the list ever contained repeated ids.
df_logs_cnt_auto = (
    df_logs_parsed
    .join(df_autousers, on='UID', how='left_semi')
    .groupby('domain')
    .count()
    .withColumnRenamed('count', 'cnt_auto')
)

# Every domain keeps its total; domains never visited by an auto user get 0
# instead of the null produced by the left join.
df_logs_cnt = (
    df_netloc_cnt_all
    .join(df_logs_cnt_auto, on='domain', how='left')
    .na.fill({'cnt_auto': 0})
)

In [870]:
# Peek at five rows of the per-domain counters.
df_logs_cnt.show(n=5)

+--------------------+-------+--------+
|              domain|cnt_all|cnt_auto|
+--------------------+-------+--------+
|     100bestpoems.ru|     53|       0|
|          100pdf.net|      7|       0|
|              4i5.ru|      3|       0|
|             4ppc.ru|      6|       0|
|537484ab4f5af8717...|      2|       0|
+--------------------+-------+--------+
only showing top 5 rows



In [795]:
# Grand totals over all domains. The values are read from the result row by
# column name, so the assignment does not depend on the order in which a
# dict-based agg() happens to emit its columns.
totals = df_logs_cnt.groupby().agg({'cnt_all': 'sum', 'cnt_auto': 'sum'}).collect()[0]
cnt_all, cnt_auto = totals['sum(cnt_all)'], totals['sum(cnt_auto)']

In [796]:
# Display the two grand totals (all rows, auto-user rows) as a tuple.
cnt_all, cnt_auto

(6571038, 313527)

In [861]:
# relevance(domain) = cnt_auto(domain)^2 / (cnt_all(domain) * total auto rows)
koef = pow(col('cnt_auto'), 2) / (col('cnt_all') * cnt_auto)

# 20 fractional digits plus one integer digit. DecimalType(20, 20) leaves no
# room for an integer part, so a relevance of exactly 1 would not fit and
# would come out as null.
relevance = koef.cast(DecimalType(21, 20)).alias('relevance')

# Top 200 domains by relevance; ties are broken alphabetically by domain.
laba01_domains = (
    df_logs_cnt
    .select(col('domain'), relevance)
    .sort(col('relevance').desc(), col('domain').asc())
    .limit(200)
    .toPandas()
)

In [859]:
# Show the first 20 rows of the ranked result.
laba01_domains.head(n=20)

Unnamed: 0,domain,relevance
0,avto-russia.ru,0.3592553907657191
1,bmwclub.ru,0.3099862002759807
2,cars.ru,0.2066429146924646
3,passat-b5.ru,0.0025786465832002
4,auto.yandex.ru,0.0025710650150413
5,as8.ru,0.0014895048911258
6,avtogermes.ru,0.001294912463552
7,club-fx.ru,0.0011310103204102
8,spravka003.ru,0.0010179255603094
9,car-total.ru,0.0008961813469982


In [864]:
# Write the ranking as tab-separated text with neither a header row nor the
# pandas index: one "domain<TAB>relevance" line per domain.
laba01_domains.to_csv(
    'laba01_domains.txt',
    sep='\t',
    header=False,
    index=False,
)

In [25]:
# Stop the Spark session once the result has been written out.
spark.stop()

# Podval (scratch area: unused helper snippets kept for reference)

In [89]:
# NOTE(review): everything in this cell is commented out, so nothing here
# executes. It keeps two alternative ways of listing the files under an HDFS
# path: through the Hadoop FileSystem API reached via sc._jvm, and by parsing
# the output of `hdfs dfs -ls`.

# def get_file_list(path, sc=sc):
#     """Return the list of files in a folder."""

#     hadoop = sc._jvm.org.apache.hadoop
#     fs = hadoop.fs.FileSystem
#     conf = hadoop.conf.Configuration()

#     path = hadoop.fs.Path(path)
#     file_list = [str(f.getPath()).rsplit('/', 1)[1] for f in fs.get(conf).listStatus(path) if not f.isDirectory()]

#     return file_list


# import subprocess, re
# cmd = 'hdfs dfs -ls /mf-labs/laba01/logs'
# pattern = ' (/.+)'
# files = os.popen(cmd).read().strip().split('\n')
# files = list(filter(lambda x: re.search(pattern, x), files))
# files = list(map(lambda x: re.search(pattern, x).group(1).rsplit('/', 1)[1], files))
# files