In [1]:
%%configure -f
{"executorCores": 4, "executorMemory": "9486M", "conf": {"spark.default.parallelism": 1000}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
10,application_1565602868307_0011,pyspark,idle,Link,Link,


In [2]:
import pandas as pd

import gensim
from gensim.utils import tokenize
from gensim.parsing.preprocessing import remove_stopwords
import string
from langdetect import detect
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType, ArrayType, DoubleType, MapType
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
12,application_1565602868307_0013,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Load the crawler output from S3 (format is whatever the session's default data source is)
# and print its schema so the columns used by later cells are visible.
df = spark.read.load("s3://onai-ml-dev-eu-west-1/web_crawler/data/urls_and_content")
df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- company_id: string (nullable = true)
 |-- website: string (nullable = true)
 |-- description: string (nullable = true)
 |-- url: string (nullable = true)
 |-- page_title: string (nullable = true)
 |-- content_type: string (nullable = true)
 |-- raw_content: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- country: string (nullable = true)

In [4]:
def check_language(text, content_type='html'):
    """Detect the language of ``text`` using langdetect's ``detect``.

    Args:
        text: Text to classify. May be ``None`` or empty.
        content_type: Content type of the source document. Only ``"html"``
            content is analysed; anything else is skipped.

    Returns:
        The language code returned by ``detect``, or ``''`` when the content
        is not HTML, the text is empty/``None``, or detection fails.
    """
    # Nothing to detect for non-HTML content or missing text; skip the detector call.
    if content_type != "html" or not text:
        return ''

    try:
        return detect(text)
    except Exception:
        # Best effort: a detection failure yields "unknown" ('') instead of failing the
        # whole job. Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return ''
    
# Expose check_language as a Spark UDF returning a string column.
check_language_udf = udf(check_language, StringType())

# Tag every row with the detected language of its raw content.
# text_lang is '' when the row is not HTML or when detection failed.
df_preprocess_lang = df.withColumn("text_lang", check_language_udf("raw_content", "content_type"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
def process_text(text, text_lang):
    """Turn English text into a list of lower-cased alphabetic tokens.

    Stop words are removed with gensim's ``remove_stopwords``, the remainder is
    tokenized with gensim's ``tokenize(..., lower=True)``, and only purely
    alphabetic tokens longer than one character are kept.

    Args:
        text: Raw text of the document. May be ``None`` or empty.
        text_lang: Language code of ``text``; only ``'en'`` is processed.

    Returns:
        list of str: The retained tokens. An empty list (never a string) when
        the text is not English or is empty, so the value always matches the
        ``ArrayType(StringType())`` the UDF is registered with.
    """
    # Return an empty *list* for skipped rows: the UDF return type is an array, and an
    # empty array explodes to zero rows downstream.
    if text_lang != 'en' or not text:
        return []

    # NOTE(review): remove_stopwords runs before lower-casing/tokenizing, so capitalised or
    # punctuation-adjacent stop words may survive it — confirm against the gensim version used.
    text_no_stopwords = remove_stopwords(text)
    words = tokenize(text_no_stopwords, lower=True)
    return [word for word in words
            if word not in string.punctuation and word.isalpha() and len(word) > 1]

# Expose process_text as a Spark UDF producing an array of token strings.
process_text_udf = udf(process_text, ArrayType(StringType()))

# Give every row a surrogate id so token rows can be grouped per source row and joined back later.
# NOTE(review): monotonically_increasing_id() is evaluated lazily and this frame is neither cached
# nor checkpointed, yet it is used again for the later join on "idx" — confirm the ids are stable
# across re-evaluation (or persist the frame).
df_preprocess_lang = df_preprocess_lang.withColumn("idx", F.monotonically_increasing_id())

# Tokenize each row, then explode the token array into one row per token occurrence.
df_preprocess = (df_preprocess_lang
                 .withColumn("tokenized_text", process_text_udf("raw_content", "text_lang"))
                 .withColumn('token', F.explode('tokenized_text'))
                )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Document frequency per token: the number of distinct source rows (idx) in which it occurs.
# Only tokens occurring in more than 10 rows are kept.
df_tokens = (df_preprocess.groupBy("token")
             .agg(F.countDistinct("idx").alias('df'))
             .filter(F.col('df') > 10)
            )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
#num_doc = df_preprocess_lang.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
import math
def calcIdf(doc_count, df):
    """Compute a smoothed inverse document frequency.

    Args:
        doc_count: Total number of documents.
        df: Number of documents containing the token.

    Returns:
        float: ``log((doc_count + 1) / (df + 1))`` (natural logarithm).
    """
    # Use float operands so the ratio is never truncated by integer division,
    # whichever Python version the executors run.
    return math.log((doc_count + 1.0) / (df + 1.0))

# Expose calcIdf as a Spark UDF returning a double.
calIdf_udf = udf(calcIdf, DoubleType())

# Attach an IDF weight to every retained token.
# df_preprocess_lang.count() is an action: it runs a Spark job when this cell executes, and the
# total it returns covers every row of df_preprocess_lang (whatever its text_lang).
df_idf = df_tokens.withColumn("idf", calIdf_udf(F.lit(df_preprocess_lang.count()), 'df'))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Term frequency: number of exploded token rows per (source row, token) pair.
df_all_preprocess_tf = df_preprocess.groupBy("idx", "token").agg(F.count("tokenized_text").alias("tf"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Merge a list of maps into a single map; when a key occurs more than once the later entry wins.
combineMap = udf(lambda maps: {key:f[key] for f in maps for key in f}, MapType(StringType(),DoubleType()))    

# Build one {token: tf-idf} map per source row (idx).
df_all_preprocess_tf_idf = (df_all_preprocess_tf
      # Left join: tokens absent from df_idf (filtered out by the df threshold) keep a null idf,
      # so their tf_idf below is null as well.
      .join(df_idf, "token", "left")
      .withColumn("tf_idf", F.col("tf") * F.col("idf"))
      # Wrap each score in a single-entry map so the scores can be collected per row ...
      .withColumn("tf_idf_token", F.create_map("token","tf_idf"))
      .groupBy("idx")
      .agg(F.collect_list("tf_idf_token").alias("tf_idf_text_list"))
      # ... and merged into one map.
      .withColumn("tf_idf_text", combineMap("tf_idf_text_list"))
      .drop("tf_idf_text_list")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Attach the per-row tf-idf map back onto the original rows via the surrogate id.
# NOTE(review): join() without a `how` argument is an inner join, so rows that produced no tokens
# (and therefore have no entry in df_all_preprocess_tf_idf) are dropped here — confirm intended.
df_tf_idf = df_preprocess_lang.join(df_all_preprocess_tf_idf, "idx")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Inspect the schema after adding text_lang, idx and the tf_idf_text map.
df_tf_idf.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- idx: long (nullable = false)
 |-- company_id: string (nullable = true)
 |-- website: string (nullable = true)
 |-- description: string (nullable = true)
 |-- url: string (nullable = true)
 |-- page_title: string (nullable = true)
 |-- content_type: string (nullable = true)
 |-- raw_content: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- text_lang: string (nullable = true)
 |-- tf_idf_text: map (nullable = true)
 |    |-- key: string
 |    |-- value: double (valueContainsNull = true)

In [15]:
def process_too_long_text(text):
    """Shorten ``text`` to at most its first three sentences.

    Sentences are produced by gensim's ``get_sentences`` and re-joined with
    single spaces.
    """
    sentences = list(gensim.summarization.textcleaner.get_sentences(text))
    # Slicing already copes with fewer than three sentences.
    return ' '.join(sentences[:3])

def extract_description_pattern_is(text, company_name):
    """Extract a description following the pattern ``<Company name> is ...``.

    Looks at the first occurrence of ``company_name`` in ``text``. If ``' is '``
    appears within the 30 characters after the name, the text from the name up
    to the next ``*``, ``#``, ``|`` or ``>`` (or the end of the text) is split
    into sentences and the first sentence is taken as the candidate. When that
    sentence ends with ``Inc.`` or ``U.S.`` the following sentence is appended,
    since the split was presumably caused by the abbreviation.

    Args:
        text: Page text to search. May be ``None`` or empty.
        company_name: Company name to look for. May be ``None`` or empty.

    Returns:
        str: The candidate with newlines replaced by spaces if it has more than
        five whitespace-separated words, otherwise ``''``.
    """
    # Both inputs come from nullable columns: `None in text` raises TypeError, and an
    # empty name would trivially "match" at index 0.
    if not text or not company_name:
        return ''

    idx = text.find(company_name)
    if idx == -1:
        return ''

    name_end = idx + len(company_name)
    # The verb has to follow the name closely (within 30 characters).
    if ' is ' not in text[idx:name_end + 30]:
        return ''

    # Cut the candidate at the first markup-like symbol after the name.
    end_idx = len(text)
    for sym in ['*', '#', '|', '>']:
        sym_idx = text.find(sym, name_end)
        if sym_idx != -1:
            end_idx = min(end_idx, sym_idx)

    sent = list(gensim.summarization.textcleaner.get_sentences(text[idx:end_idx]))
    if not sent:
        # The sentence splitter found nothing usable in the extracted span.
        return ''

    extracted_desc = sent[0]
    if extracted_desc.split(' ')[-1] in {'Inc.', 'U.S.'} and len(sent) > 1:
        extracted_desc += ' ' + sent[1]

    # Very short candidates are rejected.
    if len(extracted_desc.split()) > 5:
        return extracted_desc.replace('\n', ' ')
    return ''


def extract_description_pattern_who_we_are(text):
    """Extract a description from an "about"-style section of ``text``.

    A line containing one of the heading markers (matched case-insensitively as
    a substring) opens a section. The section body is the first run of
    non-empty lines after the heading; it is shortened with
    ``process_too_long_text`` and accepted only if it has more than five words
    and does not start with ``*``. If the body's first word starts with ``#``,
    that first word is dropped.

    Returns:
        str: The accepted section body, or ``''`` if none was accepted.
    """
    markers = ('# who we are', '# overview', '# company overview', '# about', '# mission',
               '# our mission')
    lines = text.split('\n')
    total = len(lines)
    description = ''

    for position, raw_line in enumerate(lines):
        lowered = raw_line.lower()
        if not any(marker in lowered for marker in markers):
            continue

        # Skip blank lines directly below the heading.
        first = position + 1
        while first < total and lines[first] == '':
            first += 1
        if first >= total:
            continue

        # The body runs until the next blank line (or the end of the text).
        last = first + 1
        while last < total and lines[last] != '':
            last += 1

        candidate = process_too_long_text(' '.join(lines[first:last]))
        words = candidate.split()
        if len(words) <= 5 or words[0][0] == '*':
            continue
        if words[0][0] == '#':
            candidate = ' '.join(words[1:])

        # NOTE(review): scanning continues after a hit, so a later matching section
        # replaces this one — confirm that last-match-wins is intended.
        description = candidate

    return description

def extract_description(company_name, meta_description, text, text_lang):
    """Build a ``[short_description, long_description]`` pair for one page.

    The meta description is used as the short description when it is detected
    as English and has more than three words. For English page text, the long
    description comes from the "<name> is ..." pattern, falling back to the
    "who we are" section pattern. When there is no usable meta description,
    the extracted long description is promoted to the short slot and the long
    slot is left empty.

    Returns:
        list of str: ``[short_description, long_description]``; either may be ``''``.
    """
    short_description = ''
    # Only trust the meta description if it is English and has more than 3 tokens.
    if check_language(meta_description) == 'en' and len(meta_description.split()) > 3:
        short_description = meta_description

    long_description = ''
    if text and text_lang == 'en':
        long_description = (extract_description_pattern_is(text, company_name)
                            or extract_description_pattern_who_we_are(text))

    if short_description:
        return [short_description, long_description]
    # No meta description: promote the extracted text to the short slot.
    return [long_description, '']

# Expose extract_description as a Spark UDF; the result is a two-element string array
# [short_description, long_description].
extract_description_udf = udf(extract_description, ArrayType(StringType()))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Run description extraction per row, then split the returned two-element array into
# separate short/long columns and drop the temporary array column.
df_result = (df_tf_idf.withColumn("extracted_description", extract_description_udf("company_name", "description", "raw_content", "text_lang"))
                  .withColumn("short_description", F.col("extracted_description")[0])
                  .withColumn("long_description", F.col("extracted_description")[1])
                  .drop("extracted_description"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
# Inspect the schema after adding short_description and long_description.
df_result.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- idx: long (nullable = false)
 |-- company_id: string (nullable = true)
 |-- website: string (nullable = true)
 |-- description: string (nullable = true)
 |-- url: string (nullable = true)
 |-- page_title: string (nullable = true)
 |-- content_type: string (nullable = true)
 |-- raw_content: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- text_lang: string (nullable = true)
 |-- tf_idf_text: map (nullable = true)
 |    |-- key: string
 |    |-- value: double (valueContainsNull = true)
 |-- short_description: string (nullable = true)
 |-- long_description: string (nullable = true)

In [18]:
def select_description(urls, descriptions):
    """Pick the non-empty description belonging to the shortest URL.

    ``urls`` and ``descriptions`` are paired positionally. Pairs are ordered by
    URL length (ties keep their original order) and the first non-empty
    description is returned, or ``''`` when every description is empty.
    """
    candidates = list(zip(urls, descriptions))
    candidates.sort(key=lambda pair: len(pair[0]))
    return next((text for _, text in candidates if text), '')
    
# Expose select_description as a Spark UDF returning a string.
select_description_udf = udf(select_description, StringType())

from collections import Counter
def extract_keywords(tf_idf_list, top_n=10):
    """Return the highest-scoring words across several tf-idf maps.

    Scores of the same word are summed over all maps. Missing (``None``) and
    zero scores are ignored, so such words are only returned if they have a
    non-zero score in some other map.

    Args:
        tf_idf_list: Iterable of ``{word: tf_idf}`` mappings; ``None`` and
            ``None`` entries are tolerated.
        top_n: Maximum number of keywords to return (default 10).

    Returns:
        list of str: Up to ``top_n`` words ordered by descending total score.
    """
    scores = Counter()
    for page_scores in tf_idf_list or []:
        if not page_scores:
            continue
        # Distinct names for the map and its values: the score must not shadow the map
        # that is being iterated.
        for word, score in page_scores.items():
            if score:
                scores[word] += score

    return [word for word, _ in scores.most_common(top_n)]

# Expose extract_keywords as a Spark UDF returning an array of keyword strings.
extract_keywords_udf = udf(extract_keywords, ArrayType(StringType()))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# Collapse the per-URL rows into one row per company: choose one short and one long description
# (preferring the shortest URL) and derive keywords from all of the company's tf-idf maps.
# NOTE(review): select_description pairs "urls" with the description lists by position. The lists
# come from separate collect_list() calls, and collect_list omits nulls — confirm url and the
# description columns are never null here (or collect a struct) so the lists stay aligned.
df_new=(df_result
 .groupBy("company_id", "website", "company_name", "country")
 .agg(F.collect_list("url").alias("urls"), F.collect_list("short_description").alias("short_descriptions"),
      F.collect_list("long_description").alias("long_descriptions"),
      F.collect_list("tf_idf_text").alias("tf_idf_text_list"))
 .withColumn("description", select_description_udf("urls", "short_descriptions"))
 .withColumn("long_description", select_description_udf("urls", "long_descriptions"))
 .withColumn("keywords", extract_keywords_udf("tf_idf_text_list"))
 # The intermediate list columns are only inputs to the UDFs above.
 .drop("urls", "short_descriptions", "long_descriptions", "tf_idf_text_list")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
# Inspect the schema of the per-company result.
df_new.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- company_id: string (nullable = true)
 |-- website: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- description: string (nullable = true)
 |-- long_description: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)

In [21]:
# Persist the per-company result as parquet, replacing any existing data at the target path.
# The frame is repartitioned into 1000 partitions before writing.
(df_new
 .repartition(1000)
 .write
 .parquet("s3://ai-data-lake-dev-eu-west-1/staging/crawler/capiq/keywords", mode="overwrite")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
# Read the stored result back from S3. This rebinds df_new: from here on it refers to the
# persisted data, not the lazily defined pipeline above.
df_new = spark.read.load("s3://ai-data-lake-dev-eu-west-1/staging/crawler/capiq/keywords")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [23]:
# Number of rows (one per company grouping) in the stored result.
df_new.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

235526

In [24]:
# Pull up to 1000 rows into a pandas DataFrame for manual inspection; limit() gives no
# ordering guarantee, so this is not a reproducible or random sample.
df_pd = df_new.limit(1000).toPandas()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# List the columns available in the pandas sample.
df_pd.columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Index(['company_id', 'website', 'company_name', 'country', 'description',
       'long_description', 'keywords'],
      dtype='object')

In [26]:
# Eyeball the keywords of the first 500 sampled companies; the pandas row display limit is
# raised only for the duration of this block.
with pd.option_context('display.max_rows', 500): 
    df_pd[['website', 'keywords']].iloc[0:500]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                                               website                                           keywords
0                                http://www.taymer.com  [cable, wire, length, measurement, inspection,...
1                               http://www.fieldav.com  [aircraft, aviation, avionics, missionization,...
2                http://www.newafricanproperties.co.bw  [nap, botswana, properties, bse, african, unde...
3                                   http://www.ily.com  [duplicator, hdd, sd, ily, dupe, usb, spartan,...
4          http://www.physicaldistributionservices.com  [allstate, labor, bloomington, delivery, mccoy...
5                     http://www.stlouisbootstores.com  [clothing, boots, mo, chaps, biker, exotics, b...
6                     http://www.adjacencypartners.com  [adjacency, tom, bryan, pdx, business, brand, ...
7                              http://www.octaware.com  [aslam, equire, technologies, software, crm, a...
8                              http://www.qtst

In [27]:
# Companies that have more than one keyword but an empty long_description (first 500 matches).
with pd.option_context('display.max_rows', 500): 
    df_pd[(df_pd['long_description'] == '') & (df_pd['keywords'].map(lambda d: len(d)) > 1)].iloc[0:500]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

    company_id  ...                                           keywords
0    276197806  ...  [cable, wire, length, measurement, inspection,...
2    139686059  ...  [nap, botswana, properties, bse, african, unde...
3     30747906  ...  [duplicator, hdd, sd, ily, dupe, usb, spartan,...
4      4470383  ...  [allstate, labor, bloomington, delivery, mccoy...
5    265953008  ...  [clothing, boots, mo, chaps, biker, exotics, b...
6    575037322  ...  [adjacency, tom, bryan, pdx, business, brand, ...
7    321216297  ...  [aslam, equire, technologies, software, crm, a...
8    145886213  ...  [qts, tf, kennametal, la, episode, shooting, t...
9    283177834  ...  [ca, repairs, contractor, repair, damage, main...
11     6946698  ...  [carpentry, masonry, performing, concrete, bud...
12   559506584  ...  [child, cranston, smithfield, school, prekinde...
13     7281652  ...  [xylem, uk, chelmsford, conducts, tennis, dunm...
14   127580557  ...  [branford, kiss, seating, ct, reviews, user, b...
15   3

In [28]:
# Number of sampled companies that ended up with exactly one keyword.
len(df_pd[df_pd['keywords'].map(lambda d: len(d)) == 1])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2