In [1]:
import os
import re
import pandas as pd

In [2]:
os.chdir("..")
base_path = os.getcwd() 

In [3]:
from pyspark.sql.functions import length, col, size

In [4]:
pwd

'/Users/speedy/Desktop/spark_examples'

In [5]:
files_path = base_path + "/00_input/_profiles_txt/file-0*"

In [6]:
data_df = spark.read.text("file://" + files_path)

In [7]:
data_pd = data_df.toPandas()

In [8]:
import string

def count_text(text: str) -> int:
    return len(text) - text.count(" ")

def clean_text(text: str) -> str:
    less_space = re.sub("\s\s", " ", text.strip())
    return "".join([char for char in text if char.isalnum() or char == " "]).lower()

In [9]:
data_pd['text_len'] = data_pd['value'].apply(lambda x: count_text(x))

In [10]:
data_pd['clear_text'] = data_pd['value'].apply(lambda x: clean_text(x))

In [11]:
data_pd.head()

Unnamed: 0,value,text_len,clear_text
0,,0,
1,,0,
2,Professional Summary,19,professional summary
3,,0,
4,,0,


In [12]:
filtered_pd = data_pd[data_pd["text_len"] > 20].drop_duplicates(subset="clear_text", keep="first", inplace=False)

In [13]:
filtered_pd.head()

Unnamed: 0,value,text_len,clear_text
5,10 years of experience in data engineering and...,60,10 years of experience in data engineering and...
6,"Hadoop and RDBMS data pipelines, transformatio...",65,hadoop and rdbms data pipelines transformation...
7,a variety of data processing and transformatio...,60,a variety of data processing and transformatio...
11,▪ Performance tune Hadoop data systems and pi...,58,performance tune hadoop data systems and pip...
12,system configuration and processing using d...,62,system configuration and processing using d...


In [14]:
new_pd = filtered_pd["clear_text"]

In [15]:
new_pd.head()

5     10 years of experience in data engineering and...
6     hadoop and rdbms data pipelines transformation...
7     a variety of data processing and transformatio...
11      performance tune hadoop data systems and pip...
12       system configuration and processing using d...
Name: clear_text, dtype: object

In [16]:
new_df = spark.createDataFrame(filtered_pd)

In [17]:
profile_lines = new_df.select("clear_text")
profile_lines.show(10, False)

+-------------------------------------------------------------------------+
|clear_text                                                               |
+-------------------------------------------------------------------------+
|10 years of experience in data engineering and analytics working with    |
|hadoop and rdbms data pipelines transformation and cleansing  skilled in |
|a variety of data processing and transformation tools and data storage   |
|  performance tune hadoop data systems and pipelines with optimized      |
|   system configuration and processing using data processing tools hadoop|
|   cycle apache camel apache flume kafka apatar clover and others        |
|  worked with systems employing hive rdds dataframes                     |
|  forensic analysis with large complex data sets using realtime          |
|   analytics and distributed big data platforms                          |
|  comfortable working with hadoop distributions cloudera cloudera        |
+-----------

In [18]:
profile_lines.filter(col("clear_text").contains("forensic")).show(20, False)

+----------------------------------------------------------------------+
|clear_text                                                            |
+----------------------------------------------------------------------+
|  forensic analysis with large complex data sets using realtime       |
|     skilled in data analysis and forensic analysis using hadoop tools|
|     skilled in forensic methods of data cleaning and refining data   |
|     internal forensic incident response and threat hunting using sans|
|candidate  gcfe  giac forensic examiner certification                 |
+----------------------------------------------------------------------+



In [19]:
# profile_lines.filter(col("clear_text").contains("airflow")).show(10, False)

In [20]:
profile_lines.filter(col("clear_text").contains("python")).take(20)

[Row(clear_text='apache spark python sql oracle   cloudera hortonworks ms azure    '),
 Row(clear_text='      spark python and kinesis'),
 Row(clear_text='     wrote python scripts in jupyter to loop through daily dynamic'),
 Row(clear_text='     wrote python scripts in jupyter to pull information security logs'),
 Row(clear_text='     created python functions to parse cefformatted information security'),
 Row(clear_text='     integrated disparate versions of pythonspark to update older code'),
 Row(clear_text='     design and develop etl workflows using python and scala for processing'),
 Row(clear_text='     developed an automated employee access audit with powershell python'),
 Row(clear_text='     application logging development for elk stack with python for'),
 Row(clear_text='working extensively on hive sqoop pig and python'),
 Row(clear_text='creation of udf functions in python or scala'),
 Row(clear_text='technologies mapr hadoophdfs hive hbase spark pig java spring java ee jav