In [1]:
#
###### https://github.com/LucaCanali/Miscellaneous/blob/master/Pyspark_SQL_Magic_Jupyter/IPython_Pyspark_SQL_Magic.py
#
#
# IPython magic functions to use with Pyspark and Spark SQL
# The following code provides example shortcuts that simplify the use of SQL in pyspark
# The defined functions are:
#
# %sql <statement>          - return a Spark DataFrame for lazy evaluation of the SQL
# %sql_show <statement>     - run the SQL statement and show max_show_lines (50) lines
# %sql_display <statement>  - run the SQL statement and display the results as an HTML table
#                           - implemented by converting the result to a Pandas DataFrame; displays up to max_show_lines (50) rows
# %sql_explain <statement>  - display the execution plan of the SQL statement
#
# Use: %<magic> for line magic or %%<magic> for cell magic.
#
# Author: Luca.Canali@cern.ch
# September 2016
#

from IPython.core.magic import register_line_cell_magic

# Configuration parameters shared by the SQL magics defined below.
# max_show_lines: row limit passed to DataFrame.show() by %sql_show and to
# DataFrame.limit() by %sql_display.
max_show_lines = 50
# detailed_explain: value passed to DataFrame.explain() by %sql_explain.
# Set to False if you want to see only the physical plan when running explain.
detailed_explain = True


@register_line_cell_magic
def sql(line, cell=None):
    """Turn a SQL statement into a lazily evaluated Spark DataFrame.

    Usable as ``%sql <statement>`` (the statement comes from ``line``) or as
    ``%%sql`` (the statement comes from the cell body and ``line`` is ignored).
    Returns whatever ``sqlContext.sql`` returns for that statement.
    """
    if cell is None:
        statement = line
    else:
        statement = cell
    return sqlContext.sql(statement)

@register_line_cell_magic
def sql_show(line, cell=None):
    """Run a SQL statement and print its first ``max_show_lines`` rows.

    Usable as ``%sql_show <statement>`` or as ``%%sql_show`` with the statement
    in the cell body (in which case ``line`` is ignored). The return value is
    that of ``DataFrame.show``.
    """
    if cell is None:
        statement = line
    else:
        statement = cell
    result_df = sqlContext.sql(statement)
    return result_df.show(max_show_lines)

@register_line_cell_magic
def sql_display(line, cell=None):
    """Run a SQL statement and return the result as a Pandas DataFrame.

    The Spark result is limited to ``max_show_lines`` rows before conversion,
    so it can be displayed directly or processed further in Pandas.
    Usable as ``%sql_display <statement>`` or as ``%%sql_display`` with the
    statement in the cell body (in which case ``line`` is ignored).
    """
    if cell is None:
        statement = line
    else:
        statement = cell
    limited_df = sqlContext.sql(statement).limit(max_show_lines)
    return limited_df.toPandas()

@register_line_cell_magic
def sql_explain(line, cell=None):
    """Show the execution plan of a SQL statement.

    ``detailed_explain`` is forwarded to ``DataFrame.explain``.
    Usable as ``%sql_explain <statement>`` or as ``%%sql_explain`` with the
    statement in the cell body (in which case ``line`` is ignored).
    """
    if cell is None:
        statement = line
    else:
        statement = cell
    planned_df = sqlContext.sql(statement)
    return planned_df.explain(detailed_explain)


In [2]:
# ----------------------------------------------------------------
# Delta inputs
# ----------------------------------------------------------------
# Every Delta table used by this notebook is stored under <job_dir>/data/delta.
job_dir = "/home/notebookuser/notebooks/cv-ngrams-classifier/job-dir/"

# Storage location of each Delta table.
json_cv_file = job_dir + "data/delta/json-cv-pdf"
ngrams_cv_file = job_dir + "data/delta/cv-files-ngrams"
skills_file = job_dir + "data/delta/role_skills"

# Name under which each table is registered for Spark SQL.
json_cv_table = "pdf_cv"
ngrams_cv_table = "ngrams_cv"
skills_table = "role_skills"
######
##############################Execution##########################
import findspark
findspark.init()
#
#
import pyspark
from pyspark.sql import functions as pfunc
from pyspark.sql import SQLContext
from pyspark.sql import Window, types
import re
import pandas as pd
import numpy as np
from pandas import DataFrame
from pyspark.sql.types import IntegerType
from pyspark.sql.types import FloatType
from pyspark.sql.functions import udf
from pyspark.sql.functions import *
from scipy.stats import kstest
from scipy import stats
#
import subprocess
#
# Create the SparkContext for this notebook, or reuse the one that is already
# running. Calling the SparkContext constructor directly fails with
# "Cannot run multiple SparkContexts at once" when this cell is re-run in the
# same kernel; getOrCreate makes the cell safe to execute repeatedly.
# Note: when an existing context is reused, the appName below is not applied.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("Daily_CV_Analysis-Delta")
)
# SQL entry point used by the %sql* magics and by the queries in later cells.
sqlContext = SQLContext(sc)
#

#
# Load the three Delta tables, cache them, and expose each one to Spark SQL
# under its configured table name.
from pyspark.sql import functions as F
# Delta time travel is NOT applied: the reads below carry no "versionAsOf"
# option, so whatever version Delta resolves by default is loaded. To pin a
# version, add .option("versionAsOf", version) before .load(...).
version=1


def _load_delta_as_view(path, view_name):
    """Read a Delta table, persist it, print its schema and register it for SQL.

    Parameters:
        path: location of the Delta table to read.
        view_name: name under which the DataFrame is registered as a temporary
            view, making it addressable from ``sqlContext.sql``.

    Returns:
        The persisted DataFrame (storage level ``MEMORY_AND_DISK_2``).
    """
    frame = (
        sqlContext.read.format("delta")
        .load(path)
        .persist(pyspark.StorageLevel.MEMORY_AND_DISK_2)
    )
    frame.printSchema()
    # createOrReplaceTempView is the replacement for registerTempTable, which
    # has been deprecated since Spark 2.0.
    frame.createOrReplaceTempView(view_name)
    return frame


delta_df1 = _load_delta_as_view(json_cv_file, json_cv_table)
delta_df2 = _load_delta_as_view(ngrams_cv_file, ngrams_cv_table)
delta_df3 = _load_delta_as_view(skills_file, skills_table)
#
print("Table Loading Done")
#

root
 |-- filename: string (nullable = true)
 |-- pages: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- p_content: string (nullable = true)
 |    |    |-- page_n: string (nullable = true)

root
 |-- Filename: string (nullable = true)
 |-- pagei: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 1_grams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 2_grams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 3_grams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 4_grams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 5_grams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 6_grams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 1_counts: vector (nullable = true)
 |-- 2_counts: vector (nullable = true)
 |-- 3_counts: vector (nullable = true)
 |-- 4

In [3]:
#
####
#### Expose most frequent Terms in CV pages
####
from pyspark.sql.functions import *
from pyspark.sql.types import *
#
# Build one row per (filename, term): start from every CV that has a filename,
# expand its pages into one row per page content, then split each page on
# whitespace and expand the resulting tokens into one row per term.
mywords = (
    sqlContext.sql("select filename,pages from pdf_cv where filename IS NOT NULL ")
    .select("filename", explode("pages.p_content").alias("p_cont"))
    # Raw string: "\s" is not a valid Python escape sequence, so the non-raw
    # form triggers an invalid-escape warning; the regex itself is unchanged.
    .select("filename", explode(split(col("p_cont"), r"\s+")).alias("terms_in_pages"))
)
mywords.printSchema()
####
# Terms excluded from the term-frequency count below: each token produced by
# the whitespace split is dropped when it is one of these strings (the list is
# used with Column.isin). It mixes common English words, single characters and
# digits, punctuation/bullet symbols, fragments that look like mis-decoded
# dash/quote characters, the empty string, and whitespace-only strings.
# Several entries are repeated; duplicates do not affect a membership test.
# NOTE(review): 'g' and 'h' are absent from the single-letter entries —
# confirm whether that is intentional.
filler_words_list=['the','a','of','to','is','or','in','on','for','by','an','The','the','and','A','at','as','it','be',\
                   'your','new','my','via','that','when','their','with','you','are','It','from','can','usually',\
                   'end','up','low','was','use','find','other','Other',\
                   'i','o','a','--','-',':','•','|','●','§','&','–','.','_',';',',','(',')','/',\
                   '1','2','3','4','5','6','7','8','9','0',\
                   'a','b','c','d','e','f','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',\
                   '○','+','-','_','---','--','##','##','###','####','â€“','â€','€“','â','€“','“','',\
                   'â€”','â','€','”','€”','â€','â”',\
                   ' ','\n','\n ','  ','\n  ','   ','\n   ','    ','     ','      ','       ','        ','         ','          ','           ']
####
# Drop filler terms, count how often each remaining term occurs per file,
# and list the most frequent (file, term) pairs first.
wordCountDF = (
    mywords
    .where(~col("terms_in_pages").isin(filler_words_list))
    .groupBy("filename", "terms_in_pages")
    .count()
    .orderBy(col("count").desc())
)
# Print the 200 most frequent entries.
wordCountDF.show(200)
#

root
 |-- filename: string (nullable = true)
 |-- terms_in_pages: string (nullable = true)

+--------------------+--------------------+-----+
|            filename|      terms_in_pages|count|
+--------------------+--------------------+-----+
|CV-JoaoCerqueira-...|                Data|   83|
|CV-JoaoCerqueira-...|               using|   42|
|CV-JoaoCerqueira-...|                data|   26|
|CV-JoaoCerqueira-...|                 AWS|   24|
|CV-JoaoCerqueira-...|            solution|   21|
|CV-JoaoCerqueira-...|            services|   18|
|CV-JoaoCerqueira-...|             Science|   18|
|CV-JoaoCerqueira-...|                 Big|   15|
|CV-JoaoCerqueira-...|            Cloudera|   15|
|CV-JoaoCerqueira-...|                 ETL|   13|
|CV-JoaoCerqueira-...|              Oracle|   11|
|CV-JoaoCerqueira-...|              Python|    9|
|CV-JoaoCerqueira-...|             Perform|    9|
|CV-JoaoCerqueira-...|               Cloud|    9|
|CV-JoaoCerqueira-...|              Hadoop|    9|
|CV-Joao

In [4]:
#
def _preview(frame, rows=5):
    """Print the schema of ``frame`` and then show its first ``rows`` rows."""
    frame.printSchema()
    frame.show(rows)


# A small sample of each registered table.
data_analytics_df1 = sqlContext.sql("select * from pdf_cv limit 5")
_preview(data_analytics_df1)

data_analytics_df2 = sqlContext.sql("select * from ngrams_cv limit 5")
_preview(data_analytics_df2)

data_analytics_df3 = sqlContext.sql("select * from role_skills limit 5")
_preview(data_analytics_df3)

# CV files whose 1-grams contain the level-5 devops-engineer skill terraform.
data_analytics_df4 = sqlContext.sql(
    " select distinct(a.filename) from ngrams_cv as a, role_skills as b where b.role = 'devops engineer' AND b.level='5' AND b.skill = 'terraform' AND (array_contains(a.1_grams,b.skill)) limit 10 "
)
_preview(data_analytics_df4)
#
#

root
 |-- filename: string (nullable = true)
 |-- pages: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- p_content: string (nullable = true)
 |    |    |-- page_n: string (nullable = true)

+--------------------+--------------------+
|            filename|               pages|
+--------------------+--------------------+
|CV-JoaoCerqueira-...|[[Data Architectu...|
+--------------------+--------------------+

root
 |-- Filename: string (nullable = true)
 |-- pagei: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 1_grams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 2_grams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 3_grams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 4_grams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 5_grams: array (nullable = true)
 |    |-- element: string (containsNull

In [5]:
%%sql_display
-- CV files whose 1-, 2- or 3-grams contain the level 5 devops engineer skill terraform
SELECT DISTINCT a.filename
FROM ngrams_cv AS a, role_skills AS b
WHERE b.role = 'devops engineer'
  AND b.level = '5'
  AND b.skill = 'terraform'
  AND (
        array_contains(a.1_grams, b.skill)
     OR array_contains(a.2_grams, b.skill)
     OR array_contains(a.3_grams, b.skill)
  )
LIMIT 10

Unnamed: 0,filename
0,CV-JoaoCerqueira-2021-2022-withcerts


In [6]:
%%sql_display
-- CV files whose 1-, 2- or 3-grams contain the level 5 data engineer skill python
SELECT DISTINCT a.filename
FROM ngrams_cv AS a, role_skills AS b
WHERE b.role = 'data engineer'
  AND b.level = '5'
  AND b.skill = 'python'
  AND (
        array_contains(a.1_grams, b.skill)
     OR array_contains(a.2_grams, b.skill)
     OR array_contains(a.3_grams, b.skill)
  )
LIMIT 10

Unnamed: 0,filename
0,CV-JoaoCerqueira-2021-2022-withcerts


In [7]:
%%sql_display
-- CV files whose 1-, 2- or 3-grams contain the level 5 data engineer skill scala
SELECT DISTINCT a.filename
FROM ngrams_cv AS a, role_skills AS b
WHERE b.role = 'data engineer'
  AND b.level = '5'
  AND b.skill = 'scala'
  AND (
        array_contains(a.1_grams, b.skill)
     OR array_contains(a.2_grams, b.skill)
     OR array_contains(a.3_grams, b.skill)
  )
LIMIT 10

Unnamed: 0,filename
0,CV-JoaoCerqueira-2021-2022-withcerts


In [8]:
%%sql_display
-- (file, skill) pairs: level 5 data engineer skills found in the 1-, 2- or 3-grams of a CV
SELECT a.filename,
       b.skill
FROM ngrams_cv AS a, role_skills AS b
WHERE b.role = 'data engineer'
  AND b.level = '5'
  AND (
        array_contains(a.1_grams, b.skill)
     OR array_contains(a.2_grams, b.skill)
     OR array_contains(a.3_grams, b.skill)
  )
LIMIT 100

Unnamed: 0,filename,skill
0,CV-JoaoCerqueira-2021-2022-withcerts,gcp
1,CV-JoaoCerqueira-2021-2022-withcerts,dataproc
2,CV-JoaoCerqueira-2021-2022-withcerts,azure
3,CV-JoaoCerqueira-2021-2022-withcerts,aws
4,CV-JoaoCerqueira-2021-2022-withcerts,big data
5,CV-JoaoCerqueira-2021-2022-withcerts,hadoop
6,CV-JoaoCerqueira-2021-2022-withcerts,spark
7,CV-JoaoCerqueira-2021-2022-withcerts,cloudera
8,CV-JoaoCerqueira-2021-2022-withcerts,cdh
9,CV-JoaoCerqueira-2021-2022-withcerts,cdp


In [9]:
%%sql_display
-- Distinct level 5 skills of any role ending in engineer or architect that occur in the CV n-grams
SELECT DISTINCT b.skill
FROM ngrams_cv AS a, role_skills AS b
WHERE (b.role LIKE '%engineer' OR b.role LIKE '%architect')
  AND b.level = '5'
  AND (
        array_contains(a.1_grams, b.skill)
     OR array_contains(a.2_grams, b.skill)
     OR array_contains(a.3_grams, b.skill)
  )
LIMIT 100

Unnamed: 0,skill
0,kubernetes
1,kappa architecture
2,athena
3,azure
4,graphql
5,soa
6,eks
7,big data
8,gcp
9,api


In [10]:
%%sql_display
-- Distinct level 5 data engineer skills that occur in the 1-, 2- or 3-grams of a CV
SELECT DISTINCT b.skill
FROM ngrams_cv AS a, role_skills AS b
WHERE b.role = 'data engineer'
  AND b.level = '5'
  AND (
        array_contains(a.1_grams, b.skill)
     OR array_contains(a.2_grams, b.skill)
     OR array_contains(a.3_grams, b.skill)
  )
LIMIT 100

Unnamed: 0,skill
0,athena
1,azure
2,big data
3,gcp
4,kafka
5,glue
6,cdh
7,confluent kafka
8,cdp
9,aws glue


In [11]:
%%sql_display
-- Distinct level 5 data architect skills that occur in the 1-, 2- or 3-grams of a CV
SELECT DISTINCT b.skill
FROM ngrams_cv AS a, role_skills AS b
WHERE b.role = 'data architect'
  AND b.level = '5'
  AND (
        array_contains(a.1_grams, b.skill)
     OR array_contains(a.2_grams, b.skill)
     OR array_contains(a.3_grams, b.skill)
  )
LIMIT 100

Unnamed: 0,skill
0,azure
1,gcp
2,graph
3,redshift
4,aws
5,oracle


In [12]:
%%sql_display
-- Distinct level 5 solution architect skills that occur in the 1-, 2- or 3-grams of a CV
SELECT DISTINCT b.skill
FROM ngrams_cv AS a, role_skills AS b
WHERE b.role = 'solution architect'
  AND b.level = '5'
  AND (
        array_contains(a.1_grams, b.skill)
     OR array_contains(a.2_grams, b.skill)
     OR array_contains(a.3_grams, b.skill)
  )
LIMIT 100

Unnamed: 0,skill
0,kubernetes
1,kappa architecture
2,graphql
3,soa
4,eks
5,api
6,gke
7,lambda architecture
8,aks


In [13]:
%%sql_display
-- Distinct level 5 devops engineer skills that occur in the 1-, 2- or 3-grams of a CV
SELECT DISTINCT b.skill
FROM ngrams_cv AS a, role_skills AS b
WHERE b.role = 'devops engineer'
  AND b.level = '5'
  AND (
        array_contains(a.1_grams, b.skill)
     OR array_contains(a.2_grams, b.skill)
     OR array_contains(a.3_grams, b.skill)
  )
LIMIT 100

Unnamed: 0,skill
0,terraform
1,bash
2,scripting


In [14]:
%%sql_display
-- Distinct level 5 data scientist skills that occur in the 1-, 2- or 3-grams of a CV
SELECT DISTINCT b.skill
FROM ngrams_cv AS a, role_skills AS b
WHERE b.role = 'data scientist'
  AND b.level = '5'
  AND (
        array_contains(a.1_grams, b.skill)
     OR array_contains(a.2_grams, b.skill)
     OR array_contains(a.3_grams, b.skill)
  )
LIMIT 100

Unnamed: 0,skill
0,data science
1,pyspark
2,r


In [15]:
%%sql_display
-- Full n-gram rows joined to the level 3 data engineer skill python wherever it occurs in the 1-, 2- or 3-grams
SELECT *
FROM ngrams_cv AS a, role_skills AS b
WHERE b.role = 'data engineer'
  AND b.level = '3'
  AND b.skill = 'python'
  AND (
        array_contains(a.1_grams, b.skill)
     OR array_contains(a.2_grams, b.skill)
     OR array_contains(a.3_grams, b.skill)
  )
LIMIT 10

Unnamed: 0,Filename,pagei,1_grams,2_grams,3_grams,4_grams,5_grams,6_grams,1_counts,2_counts,3_counts,4_counts,5_counts,6_counts,features,skill,role,level
0,CV-JoaoCerqueira-2021-2022-withcerts,"[business, or, sector, consultancy, services, ...","[business, or, sector, consultancy, services, ...","[business or, or sector, sector consultancy, c...","[business or sector, or sector consultancy, se...","[business or sector consultancy, or sector con...","[business or sector consultancy services, or s...","[business or sector consultancy services via, ...","(0.0, 19.0, 12.0, 8.0, 7.0, 5.0, 2.0, 0.0, 6.0...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 19.0, 12.0, 8.0, 7.0, 5.0, 2.0, 0.0, 6.0...",python,data engineer,3
1,CV-JoaoCerqueira-2021-2022-withcerts,"[jenkins, jupyter, python, pandas, postgresql,...","[jenkins, jupyter, python, pandas, postgresql,...","[jenkins jupyter, jupyter python, python panda...","[jenkins jupyter python, jupyter python pandas...","[jenkins jupyter python pandas, jupyter python...","[jenkins jupyter python pandas postgresql, jup...",[jenkins jupyter python pandas postgresql apac...,"(0.0, 17.0, 14.0, 10.0, 12.0, 9.0, 10.0, 3.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 17.0, 14.0, 10.0, 12.0, 9.0, 10.0, 3.0, ...",python,data engineer,3
2,CV-JoaoCerqueira-2021-2022-withcerts,"[azure, cloud, using, services, of, azure, sql...","[azure, cloud, using, services, of, azure, sql...","[azure cloud, cloud using, using services, ser...","[azure cloud using, cloud using services, usin...","[azure cloud using services, cloud using servi...","[azure cloud using services of, cloud using se...","[azure cloud using services of azure, cloud us...","(0.0, 18.0, 16.0, 21.0, 16.0, 3.0, 11.0, 4.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 18.0, 16.0, 21.0, 16.0, 3.0, 11.0, 4.0, ...",python,data engineer,3
3,CV-JoaoCerqueira-2021-2022-withcerts,"[1, design, of, processes, and, setup, of, ser...","[1, design, of, processes, and, setup, of, ser...","[1 design, design of, of processes, processes ...","[1 design of, design of processes, of processe...","[1 design of processes, design of processes an...","[1 design of processes and, design of processe...","[1 design of processes and setup, design of pr...","(0.0, 25.0, 16.0, 25.0, 25.0, 7.0, 11.0, 4.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 25.0, 16.0, 25.0, 25.0, 7.0, 11.0, 4.0, ...",python,data engineer,3
4,CV-JoaoCerqueira-2021-2022-withcerts,"[3, 0, x, 15, designing, of, etl, applications...","[3, 0, x, 15, designing, of, etl, applications...","[3 0, 0 x, x 15, 15 designing, designing of, o...","[3 0 x, 0 x 15, x 15 designing, 15 designing o...","[3 0 x 15, 0 x 15 designing, x 15 designing of...","[3 0 x 15 designing, 0 x 15 designing of, x 15...","[3 0 x 15 designing of, 0 x 15 designing of et...","(0.0, 29.0, 16.0, 10.0, 10.0, 13.0, 13.0, 3.0,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 29.0, 16.0, 10.0, 10.0, 13.0, 13.0, 3.0,...",python,data engineer,3
5,CV-JoaoCerqueira-2021-2022-withcerts,"[with, scripting, in, pig, hiveql, and, scala,...","[with, scripting, in, pig, hiveql, and, scala,...","[with scripting, scripting in, in pig, pig hiv...","[with scripting in, scripting in pig, in pig h...","[with scripting in pig, scripting in pig hiveq...","[with scripting in pig hiveql, scripting in pi...","[with scripting in pig hiveql and, scripting i...","(0.0, 25.0, 8.0, 8.0, 11.0, 16.0, 4.0, 2.0, 5....","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 25.0, 8.0, 8.0, 11.0, 16.0, 4.0, 2.0, 5....",python,data engineer,3
6,CV-JoaoCerqueira-2021-2022-withcerts,"[1, the, fundamentals, of, cloudera, big, data...","[1, the, fundamentals, of, cloudera, big, data...","[1 the, the fundamentals, fundamentals of, of ...","[1 the fundamentals, the fundamentals of, fund...","[1 the fundamentals of, the fundamentals of cl...","[1 the fundamentals of cloudera, the fundament...","[1 the fundamentals of cloudera big, the funda...","(0.0, 20.0, 10.0, 12.0, 5.0, 10.0, 5.0, 2.0, 3...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 20.0, 10.0, 12.0, 5.0, 10.0, 5.0, 2.0, 3...",python,data engineer,3
7,CV-JoaoCerqueira-2021-2022-withcerts,"[paradigms, knowledge, representation, artific...","[paradigms, knowledge, representation, artific...","[paradigms knowledge, knowledge representation...","[paradigms knowledge representation, knowledge...",[paradigms knowledge representation artificial...,[paradigms knowledge representation artificial...,[paradigms knowledge representation artificial...,"(0.0, 20.0, 2.0, 14.0, 8.0, 18.0, 4.0, 0.0, 10...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 20.0, 2.0, 14.0, 8.0, 18.0, 4.0, 0.0, 10...",python,data engineer,3


In [16]:
%%sql_display
-- Every column of the n-gram table; the sql_display magic limits the result to max_show_lines rows
SELECT *
FROM ngrams_cv

Unnamed: 0,Filename,pagei,1_grams,2_grams,3_grams,4_grams,5_grams,6_grams,1_counts,2_counts,3_counts,4_counts,5_counts,6_counts,features
0,CV-JoaoCerqueira-2021-2022-withcerts,"[data, architectures, data, engineering, data,...","[data, architectures, data, engineering, data,...","[data architectures, architectures data, data ...","[data architectures data, architectures data e...","[data architectures data engineering, architec...","[data architectures data engineering data, arc...",[data architectures data engineering data scie...,"(0.0, 12.0, 27.0, 6.0, 3.0, 9.0, 5.0, 1.0, 2.0...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 12.0, 27.0, 6.0, 3.0, 9.0, 5.0, 1.0, 2.0..."
1,CV-JoaoCerqueira-2021-2022-withcerts,"[business, or, sector, consultancy, services, ...","[business, or, sector, consultancy, services, ...","[business or, or sector, sector consultancy, c...","[business or sector, or sector consultancy, se...","[business or sector consultancy, or sector con...","[business or sector consultancy services, or s...","[business or sector consultancy services via, ...","(0.0, 19.0, 12.0, 8.0, 7.0, 5.0, 2.0, 0.0, 6.0...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 19.0, 12.0, 8.0, 7.0, 5.0, 2.0, 0.0, 6.0..."
2,CV-JoaoCerqueira-2021-2022-withcerts,"[jenkins, jupyter, python, pandas, postgresql,...","[jenkins, jupyter, python, pandas, postgresql,...","[jenkins jupyter, jupyter python, python panda...","[jenkins jupyter python, jupyter python pandas...","[jenkins jupyter python pandas, jupyter python...","[jenkins jupyter python pandas postgresql, jup...",[jenkins jupyter python pandas postgresql apac...,"(0.0, 17.0, 14.0, 10.0, 12.0, 9.0, 10.0, 3.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 17.0, 14.0, 10.0, 12.0, 9.0, 10.0, 3.0, ..."
3,CV-JoaoCerqueira-2021-2022-withcerts,"[azure, cloud, using, services, of, azure, sql...","[azure, cloud, using, services, of, azure, sql...","[azure cloud, cloud using, using services, ser...","[azure cloud using, cloud using services, usin...","[azure cloud using services, cloud using servi...","[azure cloud using services of, cloud using se...","[azure cloud using services of azure, cloud us...","(0.0, 18.0, 16.0, 21.0, 16.0, 3.0, 11.0, 4.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 18.0, 16.0, 21.0, 16.0, 3.0, 11.0, 4.0, ..."
4,CV-JoaoCerqueira-2021-2022-withcerts,"[1, design, of, processes, and, setup, of, ser...","[1, design, of, processes, and, setup, of, ser...","[1 design, design of, of processes, processes ...","[1 design of, design of processes, of processe...","[1 design of processes, design of processes an...","[1 design of processes and, design of processe...","[1 design of processes and setup, design of pr...","(0.0, 25.0, 16.0, 25.0, 25.0, 7.0, 11.0, 4.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 25.0, 16.0, 25.0, 25.0, 7.0, 11.0, 4.0, ..."
5,CV-JoaoCerqueira-2021-2022-withcerts,"[3, 0, x, 15, designing, of, etl, applications...","[3, 0, x, 15, designing, of, etl, applications...","[3 0, 0 x, x 15, 15 designing, designing of, o...","[3 0 x, 0 x 15, x 15 designing, 15 designing o...","[3 0 x 15, 0 x 15 designing, x 15 designing of...","[3 0 x 15 designing, 0 x 15 designing of, x 15...","[3 0 x 15 designing of, 0 x 15 designing of et...","(0.0, 29.0, 16.0, 10.0, 10.0, 13.0, 13.0, 3.0,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 29.0, 16.0, 10.0, 10.0, 13.0, 13.0, 3.0,..."
6,CV-JoaoCerqueira-2021-2022-withcerts,"[with, scripting, in, pig, hiveql, and, scala,...","[with, scripting, in, pig, hiveql, and, scala,...","[with scripting, scripting in, in pig, pig hiv...","[with scripting in, scripting in pig, in pig h...","[with scripting in pig, scripting in pig hiveq...","[with scripting in pig hiveql, scripting in pi...","[with scripting in pig hiveql and, scripting i...","(0.0, 25.0, 8.0, 8.0, 11.0, 16.0, 4.0, 2.0, 5....","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 25.0, 8.0, 8.0, 11.0, 16.0, 4.0, 2.0, 5...."
7,CV-JoaoCerqueira-2021-2022-withcerts,"[1, the, fundamentals, of, cloudera, big, data...","[1, the, fundamentals, of, cloudera, big, data...","[1 the, the fundamentals, fundamentals of, of ...","[1 the fundamentals, the fundamentals of, fund...","[1 the fundamentals of, the fundamentals of cl...","[1 the fundamentals of cloudera, the fundament...","[1 the fundamentals of cloudera big, the funda...","(0.0, 20.0, 10.0, 12.0, 5.0, 10.0, 5.0, 2.0, 3...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 20.0, 10.0, 12.0, 5.0, 10.0, 5.0, 2.0, 3..."
8,CV-JoaoCerqueira-2021-2022-withcerts,"[paradigms, knowledge, representation, artific...","[paradigms, knowledge, representation, artific...","[paradigms knowledge, knowledge representation...","[paradigms knowledge representation, knowledge...",[paradigms knowledge representation artificial...,[paradigms knowledge representation artificial...,[paradigms knowledge representation artificial...,"(0.0, 20.0, 2.0, 14.0, 8.0, 18.0, 4.0, 0.0, 10...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 20.0, 2.0, 14.0, 8.0, 18.0, 4.0, 0.0, 10..."
9,CV-JoaoCerqueira-2021-2022-withcerts,"[cid, 1, cid, 2, cid, 3, cid, 4, cid, 5, cid, ...","[cid, 1, cid, 2, cid, 3, cid, 4, cid, 5, cid, ...","[cid 1, 1 cid, cid 2, 2 cid, cid 3, 3 cid, cid...","[cid 1 cid, 1 cid 2, cid 2 cid, 2 cid 3, cid 3...","[cid 1 cid 2, 1 cid 2 cid, cid 2 cid 3, 2 cid ...","[cid 1 cid 2 cid, 1 cid 2 cid 3, cid 2 cid 3 c...","[cid 1 cid 2 cid 3, 1 cid 2 cid 3 cid, cid 2 c...","(279.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16.0, 0....","(0.0, 0.0, 16.0, 16.0, 31.0, 31.0, 22.0, 22.0,...","(0.0, 0.0, 16.0, 31.0, 22.0, 0.0, 0.0, 21.0, 2...","(0.0, 0.0, 0.0, 0.0, 8.0, 8.0, 7.0, 7.0, 6.0, ...","(0.0, 0.0, 0.0, 8.0, 7.0, 6.0, 6.0, 6.0, 0.0, ...","(0.0, 0.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 3.0, ...","(279.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16.0, 0...."


In [17]:
# Shut down the SparkContext (`sc`) created in the setup cell now that the
# analysis is finished.
sc.stop()