[![View on GitHub](https://img.shields.io/badge/GitHub-View_on_GitHub-blue?logo=GitHub)](https://github.com/khuyentran1401/Data-science/blob/master/nlp/diffbot_examples/data_scientist_vs_data_engineer_requirements.ipynb)

[<img src="https://deepnote.com/buttons/launch-in-deepnote.svg">](https://deepnote.com/project/Data-science-hxlyJpi-QrKFJziQgoMSmQ/%2FData-science%2Fnlp%2Fdiffbot_examples%2Fdata_scientist_vs_data_engineer_requirements.ipynb)

In [1]:
!pip install scattertext

## Process data

In [2]:
import pandas as pd

# Load the pre-processed job postings.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# this must only ever point at a pickle produced by a trusted source.
processed_path = "processed_df.pkl"
df = pd.read_pickle(processed_path)

## Terms

In [3]:
import scattertext as st

In [4]:
import re

In [5]:
# Name of the DataFrame column whose text is parsed and compared between
# job titles in the cells below.
analyze_col = "requirements"

In [6]:
# Keep only postings that have text in the analysed column, and only the
# columns used downstream. Selecting rows and columns in a single .loc call and
# taking an explicit .copy() yields an independent frame, so adding columns to
# it later does not raise SettingWithCopyWarning.
filtered_df = df.loc[
    df[analyze_col].notna(), ["title", analyze_col, "page_host"]
].copy()

In [7]:
# Parse each posting's text with scattertext's whitespace-based parser and
# store the parsed documents in a new "parse" column.
parsed_docs = filtered_df[analyze_col].apply(st.whitespace_nlp_with_sentences)
filtered_df["parse"] = parsed_docs

In [8]:
# Preview the parsed documents; pandas abbreviates the middle of a long Series.
filtered_df["parse"]

1       (2, +, years, of, relevant, experience, prefer...
2       (pursuing, a, phd, or, ms, in, cs, ,, math, ,,...
4       (phd, or, ms, in, computational, biology, ,, c...
5       (bachelor, ’, s, degree, from, an, accredited,...
6       (strong, working, knowledge, of, a, variety, o...
                              ...                        
1990    (at, least, three, years, working, as, a, data...
1992                                         (li, -, pa1)
1994    (working, for, a, truly, global, company, with...
1995    (monthly, team, outings, (, ball, games, ,, ha...
1998    (at, least, 5, years, experience, developing, ...
Name: parse, Length: 864, dtype: object

In [9]:
# Build a corpus from the parsed documents, using the job title as the
# category, and show the concrete class scattertext returns.
corpus_builder = st.CorpusFromParsedDocuments(
    filtered_df,
    category_col="title",
    parsed_col="parse",
)
type(corpus_builder.build())

scattertext.ParsedCorpus.ParsedCorpus

In [10]:
# Build the corpus in named steps: parsed documents -> unigram-only corpus ->
# compacted corpus.
parsed_corpus = st.CorpusFromParsedDocuments(
    filtered_df,
    category_col="title",
    parsed_col="parse",
).build()
unigram_corpus = parsed_corpus.get_unigram_corpus()
# NOTE(review): AssociationCompactor(2000) presumably caps the vocabulary at
# the 2000 terms most associated with a category — confirm in scattertext docs.
corpus = unigram_corpus.compact(st.AssociationCompactor(2000))

In [11]:
# Term-frequency table for the compacted corpus; its index holds the terms
# (the index is named "term" in the outputs further down).
term_freq_df = corpus.get_term_freq_df()

In [12]:
import nltk

# Download the tagger model for the nltk.pos_tag call used later in the notebook.
# NOTE(review): recent NLTK releases appear to look up the resource
# "averaged_perceptron_tagger_eng" instead — confirm against the installed version.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [13]:
# Add one scaled F-score column per job title; each pair is
# (output column name, corpus category).
score_columns = (
    ("Data Scientist Score", "data scientist"),
    ("Data Engineer Score", "data engineer"),
)
for score_column, category in score_columns:
    term_freq_df[score_column] = corpus.get_scaled_f_scores(category)

In [14]:
def _is_noun_tag(tag: str) -> bool:
    """Return True when a part-of-speech tag starts with "NN".

    With the Penn Treebank tagset that nltk.pos_tag emits by default, those
    are the noun tags.
    """
    return tag.startswith("NN")


def is_noun(word: str) -> bool:
    """Return True when NLTK tags ``word``, taken on its own, as a noun.

    The word is tagged as a one-token sentence, so the tagger has no
    surrounding context to disambiguate with.
    """
    pos = nltk.pos_tag([word])[0][1]
    return _is_noun_tag(pos)


def _noun_mask(words) -> list:
    """Return one boolean per word, equal to ``is_noun(word)``.

    nltk.pos_tag sets up its tagger on every call, so calling is_noun once per
    term repeats that work for every term. pos_tag_sents sets it up once and
    tags each one-word "sentence" independently, giving the same tags.
    """
    tagged_sents = nltk.pos_tag_sents([[word] for word in words])
    return [_is_noun_tag(tagged[0][1]) for tagged in tagged_sents]


# Keep only the terms tagged as nouns.
term_freq_df = term_freq_df.loc[_noun_mask(term_freq_df.index)]

In [15]:
# The 30 noun terms most characteristic of data scientist postings.
by_scientist_score = term_freq_df.sort_values(
    by="Data Scientist Score", ascending=False
)
by_scientist_score.head(30).index

Index(['science', 'analysis', 'machine', 'ability', 'computer', 'work', 's',
       'field', 'r', 'degree', 'statistics', 'quantitative', 'experience',
       'years', 'python', 'data', 'knowledge', 'techniques', 'business',
       'analytics', 'skills', 'bachelor', 'math', 'engineering', 'master',
       'mining', 'development', 'time', 'environment', 'languages'],
      dtype='object', name='term')

In [16]:
# The 30 noun terms most characteristic of data engineer postings.
by_engineer_score = term_freq_df.sort_values(
    by="Data Engineer Score", ascending=False
)
by_engineer_score.head(30).index

Index(['self', 'spark', 'design', 'excellent', 'g', 'implement', 'java', 'sql',
       'scala', 'technologies', 'etc', 'e', 'dimensional', 'management',
       'hadoop', 'systems', 'hands', 'sets', 'growth', 'datasets', 'operating',
       'mapreduce', 'aws', 'industry', 'verbal', 'perfect', 'enemy', 'etl',
       'ambiguity', 'awareness'],
      dtype='object', name='term')

In [17]:
# Render the interactive Scattertext explorer comparing the two job titles;
# each document's page host is attached as its metadata.
html = st.produce_scattertext_explorer(
    corpus,
    category="data scientist",
    category_name="Data scientist",
    not_category_name="Data Engineer",
    minimum_term_frequency=5,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=corpus.get_df()["page_host"],
    transform=st.Scalers.dense_rank,
)

# Write through a context manager so the file is flushed and closed as soon as
# the write finishes, and pin UTF-8 so non-ASCII characters in the postings are
# not written in a platform-dependent default encoding.
output_path = "data_science_vs_data_engineer_requirements_terms.html"
with open(output_path, "w", encoding="utf-8") as html_file:
    chars_written = html_file.write(html)

# Display the number of characters written, as the cell did before.
chars_written

1305469

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=87197226-98be-42b2-8527-389082831299' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>