In [1]:
import pandas as pd

import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

from pprint import pprint
import re
from tqdm import tqdm
# Hook tqdm into pandas so progress-bar variants of apply are available.
tqdm.pandas()
import numpy as np

from pandarallel import pandarallel
# Start the pandarallel workers (with a progress bar); `parallel_apply` is
# used further down in this notebook.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
#!pwd
import os

# Remote storage (Google Cloud Storage bucket).
# Input checkpoint: checkpoint_0512_sent_split.parquet
path_bucket = 'gs://msca-sp23-bucket/nlp_data'
path_bucket_df = '/'.join([path_bucket, 'checkpoint_0512_sent_split.parquet'])

# Local working directory; relative paths used later resolve against it.
runtime_path = '/home/anthony/projects/nlp_runtime'
os.chdir(runtime_path)
print(os.getcwd())

/home/anthony/projects/nlp_runtime


In [3]:
import warnings

import pandas as pd

# Notebook display limits: up to 100 rows, every column, 500 characters per cell.
for _option, _value in (
    ('display.max_rows', 100),
    ('display.max_columns', None),
    ('display.max_colwidth', 500),
):
    pd.set_option(_option, _value)

warnings.filterwarnings("ignore")

In [4]:
#df.to_parquet('checkpoint_0512_sent_split.parquet', index=False)
# Read the checkpoint file itself (`path_bucket_df`), not the whole `nlp_data`
# prefix: pointing read_parquet at the directory pulls in every parquet file
# stored there, which breaks as soon as another file with a different schema
# (e.g. the cleaned frame saved later in this notebook) lands in the bucket.
df = pd.read_parquet(path_bucket_df, engine='pyarrow')
df.head(1)

Unnamed: 0,url,date,language,title,text,text_split_filtered
0,http://en.people.cn/n3/2021/0318/c90000-9830122.html,2021-03-18,en,Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online,"\n\nArtificial intelligence improves parking efficiency in Chinese cities - People's Daily Online\n\nHome\nChina Politics\nForeign Affairs\nOpinions\nVideo: We Are China\nBusiness\nMilitary\nWorld\nSociety\nCulture\nTravel\nScience\nSports\nPhoto\n\nLanguages\n\nChinese\nJapanese\nFrench\nSpanish\nRussian\nArabic\nKorean\nGerman\nPortuguese\nThursday, March 18, 2021\nHome>>\n\t\t\nArtificial intelligence improves parking efficiency in Chinese cities\nBy Liu Shiyao (People's Daily) 09:16, Mar...","[Chinese Japanese French Spanish Russian Arabic Korean German Portuguese Thursday, March 18, 2021 Home Artificial intelligence improves parking efficiency in Chinese cities By Liu Shiyao People's Daily 09:16, March 18, 2021 Photo taken on July 1, 2019, shows a sign for electronic toll collection ETC newly set up at a roadside parking space on Yangzhuang road, Shijingshan district, Beijing. Some urban areas of the city started to use ETC system for roadside parking spaces since July 1..."


In [5]:
# (rows, columns) of the frame loaded above
df.shape

(385796, 6)

## Primary Filtering: Select Relevant Titles & Text

In [None]:
# use all titles to make a word cloud
# Take an explicit copy so the title clean-up further down edits this frame
# only, and pandas does not treat the assignment as a write to a slice of `df`.
df_title = df[['title']].copy()
df_title.head()

### WordCloud on topics

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
#% matplotlib inline

In [None]:
# Clean the titles in two passes:
#   1. delete punctuation (so "don't" becomes "dont"),
#   2. turn every remaining non-letter into a space.
# `regex=True` is passed explicitly because pandas >= 2.0 treats the pattern
# as a literal string by default, which would make both calls no-ops.
# Raw strings avoid the invalid "\w" escape sequence.
cleaned_titles = df_title['title'].str.replace(r'[^\w\s]', '', regex=True)
cleaned_titles = cleaned_titles.str.replace(r'[^a-zA-Z]', ' ', regex=True)

# assign() returns a new frame instead of writing into a slice of `df`
df_title = df_title.assign(title=cleaned_titles)

In [None]:
# Build the corpus from the titles held in `df_title` (the frame the title
# clean-up operates on); joining `df.title` would bypass that clean-up.
# Missing titles are dropped because str.join only accepts strings.
text = " ".join(df_title['title'].dropna().astype(str))
# len(text) is a character count, not a word count
print("There are {} characters in the combination of all titles.".format(len(text)))

In [None]:
# ref: https://www.datacamp.com/tutorial/wordcloud-python

# Stopwords: wordcloud's built-in set plus a few extra terms.
# NOTE(review): the extra terms look like leftovers from the referenced
# wine-review tutorial - confirm they are wanted for news titles.
stopwords = set(STOPWORDS) | {"drink", "now", "wine", "flavor", "flavors"}

# Render the cloud from the combined title text.
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="white",
    width=800,
    height=400,
).generate(text)

# Show the image on an explicit axes object (dpi=100), without axis ticks.
fig, ax = plt.subplots(dpi=100)
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# WordCloud instance used only for its tokenizer / counter
wc = WordCloud(stopwords=stopwords, background_color="white")

# Mapping of word (or phrase) -> frequency for the combined text
word_frequencies = wc.process_text(text)

# Same data as (word, frequency) pairs
word_list = [(word, count) for word, count in word_frequencies.items()]

# Most frequent first
sorted_word_list = sorted(word_list, key=lambda pair: pair[1], reverse=True)

In [None]:
# sanity check: number of entries and the 10 most frequent (word, freq) pairs
print(len(sorted_word_list))
sorted_word_list[:10]

In [None]:
# visualize the top 100 words and their frequencies with a horizontal bar plot
import seaborn as sns

# two-column frame: the word and how often it occurs
top_words = sorted_word_list[:100]
df_word = pd.DataFrame(top_words, columns=['word', 'freq'])
df_word.head()

In [None]:
# horizontal bar chart: one bar per word, bar length = frequency
fig, ax = plt.subplots(figsize=(10, 20))
sns.barplot(data=df_word, x='freq', y='word', ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
#df_word[df_word['word'] == 'Conversational AI']

### Filtering articles using title and text

In [None]:
# the 15 most frequent word-cloud words, lower-cased
wc_keywords = df_word['word'].head(15).str.lower().tolist()

# hand-picked data/AI topics (written in display case, lower-cased just below)
_user_keywords_display = [
    'Big Data', 'Data Mining', 'Data Analytics', 'Data Visualization', 'Data Cleaning', 'Data Wrangling',
    'Data Science', 'Data Engineering', 'Data Governance', 'Data Security', 'Data Privacy', 'Data Ethics',
    'Data Strategy', 'Data Operations', 'Data Warehousing', 'Business Intelligence', 'Business Analytics',
    'Predictive Analytics', 'Prescriptive Analytics', 'Descriptive Analytics', 'Statistical Modeling',
    'Machine Learning', 'Deep Learning', 'Neural Networks', 'Convolutional Neural Networks',
    'Recurrent Neural Networks', 'Generative Adversarial Networks', 'Natural Language Processing',
    'Computer Vision', 'Image Processing', 'Speech Recognition', 'Chatbots', 'Conversational AI',
    'Autonomous Driving', 'Autonomous Car', 'Robotics', 'Reinforcement Learning', 'Transfer Learning',
    'Model Deployment', 'Model Monitoring', 'Model Interpretability', 'A/B Testing', 'Experimentation',
    'Bias and Fairness in AI', 'Explainable AI', 'Human-in-the-Loop AI', 'MLOps', 'CI/CD',
    'Cloud Computing', 'Edge Computing', 'IoT', 'Blockchain', 'Privacy-Preserving Machine Learning',
    'Federated Learning', 'Differential Privacy', 'Secure Multi-Party Computation', 'Homomorphic Encryption',
    'Data Governance Framework', 'Data Cataloging', 'Data Lineage', 'Data Profiling', 'Data Virtualization',
    'Data Integration', 'Data Federation', 'Data Architecture', 'Data Modeling', 'Data Encryption',
    'Data Masking', 'Data Compliance', 'Data Stewardship', 'Data Science Workflow', 'Data Pipelines',
    'Text Analytics', 'Text Mining', 'Sentiment Analysis', 'Speech-to-Text', 'Text-to-Speech', 'OCR',
    'Object Detection', 'Semantic Segmentation', 'Predictive Maintenance', 'Recommendation Systems',
]
user_keywords = [keyword.lower() for keyword in _user_keywords_display]

# word-cloud keywords first, then the hand-picked ones (all lower case)
filter_keywords = [*wc_keywords, *user_keywords]

In [None]:
# persist the combined keyword list, one keyword per line
with open('word_list.txt', 'w') as keyword_file:
    keyword_file.writelines(word + '\n' for word in filter_keywords)

In [None]:
# Glue the filtered sentence list of every article back into one string and
# force the result to str in the same step.
df['text_rejoined'] = df['text_split_filtered'].str.join(' ').astype(str)

# Titles are forced to str as well.
df['title'] = df['title'].astype(str)

In [None]:
# save a checkpoint
#df.to_parquet('checkpoint_0514_sent_split.parquet', index=False)
#df = pd.read_parquet('checkpoint_0514_sent_split.parquet', engine='pyarrow')

In [None]:
# Keep only the rows whose title or rejoined text contains at least one
# keyword (case-insensitive). Vectorised variant, kept for reference only:
#   #df_test = df.sample(1000)
#   df_selected = df[df['title'].str.contains('|'.join(filter_keywords), case=False) |
#                    df['text_rejoined'].str.contains('|'.join(filter_keywords), case=False)]

# The pandarallel version below is used instead for faster processing
# Define the filtering function
def filter_rows(row, filter_keywords):
    """Return True when any keyword occurs in the row's title or rejoined text.

    The comparison is a case-insensitive substring test.

    Parameters
    ----------
    row : mapping-like
        Must provide string values under 'title' and 'text_rejoined'.
    filter_keywords : iterable of str
        Keywords to look for; they are lower-cased before comparison.

    Returns
    -------
    bool
        True on the first keyword found in either field, otherwise False.
    """
    # Lower-case the two fields once per row rather than once per keyword;
    # the article body can be long and the keyword list has dozens of entries.
    title = row['title'].lower()
    body = row['text_rejoined'].lower()

    # NOTE(review): substring matching means short keywords such as 'iot' or
    # 'ocr' also hit inside longer words ("patriot", "mediocre"). Switch to
    # word-boundary matching if precision matters.
    for keyword in filter_keywords:
        needle = keyword.lower()
        if needle in title or needle in body:
            return True
    return False

# Evaluate the row filter on the pandarallel workers, then keep matching rows
#df_test = df.sample(30)
keyword_mask = df.parallel_apply(filter_rows, args=(filter_keywords,), axis=1)
df_selected = df[keyword_mask]

In [None]:
# compare the article counts before and after the keyword filter
print('initial no.of articles', len(df))
print('filtered no.of articles', len(df_selected))

# show the first 2 rows
df_selected.head(2)

In [None]:
# check before drop
#df_selected.language.value_counts()

In [None]:
# remember the character count of each raw article so it can be compared
# with the cleaned text later
raw_text_length = df_selected['text'].map(len)

In [None]:
# Remove the columns that are no longer needed and give the processed text
# columns their final names, in one chained expression.
df_selected = (
    df_selected
    .drop(columns=['language', 'text'])
    .rename(columns={'text_rejoined': 'text', 'text_split_filtered': 'text_split'})
)

print(df_selected.shape)

In [None]:
# preview one row after the drop/rename step
df_selected.head(1)

In [None]:
# save a cleaned copy to bucket
#path_bucket_save = path_bucket + '/' + 'df_cleaned_0514.parquet'
#df_selected.to_parquet(path_bucket_save, index=False)

## More Cleaning

In [None]:
# Google Bucket
# cleaned frame: df_cleaned_0514.parquet
path_bucket_df_cleaned = '/'.join([path_bucket, 'df_cleaned_0514.parquet'])

# local working directory
runtime_path = '/home/anthony/projects/nlp_runtime'
os.chdir(runtime_path)
print(os.getcwd())

In [None]:
# Load the cleaned frame from the bucket. This rebinds `df`, replacing the
# frame that was loaded from the sentence-split checkpoint earlier.
df = pd.read_parquet(path_bucket_df_cleaned, engine='pyarrow')
df.head(3)

In [None]:
# column dtypes and non-null counts of the reloaded frame
df.info()

In [None]:
# Fixed seed so the rows discussed in the note below can be reproduced on a
# fresh kernel.
df.sample(5, random_state=42)[['url', 'title', 'text']]

The sample shows that the text needs further cleaning:
- Long, meaningless tokens (apparently several words run together without spaces), e.g. `ABuybacksLegalInterviewsManagementOfferingsIPOsInsider TradesBiotech/FDAFreightPoliticsGovernmentHealthcareMarkets`

In [None]:
import re

# Define the cleaning function
def long_word_cleaner(text, min_length=13, drop_upper_runs=True):
    """Delete over-long words and, optionally, words with an uppercase run.

    Matched words are replaced by the empty string; surrounding whitespace is
    left untouched, so removed words can leave double spaces behind.

    Parameters
    ----------
    text : str
        Text to clean.
    min_length : int, default 13
        Words with at least this many word characters are removed.
    drop_upper_runs : bool, default True
        When True, also remove every word containing two or more consecutive
        uppercase ASCII letters.
        NOTE(review): this also deletes ordinary acronyms such as "AI" or
        "IPOs" - pass False to keep them.

    Returns
    -------
    str
        The text with all matching words removed.
    """
    alternatives = [r'\b\w{%d,}\b' % min_length]
    if drop_upper_runs:
        alternatives.append(r'\b\w*[A-Z]{2,}\w*\b')
    # With the defaults this is the pattern r'\b\w{13,}\b|\b\w*[A-Z]{2,}\w*\b'.
    return re.sub('|'.join(alternatives), '', text)

# Clean every article text on the pandarallel workers
df['text_cleaned'] = df['text'].parallel_apply(long_word_cleaner)

In [None]:
# load previous text length
# `raw_text_length` still carries the row labels of the frame it was computed
# from, whereas `df` was re-read from parquet. Assigning the Series directly
# would align on those labels and silently misplace values or produce NaN,
# so the values are attached by position instead.
# Assumes the parquet file holds the same rows in the same order as the frame
# `raw_text_length` came from - TODO confirm.
if len(raw_text_length) != len(df):
    raise ValueError(
        "raw_text_length has {} entries but df has {} rows".format(len(raw_text_length), len(df))
    )
df['raw_text_length'] = raw_text_length.to_numpy()

# new text length (characters) after cleaning
df['text_length'] = df['text_cleaned'].map(len)

# Despite the column name, this is the percentage of the raw length that is
# left after cleaning (100 means nothing was removed).
df['reduce_ratio'] = (df['text_length'] / df['raw_text_length']) * 100

In [None]:
# summary statistics of the raw length, cleaned length and their ratio
df[['raw_text_length', 'text_length', 'reduce_ratio']].describe()