# A Simple Introduction to Web Scraping with Beautiful Soup

![](https://github.com/kaopanboonyuen/GISTDA2023/raw/main/img/gistda_day1.png)


Credit: 

[1] https://realpython.com/beautiful-soup-web-scraper-python/

[2] https://www.analyticsvidhya.com/blog/2021/08/a-simple-introduction-to-web-scraping-with-beautiful-soup/

[3] https://www.scrapingbee.com/blog/python-web-scraping-beautiful-soup/

# API Scraping using Twitter

![](https://www.techbooky.com/wp-content/uploads/2021/11/twitter-api.jpeg)

# INSTALL AND IMPORT LIBS

In [None]:
#!pip install snscrape
!pip install --upgrade git+https://github.com/JustAnotherArchivist/snscrape.git
!pip install -q pythainlp
!pip install -q pythainlp

# DOWNLOAD THAI FONT FOR WORD CLOUD PLOT

In [None]:
# Fetch the TH Sarabun Chula font archive quietly (-q) and save it as font.zip (-O).
!wget -q http://www.arts.chula.ac.th/ling/wp-content/uploads/TH-Sarabun_Chula1.1.zip -O font.zip
# Extract only the regular-weight .ttf into the working directory:
# -j drops the archive's directory prefix, -q silences output.
# The resulting THSarabunChula-Regular.ttf is the font_path used by the word cloud cell.
!unzip -qj font.zip TH-Sarabun_Chula1.1/THSarabunChula-Regular.ttf

In [None]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import pythainlp
from pythainlp.tokenize import word_tokenize
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
def thai_tokenizer(sentence):
    """Segment *sentence* into tokens with PyThaiNLP's ``newmm`` engine.

    Args:
        sentence: Text to tokenise.

    Returns:
        The result of ``word_tokenize(sentence, engine='newmm')``.
    """
    tokens = word_tokenize(sentence, engine='newmm')
    return tokens

def remove_url(text):
    """Delete URL-like substrings from *text*.

    A match starts with one of the schemes listed in the pattern, a colon and
    one or more ``//`` (or backslash) separators, and then extends over every
    following character that belongs to the pattern's character class.

    Args:
        text: Input string.

    Returns:
        *text* with every match replaced by the empty string.
    """
    url_regex = re.compile(
        "((https?|ftp|gopher|telnet|file|Unsure|http):((//)|(\\\\))+[\\w\\d:#@%/;$()~_?\\+-=\\\\\\.&]*)"
    )
    return url_regex.sub('', text)

def remove_rt(text):
    """Drop a leading ``rt @user: `` retweet marker and trim whitespace.

    The pattern is lower-case only, so callers are expected to lower-case the
    text first (``clean_text`` does).

    Args:
        text: Input string.

    Returns:
        *text* without the leading marker, stripped of surrounding whitespace.
    """
    without_prefix = re.sub('^rt @[\\w]*: ', '', text)
    return without_prefix.strip()

def remove_at(text):
    """Remove every ``@`` sign together with the word characters that follow it.

    Args:
        text: Input string.

    Returns:
        *text* without ``@mention`` fragments, stripped of surrounding whitespace.
    """
    mention_free = re.sub('@[\\w]*', '', text)
    return mention_free.strip()

def clean_text(text):
    """Normalise a raw tweet before tokenisation.

    Steps, in order: lower-case; turn newlines/tabs into spaces; remove URLs,
    a leading ``rt @user: `` marker and ``@mentions``; replace a fixed set of
    punctuation characters with spaces; collapse runs of spaces and runs of
    dots; trim surrounding whitespace.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, lower-cased string.
    """
    # Newlines/tabs become spaces (not empty strings) so that words on
    # adjacent lines are not glued together and a URL at the end of a line
    # cannot swallow the text that starts the next line. The strip keeps a
    # leading line break from hiding the "rt @user: " prefix from remove_rt.
    text = text.lower().replace('\n', ' ').replace('\t', ' ').strip()
    text = remove_url(text)
    text = remove_rt(text)
    text = remove_at(text)

    # Every character listed here is replaced by a single space.
    punctuation_to_space = str.maketrans({ch: ' ' for ch in ':,!#()"\'?”’'})
    text = text.translate(punctuation_to_space)

    text = re.sub(' +', ' ', text)
    # Collapse "..." into one "."; the replacement is a plain '.' because a
    # '\.' replacement template would insert a literal backslash.
    text = re.sub(r'\.+', '.', text)

    return text.strip()

# TWITTER SCRAPING USING SNTWITTER WITH YOUR KEYWORD

In [None]:
# Set up the search query
search_term = "ลุงตู่"
since_date = "2022-01-01"
until_date = "2023-01-31"
#geocode = "13.736717,100.523186, 50km" # search within 50 km of bangkok

# Upper bound on the number of tweets to collect
maxTweets = 500

# Creating list to append tweet data to
tweets_list = []

# create the search query
query = f"{search_term} since:{since_date} until:{until_date}"

# Using TwitterSearchScraper to scrape data and append tweets to list
for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
    # i is 0-based: stop as soon as maxTweets rows have been stored
    # (the previous `>` comparison collected maxTweets + 1 tweets).
    if i >= maxTweets:
        break
    tweets_list.append([tweet.date, tweet.id, tweet.content, tweet.username])

In [None]:
# Turn the scraped rows into a table; the column order mirrors the
# [date, id, content, username] lists appended during scraping.
df = pd.DataFrame(
    data=tweets_list,
    columns=['datetime', 'tweet id', 'text', 'username'],
)

In [None]:
# Peek at the first five scraped tweets.
df.head(n=5)

In [None]:
# Store a normalised copy of every tweet next to its raw text, then display the table.
df['clean_text'] = df['text'].map(clean_text)
df

In [None]:
# Compare raw and cleaned text side by side for the first three tweets.
df.loc[:, ['text', 'clean_text']].head(n=3)

In [None]:
# Tokenise each cleaned tweet through the notebook's thai_tokenizer helper so
# the segmentation engine ('newmm') is stated explicitly in one place instead
# of depending on word_tokenize's implicit default.
df['tokens'] = df['clean_text'].apply(thai_tokenizer)

# One row per token, then count how often each token occurs.
tokens = df.explode('tokens')
word_count = tokens['tokens'].value_counts()

In [None]:
# Display token frequencies ordered from most to least frequent.
word_count.sort_values(ascending=False)

# FLATTEN LIST

In [None]:
# Concatenate the per-tweet token lists into one flat list of tokens.
flat_list = []
for tweet_tokens in df['tokens'].tolist():
    flat_list.extend(tweet_tokens)

In [None]:
# Keep only the tokens that contain no space character.
word = [token for token in flat_list if ' ' not in token]

# WORD CLOUD PLOT

In [None]:
# Wrap the filtered tokens in a single-column DataFrame
data = {'text': word}
df_wc = pd.DataFrame(data)

# Occurrence count per token, via value_counts()
word_freq = df_wc['text'].value_counts()

# Build the word cloud from those frequencies, rendering glyphs with the
# Thai font file extracted earlier in the notebook
wordcloud = WordCloud(
    font_path='THSarabunChula-Regular.ttf',
    width=800,
    height=800,
    background_color='white',
    colormap='inferno',
).generate_from_frequencies(word_freq)

# Show the generated image on a square figure with the axes hidden
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()