# ML final project
This notebook contains my final project for the machine learning course.

In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import json
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path
from glob import glob
from tqdm import tqdm, trange
from typing import List, Dict, Any, Optional
from traceback import print_exception

## Data collecting
First, we need to collect as much data as we can.

`stweet` is the scraper I use to collect tweets from Twitter.

In [2]:
import stweet as st
import timeout_decorator
import random

Route the requests through a local proxy to get through the GFW :(

In [3]:
# Create the web client shared by every scraping cell below.
# All traffic (HTTP and HTTPS) is routed through the same local proxy.
proxy_url = "http://localhost:8889"
proxy_config = st.http_request.RequestsWebClientProxyConfig(
    http_proxy=proxy_url,
    https_proxy=proxy_url,
)
web_client = st.DefaultTwitterWebClientProvider.get_web_client()
# NOTE(review): the proxy is attached by assigning the `proxy` attribute after
# construction -- confirm that the stweet web client actually honors it.
web_client.proxy = proxy_config

# Per-tweet scrape timeout in seconds; passed as `seconds=` to the
# `timeout_decorator.timeout` decorator on `scrape_tweet_id`.
WAIT_TIMEOUT = 3.8

The following code block loads tweet IDs from `avax-tweet-ids/` and downloads the valid content into `avax-tweets/`. For each ID file, up to 1000 randomly chosen IDs are scraped and only English tweets are kept. All of these IDs come from the [avax-tweets-dataset](https://github.com/gmuric/avax-tweets-dataset).

In [None]:
# Create the output directory for the scraped anti-vaccine tweets
# (no error if it already exists).
Path("avax-tweets/").mkdir(exist_ok=True)
# Collect every tweet-ID file (one sub-directory level deep) and shuffle the
# list in place, so the files are visited in a random order on each run.
glob_list = glob("avax-tweet-ids/*/*.txt")
random.shuffle(glob_list)

# collect anti-vaccine textual data
def load_tweets_id(filepath: Path, limit: int = 1000) -> List[str]:
    """Read tweet IDs from a text file and return a random subset of them.

    The file is expected to hold one tweet ID per line. Surrounding whitespace
    is stripped, and blank lines are dropped so that no empty ID is ever handed
    to the scraper.

    Args:
        filepath: Path of the UTF-8 text file containing the tweet IDs.
        limit: Maximum number of IDs to return (defaults to 1000). Negative
            values are treated as 0.

    Returns:
        At most ``limit`` tweet IDs, in random order.
    """
    with filepath.open("r", encoding="utf8") as fp:
        # Strip each line and keep only the non-empty results.
        tweet_ids = [stripped for stripped in (line.strip() for line in fp) if stripped]
    random.shuffle(tweet_ids)
    # Slicing already copes with lists shorter than `limit`.
    return tweet_ids[:max(limit, 0)]

# scraper
@timeout_decorator.timeout(seconds=WAIT_TIMEOUT, use_signals=False)
def scrape_tweet_id(tweet_id: str):
    """Fetch the raw stweet output for a single tweet ID.

    The call is wrapped by ``timeout_decorator.timeout`` with
    ``seconds=WAIT_TIMEOUT`` and ``use_signals=False``.

    Args:
        tweet_id: ID of the tweet to look up.

    Returns:
        The ``st.CollectorRawOutput`` instance that was handed to the runner.
    """
    collector = st.CollectorRawOutput()
    runner = st.TweetsByIdRunner(
        st.TweetsByIdTask(tweet_id),
        raw_data_outputs=[collector],
        web_client=web_client,
    )
    runner.run()
    return collector

# Scrape every ID file into `avax-tweets/<stem>.txt`, one "<tweet_id> <text>"
# record per line.
for idx, path_str in enumerate(glob_list):
    print(f"Processing({idx + 1}/{len(glob_list)}): {path_str}")
    filepath = Path(path_str)
    output_file_path = Path("avax-tweets").joinpath(f"{filepath.stem}.txt")
    if output_file_path.exists():
        # Skip files that were already processed in a previous run.
        continue
    # Read (and sample) the ID file only once we know it still needs processing.
    tweet_ids = load_tweets_id(filepath)
    is_written = False
    try:
        # The context managers close the output file and the progress bar even
        # when the loop is aborted by sys.exit() or a keyboard interrupt.
        with output_file_path.open("w", encoding="utf8") as fp, trange(len(tweet_ids)) as qbar:
            for i in qbar:
                tweet_id = tweet_ids[i]
                qbar.set_description(f"Tweet ID {tweet_id}")
                try:
                    output_collect = scrape_tweet_id(tweet_id)
                except (KeyError, timeout_decorator.TimeoutError):
                    # Best effort: skip IDs whose scrape raised KeyError or timed out.
                    continue
                except Exception as e:
                    # The three-argument form works on every Python 3 version;
                    # the single-argument form is only accepted from 3.10 on.
                    print_exception(type(e), e, e.__traceback__)
                    sys.exit(-1)
                for raw_data in output_collect.get_raw_list():
                    json_data = json.loads(raw_data.to_json_line())
                    legacy_data = json_data["raw_value"]["legacy"]
                    if legacy_data["id_str"] == tweet_id and legacy_data["lang"] == "en":
                        # Prefer the text of the retweeted status when one is
                        # present; a missing or null node falls back to the
                        # tweet's own text.
                        try:
                            retweeted_data = legacy_data["retweeted_status_result"]["result"]["legacy"]
                        except (KeyError, TypeError):
                            retweeted_data = None
                        text_source = retweeted_data if retweeted_data is not None else legacy_data
                        # Flatten the text onto one line. "\r" is replaced too,
                        # because text-mode reads treat it as a line break.
                        out_str = (
                            text_source["full_text"]
                            .replace("\n", " ")
                            .replace("\r", " ")
                            .replace("\t", " ")
                        )
                        fp.write(f"{tweet_id} {out_str}\n")
                        fp.flush()
                        is_written = True
                        # Only one record per tweet ID is stored.
                        break
    finally:
        # Remove an output file that received no record (also after an abort),
        # so that the input is retried on the next run instead of being skipped.
        # NOTE: a file that was interrupted after at least one write is kept
        # and will be skipped on the next run.
        if not is_written:
            output_file_path.unlink(missing_ok=True)

Processing(1/9428): avax-tweet-ids/2020-11/2020-11-16-00.txt


Tweet ID 1328249501132468225:   2%|▏         | 5/237 [00:16<13:16,  3.43s/it]

After collecting the anti-vaccine data, we collect the normal data from the [COVID19_Tweets_Dataset](https://github.com/lopezbec/COVID19_Tweets_Dataset).
The code is almost the same as the block above, except that only tweets whose sentiment label is not `negative` are sampled.

In [None]:
# Create the output directory for the scraped normal COVID-19 tweets
# (no error if it already exists).
Path("covid19-tweets/").mkdir(exist_ok=True)
# Collect every sentiment CSV file (one sub-directory level deep) and shuffle
# the list in place, so the files are visited in a random order on each run.
# NOTE: this rebinds the `glob_list` name used by the anti-vaccine cell.
glob_list = glob("covid19-tweet-ids/*/*.csv")
random.shuffle(glob_list)

# collect normal covid19 textual data
def load_non_negative_tweets_id(filepath: Path, limit: int = 1000) -> List[str]:
    """Sample tweet IDs whose sentiment label is not ``"negative"``.

    The CSV must provide the columns ``Tweet_ID`` and ``Sentiment_Label``.
    ``Tweet_ID`` is read as text: if pandas inferred a numeric type, a single
    missing value would turn the column into floats and corrupt the large IDs.

    Args:
        filepath: Path of the CSV file to read.
        limit: Maximum number of IDs to return (defaults to 1000). Negative
            values are treated as 0.

    Returns:
        At most ``limit`` randomly sampled tweet IDs as strings.
    """
    df = pd.read_csv(str(filepath), dtype={"Tweet_ID": str})
    not_negative = df["Sentiment_Label"] != "negative"
    # Drop rows without an ID instead of emitting the string "nan".
    id_series = df.loc[not_negative, "Tweet_ID"].dropna().str.strip()
    id_series = id_series[id_series != ""]
    sample_size = min(max(limit, 0), id_series.shape[0])
    return id_series.sample(n=sample_size).to_list()

# scraper
# NOTE(review): same body as the `scrape_tweet_id` defined in the anti-vaccine
# cell; presumably repeated so that this cell can be run on its own.
@timeout_decorator.timeout(seconds=WAIT_TIMEOUT, use_signals=False)
def scrape_tweet_id(tweet_id: str):
    """Fetch the raw stweet output for a single tweet ID.

    The call is wrapped by ``timeout_decorator.timeout`` with
    ``seconds=WAIT_TIMEOUT`` and ``use_signals=False``.

    Args:
        tweet_id: ID of the tweet to look up.

    Returns:
        The ``st.CollectorRawOutput`` instance that was handed to the runner.
    """
    collector = st.CollectorRawOutput()
    runner = st.TweetsByIdRunner(
        st.TweetsByIdTask(tweet_id),
        raw_data_outputs=[collector],
        web_client=web_client,
    )
    runner.run()
    return collector

# Scrape every sentiment CSV into `covid19-tweets/<stem>.txt`, one
# "<tweet_id> <text>" record per line.
for idx, path_str in enumerate(glob_list):
    print(f"Processing({idx + 1}/{len(glob_list)}): {path_str}")
    filepath = Path(path_str)
    output_file_path = Path("covid19-tweets").joinpath(f"{filepath.stem}.txt")
    if output_file_path.exists():
        # Skip files that were already processed in a previous run.
        continue
    # Parse (and sample) the CSV only once we know it still needs processing.
    tweet_ids = load_non_negative_tweets_id(filepath)
    is_written = False
    try:
        # The context managers close the output file and the progress bar even
        # when the loop is aborted by sys.exit() or a keyboard interrupt.
        with output_file_path.open("w", encoding="utf8") as fp, trange(len(tweet_ids)) as qbar:
            for i in qbar:
                tweet_id = tweet_ids[i]
                qbar.set_description(f"Tweet ID {tweet_id}")
                try:
                    output_collect = scrape_tweet_id(tweet_id)
                except (KeyError, timeout_decorator.TimeoutError):
                    # Best effort: skip IDs whose scrape raised KeyError or timed out.
                    continue
                except Exception as e:
                    # The three-argument form works on every Python 3 version;
                    # the single-argument form is only accepted from 3.10 on.
                    print_exception(type(e), e, e.__traceback__)
                    sys.exit(-1)
                for raw_data in output_collect.get_raw_list():
                    json_data = json.loads(raw_data.to_json_line())
                    legacy_data = json_data["raw_value"]["legacy"]
                    if legacy_data["id_str"] == tweet_id and legacy_data["lang"] == "en":
                        # Prefer the text of the retweeted status when one is
                        # present; a missing or null node falls back to the
                        # tweet's own text.
                        try:
                            retweeted_data = legacy_data["retweeted_status_result"]["result"]["legacy"]
                        except (KeyError, TypeError):
                            retweeted_data = None
                        text_source = retweeted_data if retweeted_data is not None else legacy_data
                        # Flatten the text onto one line. "\r" is replaced too,
                        # because text-mode reads treat it as a line break.
                        out_str = (
                            text_source["full_text"]
                            .replace("\n", " ")
                            .replace("\r", " ")
                            .replace("\t", " ")
                        )
                        fp.write(f"{tweet_id} {out_str}\n")
                        fp.flush()
                        is_written = True
                        # Only one record per tweet ID is stored.
                        break
    finally:
        # Remove an output file that received no record (also after an abort),
        # so that the input is retried on the next run instead of being skipped.
        # NOTE: a file that was interrupted after at least one write is kept
        # and will be skipped on the next run.
        if not is_written:
            output_file_path.unlink(missing_ok=True)

Processing(1/7838): covid19-tweet-ids/2021_03/2021_03_06_19_Summary_Sentiment.csv


Tweet ID 1368282855365836803:  11%|█         | 111/1000 [05:40<55:41,  3.76s/it]

## Data clean
After collecting enough data, we have to clean it.

First, clean the anti-vaccine tweet data.

In [4]:
import text_clean

# Directory that receives the cleaned anti-vaccine tweet files.
clean_dir = Path("avax-tweets-clean/")
clean_dir.mkdir(exist_ok=True)

# Clean each scraped file into a file of the same name inside `clean_dir`.
raw_files = [Path(path_str) for path_str in glob("avax-tweets/*.txt")]
for old_path in tqdm(raw_files):
    new_path = clean_dir / old_path.name
    text_clean.preprocess_tweet_file(old_path, new_path)

100%|██████████| 41/41 [00:00<00:00, 50.36it/s]


Then, clean the normal COVID-19 tweet data.


In [5]:
import text_clean

# Directory that receives the cleaned normal COVID-19 tweet files.
clean_dir = Path("covid19-tweets-clean/")
clean_dir.mkdir(exist_ok=True)

# Clean each scraped file into a file of the same name inside `clean_dir`.
raw_files = [Path(path_str) for path_str in glob("covid19-tweets/*.txt")]
for old_path in tqdm(raw_files):
    new_path = clean_dir / old_path.name
    text_clean.preprocess_tweet_file(old_path, new_path)

100%|██████████| 13/13 [00:01<00:00, 12.28it/s]
