# Twitter API를 통해 Crypto tweet 수집

- API 튜토리얼 해보기
- BTC, ETH만 수집

특이사항: 
- 일론머스크 인수 후 (X로 개명) free tier에서 retrieve 불가. 
    - 할 수 없이 `twikit` 이라는 scraping 기반 라이브러리 사용
    - 주의 사항: 익명으로 만든 temp 계정만 사용할 것. 

In [1]:
import os, sys, warnings
from pathlib import Path

In [2]:
# warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
from tqdm import tqdm

In [4]:
import asyncio
import nest_asyncio
from twikit import Client

In [5]:
# Patch asyncio so an event loop can be re-entered; needed because asyncio.run()
# is called later from inside the notebook.
# NOTE(review): assumes the kernel already runs its own event loop — confirm.
nest_asyncio.apply()

In [6]:
# Load the dotenv IPython extension and run it; presumably this reads a local
# .env file into the environment so os.getenv() can see the credentials — confirm.
%load_ext dotenv
%dotenv

## `twikit` tutorial 

Twitter API 쓰지 않고 scraping 방식으로 tweet 불러옴. 

구글에서 advanced search 로 날짜 넣어 검색하듯, 트위터 검색창에 날짜 넣어 검색하는 식으로 트윗을 모아볼 수 있음. 

한 번 로드할 때 최대 20개만 되기 때문에 최대한 잘게 쪼개 1일씩 가져와 데이터를 생성해야 함. 

일단 테스트 용으로 2024년 5월 1일 ~ 7월 17일까지의 데이터만 생성해 보겠음. 

### 기본 사용법

In [7]:
# Shared twikit client used by main() and the search helpers in this notebook.
# 'en-US' is the first constructor argument (presumably the language/locale — confirm).
client = Client('en-US')

In [8]:
async def main():
    """Log the shared ``client`` in with credentials taken from the environment.

    Reads ``TWITTER_USERNAME``, ``TWITTER_EMAIL`` and ``TWITTER_PASSWORD`` via
    ``os.getenv``; a variable that is not set is forwarded as ``None``.
    """
    credentials = {
        'auth_info_1': os.getenv('TWITTER_USERNAME'),
        'auth_info_2': os.getenv('TWITTER_EMAIL'),
        'password': os.getenv('TWITTER_PASSWORD'),
    }
    await client.login(**credentials)

In [9]:
# Run the login coroutine to completion before issuing any search requests.
asyncio.run(main())

In [10]:
# Example query: '$ETH' restricted to a single day with the since:/until:
# search operators, 'Latest' ordering, count=20 requested.
tweets = await client.search_tweet('$ETH since:2023-05-05 until:2023-05-06', 'Latest', count=20)

In [11]:
tweets

[<Tweet id="1654636966367961088">, <Tweet id="1654636966338494467">, <Tweet id="1654636933266432003">, <Tweet id="1654636867051028480">, <Tweet id="1654636838307364864">, <Tweet id="1654636741616099328">, <Tweet id="1654636719998910465">, <Tweet id="1654636693033570305">, <Tweet id="1654636672112271360">, <Tweet id="1654636597113942016">]

In [12]:
len(tweets)

10

In [13]:
t = tweets[0]

In [14]:
t.created_at

'Fri May 05 23:59:29 +0000 2023'

In [18]:
t.created_at_datetime

datetime.datetime(2023, 5, 5, 23, 59, 29, tzinfo=datetime.timezone.utc)

In [15]:
t.text

'🟩 $ETH block trade alert [Deribit] 🧱🧱 \n\nMAY(26) $2,000 Calls bought 250 @ 0.0466 ETH($93.01); Total=11.6500 ETH($23,251.30); IV=50.13%; OI=44331; Index=$1,995.82'

In [19]:
t.full_text

'🟩 $ETH block trade alert [Deribit] 🧱🧱 \n\nMAY(26) $2,000 Calls bought 250 @ 0.0466 ETH($93.01); Total=11.6500 ETH($23,251.30); IV=50.13%; OI=44331; Index=$1,995.82'

### 날짜 범위를 주면 그만큼 데이터를 가져오는 함수 추가. 

혹시 모를 ban을 피하기 위해 한 번 가져올 때마다 3초씩 sleep을 주겠음. 

주의: 쓰면 안되는 것들
- view count
- favorite count
- retweets

등등

PIT(Point-In-Time)하지 않음. 

Data 단에서 벌써 forward-looking 해버릴 수 있음. 

사후 검증 방법: live에서 batch로 append 할 때 제대로 안쌓이면 forward looking 

In [55]:
class TooManyRequests(Exception):
    """Marker exception intended for rate-limit failures.

    NOTE(review): nothing visible in this notebook raises this class; the
    scraping client presumably raises its own error type — confirm before
    relying on ``except TooManyRequests``.
    """

In [56]:
async def get_tweets_from_date(search_term, date, how='Latest'):
    """Fetch one page of tweets matching ``search_term`` for a one-day window.

    The query sent is ``<search_term> since:<date - 1 day> until:<date>``,
    so the window *ends* at ``date``.

    Args:
        search_term: Query text placed in front of the ``since:``/``until:``
            operators.
        date: End of the window. Anything ``pd.to_datetime`` accepts
            (``'YYYY-MM-DD'`` string, ``datetime``, ``pd.Timestamp``, ...).
        how: Result ordering forwarded to ``client.search_tweet``.

    Returns:
        A ``pd.DataFrame`` with one row per returned tweet; it has no rows
        and no columns when the search returns nothing.

    Only fields fixed at posting time are collected. Engagement counters
    (views, favorites, retweets) are deliberately left out because they are
    not point-in-time.
    """
    # Format BOTH bounds from the parsed date so the query is well-formed even
    # when `date` is a Timestamp/datetime or a non-ISO string.
    until_ts = pd.to_datetime(date)
    since = (until_ts - pd.Timedelta(days=1)).strftime('%Y-%m-%d')
    until = until_ts.strftime('%Y-%m-%d')

    tweets = await client.search_tweet(
        f'{search_term} since:{since} until:{until}', how, count=20
    )

    data = [
        {
            'created_at': t.created_at_datetime,
            'user_id': t.user.id,
            'user_createdat': t.user.created_at,
            'user_name': t.user.name,
            'full_text': t.full_text,
            'place': t.place,
            'hashtags': t.hashtags,
            'has_card': t.has_card,
            'thumbnail_title': t.thumbnail_title,
            'urls': t.urls,
        }
        for t in tweets
    ]

    return pd.DataFrame(data)

    

In [57]:
async def get_tweets_from_date_range(search_term, start_date, end_date, how='Latest', sleep=3, long_sleep=15*60, max_retries=1):
    """Collect tweets for every day in ``[start_date, end_date]``.

    Calls ``get_tweets_from_date`` once per day, pausing ``sleep`` seconds
    between days. When a request fails, the error is printed, the coroutine
    sleeps ``long_sleep`` seconds and the same day is retried.

    Args:
        search_term: Query text forwarded to ``get_tweets_from_date``.
        start_date: First date of the range (``'YYYY-MM-DD'``).
        end_date: Last date of the range, inclusive (``'YYYY-MM-DD'``).
        how: Result ordering forwarded to ``get_tweets_from_date``.
        sleep: Seconds to wait after each day's request.
        long_sleep: Seconds to wait before retrying a failed day.
        max_retries: Retries per day before the error is re-raised. The
            default of 1 matches the original single retry.

    Returns:
        One ``pd.DataFrame`` with all days concatenated and a fresh index;
        an empty ``pd.DataFrame`` when the date range contains no days.

    Raises:
        Exception: whatever ``get_tweets_from_date`` raised on the final
            attempt for a day.
    """
    dates = pd.date_range(start_date, end_date, freq='D').strftime('%Y-%m-%d')
    retries = max(int(max_retries), 0)

    dfs = []

    for date in tqdm(dates):
        for attempt in range(retries + 1):
            try:
                df = await get_tweets_from_date(search_term, date, how)
            # NOTE(review): broad on purpose — the concrete rate-limit error
            # type of the client is not visible here; narrow once confirmed.
            except Exception as e:
                if attempt == retries:
                    raise
                print(e)
                print(f'Sleeping for {long_sleep} seconds...')
                await asyncio.sleep(long_sleep)
            else:
                dfs.append(df)
                break

        # Throttle between days to reduce the chance of being rate limited.
        await asyncio.sleep(sleep)

    # pd.concat raises ValueError on an empty list (e.g. start_date > end_date).
    if not dfs:
        return pd.DataFrame()

    return pd.concat(dfs, ignore_index=True)


In [58]:
# Date range to collect; both ends are passed to get_tweets_from_date_range,
# which queries each day in the range once.
START_DATE = '2024-05-01'
END_DATE = '2024-07-17'

# Query text placed in front of the since:/until: operators.
search_term = '$ETH'


In [59]:
# Collect tweets for the whole range (top-level await, so this runs as a notebook cell).
df = await get_tweets_from_date_range(search_term, START_DATE, END_DATE)

 28%|██▊       | 22/78 [01:22<03:33,  3.82s/it]

status: 429, message: "Rate limit exceeded
"
Sleeping for 900 seconds...


 92%|█████████▏| 72/78 [19:29<00:22,  3.70s/it]   

status: 429, message: "Rate limit exceeded
"
Sleeping for 900 seconds...


100%|██████████| 78/78 [34:52<00:00, 26.83s/it] 


나중에 rate limit exception은 고쳐보자... 

In [72]:
# Cast every non-boolean, non-timestamp column to plain strings in one step.
# NOTE(review): astype(str) also turns missing values into the text 'None' — confirm this is intended.
str_columns = [
    'user_id',
    'user_createdat',
    'user_name',
    'full_text',
    'place',
    'hashtags',
    'thumbnail_title',
    'urls',
]
df[str_columns] = df[str_columns].astype(str)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 808 entries, 0 to 807
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   created_at       808 non-null    object
 1   user_id          808 non-null    object
 2   user_createdat   808 non-null    object
 3   user_name        808 non-null    object
 4   full_text        808 non-null    object
 5   place            808 non-null    object
 6   hashtags         808 non-null    object
 7   has_card         808 non-null    bool  
 8   thumbnail_title  808 non-null    object
 9   urls             808 non-null    object
dtypes: bool(1), object(9)
memory usage: 57.7+ KB


In [75]:
# df.info() reports created_at as object dtype, so convert it to a datetime column.
df['created_at'] = pd.to_datetime(df['created_at'])

In [76]:
# Persist the collected tweets as a pickle file (relative path, so it lands in the working directory).
df.to_pickle('ETH_tweets.pkl')