In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

twitter_bearer_token = os.getenv('TW_BEARER_TOKEN')

from langchain.document_loaders import TwitterTweetLoader

In [2]:
def get_users_twitter(file: str) -> list:
    with open(file, 'r') as f:
        usernames = f.read().splitlines()
    return usernames

In [8]:
loader = TwitterTweetLoader.from_bearer_token(
    oauth2_bearer_token=twitter_bearer_token,
    twitter_users=get_users_twitter('/mnt/c/Users/kozan/Desktop/Sen_Des_Proj/GPT-4-KZEngine-Signal-Interpretation/twitter-sentiment/list_users_twitter.txt'),
    number_tweets=20000,  # Default value is 100
)

In [9]:
contents = []
documents = loader.load()
for doc in documents:
    contents.append(doc.dict())
len(contents)

1966

In [21]:
import pandas as pd

df = pd.DataFrame(contents)

df['created_at'] = pd.to_datetime(df['metadata'].apply(lambda x: x['created_at']), format='%a %b %d %H:%M:%S %z %Y')
df['screen_name'] = df['metadata'].apply(lambda x: x['user_info']['screen_name'])
df = df[['screen_name', 'page_content', 'created_at']]
df = df.rename(columns={'page_content': 'text', 'screen_name': 'username'})

In [22]:
def cleaning_tweet_data( df: pd.DataFrame()):
        import re
        df_tweets = df.copy()
        if 'Unnamed: 0' in df_tweets.columns:
            df_tweets.drop(columns=['Unnamed: 0'], axis=1, inplace=True)
        if 'source' in df_tweets.columns:
            df_tweets.drop(columns=['source', 'name', 'location', 'verified', 'description'], axis=1, inplace=True)
            

        df_tweets = df_tweets.apply(lambda x: x.astype(str).str.lower()).drop_duplicates(subset=['text', 'username'],
                                                                                         keep='first')
        
        df_tweets['text'] = df_tweets['text'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])
        df_tweets['text'] = df_tweets['text'].str.lower()
        df_tweets['text'] = df_tweets['text'].str.replace("@[a-z0-9A-Z]+", "", regex=True)
        df_tweets['text'] = df_tweets['text'].str.replace("#[a-z0-9A-Z]+", "", regex=True)

        blanks = []  # start with an empty list
        for i, created_at, text, *username in df_tweets.itertuples():
            if type(text) == str:
                if text.isspace():
                    blanks.append(i)
        df_tweets.drop(blanks, inplace=True)
        # df_tweets['text'] = df_tweets['text'].str.replace(r"http\S+", "")
        # df_tweets['text'] = df_tweets['text'].str.replace(r"www.\S+", "")
        # df_tweets['text'] = df_tweets['text'].str.replace('[()!?]', ' ')
        # df_tweets['text'] = df_tweets['text'].str.replace('\[.*?\]',' ')
        # df_tweets['text'] = df_tweets['text'].str.replace("[^a-z0-9]"," ")


        df_tweets.dropna(inplace=True)


        return df_tweets

In [23]:
df['username'] = df['username'].astype(str)
df['text'] = df['text'].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1966 entries, 0 to 1965
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   username    1966 non-null   object             
 1   text        1966 non-null   object             
 2   created_at  1966 non-null   datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 46.2+ KB


In [24]:
df = cleaning_tweet_data(df)

In [27]:
def preprocessing_tweet_datetime(df: pd.DataFrame()) -> pd.DataFrame():
        """
        For adding datetime groups Date, Hour, Minute to existing Dataframe.
            It uses to copy of existing Dataframe
        Args:
            self (tsa) 
            df (DataFrame)

        Returns:
            DataFrame
        """
        df_temp = df.copy()
        # Fixed for some blank tweets: 'tweets'. Interesting api result delete below with function
        df_temp.drop(df_temp[df_temp.created_at == 'twitter'].index, inplace=True)
        df_temp.created_at = pd.to_datetime(df_temp.created_at)
        df_temp['Date'] = df_temp.created_at.apply(lambda x: x.date())
        df_temp['hour'] = df_temp.created_at.apply(lambda x: x.hour)
        df_temp['minute'] = df_temp.created_at.apply(lambda x: x.minute)
        return df_temp

In [29]:
df = preprocessing_tweet_datetime(df)
df

Unnamed: 0,username,text,created_at,Date,hour,minute
0,cryptocapo_,after hours and hours of deep analysis and re...,2023-04-11 12:27:50+00:00,2023-04-11,12,27
1,cryptocapo_,rt _: some thoughts.\n\ni'll continue to ignor...,2023-04-11 12:06:35+00:00,2023-04-11,12,6
2,cryptocapo_,_mc i've repeated many times that i'm short o...,2023-04-11 11:50:45+00:00,2023-04-11,11,50
3,cryptocapo_,the joke would at least be funny if i shorted...,2023-04-11 11:46:53+00:00,2023-04-11,11,46
4,cryptocapo_,everything is based on probabilities. the mor...,2023-04-11 08:19:40+00:00,2023-04-11,8,19
...,...,...,...,...,...,...
1961,elonmusk,ad-free version if you subscribe to zuby!,2023-06-17 19:14:41+00:00,2023-06-17,19,14
1962,elonmusk,rt : real talk with zuby podcast ep. 263 - \n...,2023-06-17 19:12:20+00:00,2023-06-17,19,12
1963,elonmusk,this is very concerning,2023-06-17 19:10:29+00:00,2023-06-17,19,10
1964,elonmusk,🤣🤣 that would definitely work for falling asl...,2023-06-17 18:36:18+00:00,2023-06-17,18,36
