In [9]:
import pandas as pd

============= Load Data =============

In [10]:
df = pd.read_csv(r'data/rawdata/1_B_btc_tweet_data_2013-05-01_2019-11-01.csv', sep=',', usecols=['timestamp','text'], parse_dates=['timestamp'], dtype={'text': str})

In [11]:
df.tail()

Unnamed: 0,timestamp,text
18301011,2017-11-29 05:19:33+00:00,"$BTCUSD #Bitcoin hits 10,0000 today,\nhas made..."
18301012,2019-02-04 02:12:55+00:00,#Bitcoin #blockchain #cryptocurrency https://...
18301013,2018-09-15 16:10:01+00:00,"Welcome to Bitcoin, newcomers! Here's your FAQ..."
18301014,2019-08-29 14:18:24+00:00,投資やFX📈って興味あるけどなかなか手が進まない方は 以下のURLからの無料登録(要本人確認...
18301015,2018-11-13 03:08:51+00:00,Hasan Minhaj really came for the Bitcoin bros ...


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18301016 entries, 0 to 18301015
Data columns (total 2 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   timestamp  object
 1   text       object
dtypes: object(2)
memory usage: 279.3+ MB


============= Remove Missing Values =============

In [13]:
# drop missing values
df.dropna(inplace=True, axis=0)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18300265 entries, 0 to 18301015
Data columns (total 2 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   timestamp  object
 1   text       object
dtypes: object(2)
memory usage: 418.9+ MB


============= Tweet Text Cleaning =============

In [18]:
# List of words to check
words_to_exclude = ['btc', 'bitcoin']

filtered_df = df[~df['text'].str.contains('|'.join(words_to_exclude), case=False)].head(100)

# Display the result
print(filtered_df)

                     timestamp  \
8    2019-05-27 11:49:32+00:00   
9    2019-05-27 11:49:32+00:00   
15   2019-05-10 14:06:01+00:00   
20   2019-05-26 20:55:29+00:00   
27   2019-05-27 11:49:45+00:00   
..                         ...   
716  2019-05-11 14:30:30+00:00   
717  2019-05-11 15:01:36+00:00   
725  2019-05-27 11:57:54+00:00   
749  2019-05-27 11:58:19+00:00   
752  2019-05-27 11:58:44+00:00   

                                                  text  
8    ブラジルはまぁ置いといてもドイツは存在感出してくるのかな。ロシアもマイニングなどで元気になる...  
9    CHANGE IS COMING...GET READY!!! Boom, Another ...  
15                                               share  
20   $HOT $HOT $HOT 🍀\n\n🚀🚀🚀🚀🚀\n\n🔥🔥🔥🔥🔥 https://t.c...  
27                                                 🤣🤣🤣  
..                                                 ...  
716                                          Nice Work  
717                                                Kuy  
725                            お願いします。\n今後の値動き教えてください。  
749          #QuarkCh

In [21]:
# !pip install certifi
# !pip install demoji
# !pip install emoji
# !pip install nltk
# !pip install pyenchant
# !pip install pyspellchecker
# !pip install spellchecker

In [25]:
import re
import demoji
import unicodedata
from spellchecker import SpellChecker
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Download NLTK resources (https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml)
import nltk

# Downloaded NLTK data path
nltk.data.path.append('data/nltk')

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

In [33]:
# Load English stop words
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove URLs
    text = re.sub(r"http\S+", "", text)

    # Unicode Normalization
    text = unicodedata.normalize('NFKD', text)

    # Remove Emoji
    text = demoji.replace_with_desc(text, sep=' ')

    # Remove non-alphanumeric characters and extra whitespaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(' +', ' ', text)

    # # Convert to lowercase
    # text = text.lower()

    # # Tokenize text
    # words = word_tokenize(text)

    # # Remove stopword
    # words = [word for word in words if word not in stop_words]

    # # Spelling Correction
    # spell = SpellChecker()
    # words = [spell.correction(word) for word in words if word is not None]  # Filter out None values

    # # Reassemble text
    # cleaned_text = ' '.join(filter(None, words))  # Filter out None values

    return text

In [34]:
from joblib import Parallel, delayed

# Number of processes to use (adjust as needed)
num_processes = 7

def parallelize_dataframe(df, func):
    return Parallel(n_jobs=num_processes)(delayed(func)(text) for text in df['text'])

In [36]:
# Apply the cleaning function in parallel
df['cleaned_text'] = parallelize_dataframe(df, clean_text)

In [37]:
df.head()

Unnamed: 0,timestamp,text,cleaned_text
0,2019-05-27 11:49:14+00:00,È appena uscito un nuovo video! LES CRYPTOMONN...,E appena uscito un nuovo video LES CRYPTOMONNA...
1,2019-05-27 11:49:18+00:00,Cardano: Digitize Currencies; EOS https://t.co...,Cardano Digitize Currencies EOS ROI ATampT Bit...
2,2019-05-27 11:49:06+00:00,Another Test tweet that wasn't caught in the s...,Another Test tweet that wasnt caught in the st...
3,2019-05-27 11:49:22+00:00,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...,Current Crypto Prices \n\nBTC USD\nETH USD\nLT...
4,2019-05-27 11:49:23+00:00,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...,Spiv Nosar Baz BITCOIN Is An Asset amp NOT A C...


In [None]:
# %reset -f