In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
import emoji
import re

In [2]:
data = pd.read_csv("Bitcoin_tweets.csv", dtype="object")
print(data.shape)
data.head()

(809363, 13)


Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...,['bitcoin'],Twitter Web App,False
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""...","['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,"Guys evening, I have read this article about B...",,Twitter Web App,False
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,$BTC A big chance in a billion! Price: \487264...,"['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,['BTC'],Twitter Web App,False


Let's create a dataframe of the 'x' variable and the 'y' variable. We want to use 'text' as the 'x' variable and 'user_verified' as the 'y' variable.

In [3]:
df = data[["user_verified", "text"]].iloc[:500]
df.head()

Unnamed: 0,user_verified,text
0,False,Blue Ridge Bank shares halted by NYSE after #b...
1,False,"😎 Today, that's this #Thursday, we will do a ""..."
2,False,"Guys evening, I have read this article about B..."
3,False,$BTC A big chance in a billion! Price: \487264...
4,False,This network is secured by 9 508 nodes as of t...


Let's clean the data by removing stopwords, emoji's, and punctuation so we can manipulate the text.

In [4]:
stopwords = ENGLISH_STOP_WORDS
punctuation = set(string.punctuation)

for i in range(len(df)):
    words = []
    df.iloc[i,-1] = df.iloc[i,-1].split()
    for word in df.iloc[i,-1]:
        if word.lower() not in stopwords:
            while len(word) > 0 and word[0] in punctuation:
                word = word[1:]
            while len(word) > 0 and word[-1] in punctuation:
                word = word[:-1]
            word = emoji.get_emoji_regexp().sub("", word)
            word = re.sub(r'^https?:\/\/.*[\r\n]*', '', word, flags=re.MULTILINE)
            words.append(word)
    df.iloc[i,-1] = " ".join(words)

df.head()

Unnamed: 0,user_verified,text
0,False,Blue Ridge Bank shares halted NYSE bitcoin ATM...
1,False,Today that's Thursday 2 friend LeoWandersleb...
2,False,Guys evening read article BTC like share
3,False,BTC big chance billion Price 4872644.0 2021/02...
4,False,network secured 9 508 nodes today Soon biggest...


Let's transform the data using TfidfVectorizer in SKLearn.

In [5]:
# Transform
tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words="english", max_features=None)
X = tfidf.fit_transform(df["text"])
y = df["user_verified"]
print(X.shape)
print(y.shape)

(500, 5755)
(500,)


In [6]:
# Split the data into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [7]:
# Evaluate the model
clf = MultinomialNB()
y_pred = clf.fit(X_train, y_train).predict(X_train)

print(f"Training set accuracy:\t{accuracy_score(y_train, y_pred)}")

y_pred = clf.fit(X_train, y_train).predict(X_test)
print(f"Testing set accuracy:\t{accuracy_score(y_test, y_pred)}")

Training set accuracy:	0.9942857142857143
Testing set accuracy:	0.9866666666666667


### Conclusion

The training set and testing set are both accurate. We can conlude that Multinomial Naive Bayes is a good algorithm for text classification to predict if a Tweeter user is verified based on tweets about Bitcoin.