In [8]:
### Search through Twitter API ###

# Import Packages
import tweepy
import config
import pandas as pd
import numpy as np
import pandas as pd
import nltk
import tweetpreprocess as tp
# Fetch every NLTK resource this notebook relies on, in a fixed order
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# One shared WordNet lemmatizer instance, reused by the pre-processing step
lmtzr = nltk.WordNetLemmatizer()

# Turn off pandas' chained-assignment warning for the whole session
pd.options.mode.chained_assignment = None

# Credentials: the bearer token lives in the local `config` module, not in the notebook
my_bearer_token = config.BEARER_TOKEN

# API client authenticated with that bearer token
client = tweepy.Client(bearer_token=my_bearer_token)

# Search query: tweets matching "donald trump", with the retweet filter
# negated and the language restricted to English
query = '(donald trump) -is:retweet lang:en'

# Collected tweet texts, stored as UTF-8 encoded bytes
data = []

# Page through the recent-search endpoint (100 results per request) and stop
# after 3000 tweets; texts are appended as they arrive
tweet_pages = tweepy.Paginator(client.search_recent_tweets, query=query, max_results=100)
data.extend(tweet.text.encode('utf-8') for tweet in tweet_pages.flatten(limit=3000))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hans\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hans\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hans\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [9]:
### Setting up dataset ###
# Build a single-column frame from the fetched tweets, turn the UTF-8 bytes
# back into text, and persist the result to Excel
columns = ['text']
test_data = pd.DataFrame(data, columns=columns).assign(
    text=lambda frame: frame['text'].str.decode("utf-8")
)
test_data.to_excel("test_tweets.xlsx")

In [11]:
### Pre-processing ###
from textblob import TextBlob
def getSubjectivity(text):
    """Return the TextBlob subjectivity score of `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

# Check the polarity
def getPolarity(text):
    """Return the TextBlob polarity score of `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def getAnalysis(score):
    """Map a numeric score to a sentiment class label.

    Returns the string '-1' when the score is below zero, '0' when it equals
    zero, and '1' in every other case.
    """
    if score < 0:
        return '-1'
    return '0' if score == 0 else '1'

# Excel to Dataframe (for saving purposes)
full_test_df = pd.read_excel('test_tweets.xlsx')
# Take an explicit copy of the text column before writing to it: assigning into
# a bare column selection is chained assignment, which pandas flags with
# SettingWithCopyWarning. The copy makes the ownership of the data explicit.
test_tweet_df = full_test_df[["text"]].copy()
# Force every entry to str so the text helpers never receive a non-string cell
test_tweet_df["text"] = test_tweet_df["text"].astype(str)

# Tweet pre-processing 
def PreProcessTweets(tweet_df):
    """Label and clean a frame of tweets.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Frame with a string column named "text".

    Returns
    -------
    pandas.DataFrame
        A new frame with two added columns and a transformed "text" column:
        "Label" holds getPolarity() of the original text, "Analysis" holds
        getAnalysis() of that label, and "text" holds the cleaned, tokenised,
        lemmatised tweet with stop words removed.

    The input frame is not modified. Working on a copy keeps the call
    idempotent: calling it again on the same input would otherwise feed the
    already-tokenised text back through the polarity and cleaning helpers.
    """
    tweet_df = tweet_df.copy()

    # Labels are computed on the raw text, before any cleaning
    tweet_df["Label"] = tweet_df["text"].apply(getPolarity)
    tweet_df["Analysis"] = tweet_df["Label"].apply(getAnalysis)

    # Lowercasing
    text = tweet_df["text"].str.lower()

    # String-level cleaning, applied in this exact order: URLs, hashtags,
    # usernames, emojis to words, extra whitespace, spell checking
    cleaning_steps = (
        tp.remove_urls,
        tp.remove_hashtags,
        tp.remove_users,
        tp.emojiToWord,
        tp.remove_extraws,
        tp.correct_spellings,
    )
    for clean in cleaning_steps:
        text = text.apply(clean)

    # Tokenizer: each tweet becomes a list of tokens
    text = text.apply(lambda tweet: nltk.word_tokenize(tweet, language='english'))
    # Lemmatizer: applied token by token
    text = text.apply(lambda tokens: [lmtzr.lemmatize(word) for word in tokens])
    # Remove stopwords
    tweet_df["text"] = text.apply(tp.remove_sw)
    return tweet_df

# Run the pre-processing pipeline on the reloaded tweets and save the result
preproc_test = PreProcessTweets(test_tweet_df)
preproc_test.to_excel("preprocessed lemma.xlsx")

In [12]:
### Modeling ###

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score
from nltk.corpus import stopwords

# Reload the pre-processed tweets that the pre-processing step saved to Excel
model_train_df = pd.read_excel("preprocessed lemma.xlsx")

# Flattening
# NOTE(review): `flattened` is never used in the visible cells; kept in case a
# later cell relies on it.
flattened = []

# NLTK's English stop words. The set is kept for fast membership tests, but the
# vectorizer receives a sorted list: scikit-learn documents `stop_words` as
# 'english', a list or None, and recent releases reject other containers.
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    strip_accents='unicode',
    stop_words=sorted(stopset),
)

# Target: the sentiment class assigned during pre-processing
y = model_train_df["Analysis"]
# Features: TF-IDF of the pre-processed text. An empty text cell comes back
# from Excel as NaN, which the vectorizer cannot process, so treat it as an
# empty document instead of failing.
X = vectorizer.fit_transform(model_train_df["text"].fillna(""))


In [13]:
# Sanity check: the target and the feature matrix must have the same row count
for fitted in (y, X):
    print(fitted.shape)

(3000,)
(3000, 5613)


In [14]:
### Modeling pt.2 ###

# Split into train and test partitions with a fixed seed so the split is reproducible
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

# Multinomial Naive Bayes classifier trained on the TF-IDF features
clf = naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [15]:
# Accuracy on the held-out test split (this cell reports accuracy, not a ROC score)
from sklearn import metrics

y_pred_class = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_class))


0.708


In [None]:
# Smoke test: classify a single ad-hoc document (an empty string here)
test_array = np.array([""])
test_vector = vectorizer.transform(test_array)
predicted_label = clf.predict(test_vector)
print(predicted_label)

[1]
