In [1]:
import pandas as pd
import numpy as np

import cloudpickle
from tqdm.notebook import trange, tqdm

import re    # RegEx for removing non-letter characters
import nltk  #natural language processing
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliorenteria/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# LOADING PULLED TWEETS

In [3]:
# Read each pulled-tweet CSV export into its own DataFrame
folder = './pulled_tweets/'

covidTourism, TourismVaccinationUSA, USATourismVaccine, vaccineTourism = (
    pd.read_csv(folder + csv_name)
    for csv_name in (
        'covidTourism.csv',
        'TourismVaccinationUSA.csv',
        'USATourismVaccine.csv',
        'vaccineTourism.csv',
    )
)


In [6]:
# Stack the four per-query DataFrames into a single one
frames = [covidTourism, TourismVaccinationUSA, USATourismVaccine, vaccineTourism]
data = pd.concat(frames)

# Sanity check: the combined frame must have as many rows as its parts together
expected_rows = sum(frame.shape[0] for frame in frames)
print('concat finished correctly: ', expected_rows == data.shape[0])
print('result rows: ', data.shape[0])

# Discard the per-file indices so rows are labelled 0..n-1
data = data.reset_index(drop=True)

data.tail()


concat finished correctly:  True
result rows:  983


Unnamed: 0,text,date
978,walk-in covid-19 vaccines are available in ver...,2021-05-23 14:08:01
979,today @ 12 shares covid updates. discusses...,2021-05-23 14:00:31
980,this means that those who have shots by sinova...,2021-05-23 13:47:05
981,vaccine tourism: russia vaccine tourism packag...,2021-05-23 13:11:11
982,tourism tax and a vaccine passport?\ni think i...,2021-05-23 12:45:08


# TOKENIZING PULLED TWEETS

In [7]:
# Function to format, tokenize, remove stopwords from, and stem tweets.
# The stopword set and the stemmer are built once on first use and then reused,
# instead of re-reading the stopword list and creating a stemmer for every word.
_TWEET_NLP_CACHE = {}


def _tweet_nlp_resources():
    ''' Return the cached (stopword set, stemmer) pair, building it on first call '''
    if not _TWEET_NLP_CACHE:
        # Build both before storing so a failure never leaves a half-filled cache
        stop_set = frozenset(stopwords.words("english"))
        stemmer = PorterStemmer()
        _TWEET_NLP_CACHE["stopwords"] = stop_set
        _TWEET_NLP_CACHE["stemmer"] = stemmer
    return _TWEET_NLP_CACHE["stopwords"], _TWEET_NLP_CACHE["stemmer"]


def tweet_to_words(tweet):
    ''' Convert tweet text into a sequence of stemmed words.

    Steps: lowercase, strip @users / #hashtags, strip URL-like tokens,
    strip non-alphanumerics and digits, split on whitespace, drop English
    stopwords, apply Porter stemming.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell) yields an empty list instead of raising.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    '''
    # Missing text has no tokens
    if not isinstance(tweet, str):
        return []

    stop_set, stemmer = _tweet_nlp_resources()

    # convert to lowercase
    text = tweet.lower()
    # remove Twitter users and hashtags ( @xxx, #xxx )
    text = re.sub(r"[@#]\w+", " ", text)
    # remove URLs
    # NOTE(review): the leading [...] is a character class, so this also strips
    # non-URL tokens shaped like "abc.de". Kept byte-identical on purpose:
    # presumably the saved model was trained on text cleaned with this exact
    # pattern -- confirm before tightening it.
    text = re.sub(r"[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)", " ", text)
    # remove non letters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # remove numbers ("+" avoids pointless empty matches; result is unchanged)
    text = re.sub(r"[0-9]+", "", text)
    # tokenize, remove stopwords (set lookup) and apply stemming
    return [stemmer.stem(w) for w in text.split() if w not in stop_set]

In [8]:
# Quick check of tweet_to_words on a single sample tweet
sel = 103
sample_tweet = data['text'][sel]
print("\nORIGINAL TWEET ->\n", sample_tweet)
print("\nPROCESSED TWEET ->", tweet_to_words(sample_tweet))



ORIGINAL TWEET ->
 with a new service having been set up to allow people aged 18 to 29 to register for vaccination, ms sturgeon said i…

PROCESSED TWEET -> ['new', 'servic', 'set', 'allow', 'peopl', 'age', 'regist', 'vaccin', 'ms', 'sturgeon', 'said']


In [15]:
# Tokenize every tweet; X holds one token list per row of data, in row order
X = [tweet_to_words(data['text'][row]) for row in trange(len(data['text']))]

HBox(children=(FloatProgress(value=0.0, max=983.0), HTML(value='')))




# LOADING BEST MODELS

Best model:

- TOKENIZED         - Lowercased; removed: URLs, Users, Hashtags, Non-letters, Numbers, Stopwords
- COUNT VECTORIZER  - to generate Bag of Words
- GRADIENT BOOSTING - N_estimators=100, Learning_rate=1.0, Max_depth=1, Random_state=0
- DATASET           - Used the [Coronavirus tweets NLP - Text Classification](https://www.kaggle.com/datatattle/covid-19-nlp-text-classification) Dataset


In [12]:
# Loading models
# WARNING: unpickling runs arbitrary code embedded in the file, so only load
# .pkl files from a source you trust.
folder = './saved_models/'


def _load_pickle(path):
    ''' Load one cloudpickle file, closing the file handle afterwards '''
    with open(path, 'rb') as fh:
        return cloudpickle.load(fh)


model = _load_pickle(folder + 'xgboost_covid.pkl')
count_vectorizer = _load_pickle(folder + 'CV_covid.pkl')
# NOTE(review): CV_normalizer is loaded but not applied in the cells visible
# here -- confirm whether X_bow should be normalized before model.predict.
CV_normalizer = _load_pickle(folder + 'CV_normalizer.pkl')


In [19]:
# Turn the tokenized tweets into a dense array with the loaded vectorizer
bow_matrix = count_vectorizer.transform(X)
X_bow = bow_matrix.toarray()
# Show the row for the second tweet as a spot check
print(X_bow[1])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

# GETTING PREDICTIONS

In [22]:
# Run the loaded model on the vectorized tweets; one prediction per row of X_bow
data_pred = model.predict(X_bow)

In [23]:
# Attach the predictions to the tweets as a 'sentiment' column (modifies data in place)
data['sentiment'] = data_pred

In [24]:
# Preview the first rows, now including the 'sentiment' column
data.head()

Unnamed: 0,text,date,sentiment
0,vaccine tourism in the us is becoming more pre...,2021-05-31 06:06:55,0
1,the tourism authority of kiribati (tak) welcom...,2021-05-31 05:09:50,2
2,‘like a dream’: latin americans head to u.s. f...,2021-05-31 04:28:08,2
3,nytimes: ‘like a dream’: americans head to u....,2021-05-31 04:13:04,2
4,vaccine tourism packages let rich asians flock...,2021-05-31 04:02:33,1


In [28]:
# Recode the sentiment column into a new frame: 0 -> -1, 1 -> 0, 2 -> 1
sentiment_codes = {0: -1, 1: 0, 2: 1}
cleanup_nums = {"sentiment": sentiment_codes}
data_f = data.replace(to_replace=cleanup_nums)
data_f.head()

Unnamed: 0,text,date,sentiment
0,vaccine tourism in the us is becoming more pre...,2021-05-31 06:06:55,-1
1,the tourism authority of kiribati (tak) welcom...,2021-05-31 05:09:50,1
2,‘like a dream’: latin americans head to u.s. f...,2021-05-31 04:28:08,1
3,nytimes: ‘like a dream’: americans head to u....,2021-05-31 04:13:04,1
4,vaccine tourism packages let rich asians flock...,2021-05-31 04:02:33,0


In [30]:
# Save the labelled tweets. With to_csv's default index=True the row index is
# written as an extra, unnamed first column.
data_f.to_csv('./results.csv')

# CHECKING RESULTS

In [57]:
# Inspect one tweet end to end: raw text, its tokens, and its encoded sentiment
sel = 103
for shown in (data_f.text[sel], X[sel], data_f.sentiment[sel]):
    print(shown)


with a new service having been set up to allow people aged 18 to 29 to register for vaccination, ms sturgeon said i…
['new', 'servic', 'set', 'allow', 'peopl', 'age', 'regist', 'vaccin', 'ms', 'sturgeon', 'said']
0


In [53]:
# Character count of the selected tweet's text
len(data_f.text[sel])

115