In [226]:
import nltk
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string
import numpy as np
import featuretools as ft
import featuretools.variable_types as vtypes
from featuretools.primitives import *
from nlp_primitives import (
    DiversityScore,
    LSA,
    MeanCharactersPerWord,
    PartOfSpeechCount,
    PolarityScore, 
    PunctuationCount,
    StopwordCount,
    TitleWordCount,
    UniversalSentenceEncoder,
    UpperCaseCount)
from imblearn.over_sampling import RandomOverSampler




tweetsAndLabels = pd.read_csv('MajorityVote.csv')
tweetsAndLabels['Created'] = pd.to_datetime(tweetsAndLabels['Created'])
#tweetsAndLabels.set_index('Created',inplace = True)
tweetsAndLabels = tweetsAndLabels.iloc[0:3499]
#replacing NAs with '' for concatenating 'Full Text' and 'Quoted Full Text'
tweetsAndLabels = tweetsAndLabels.replace(np.nan, '', regex=True)
#concatenating 'Full Text' and 'Quoted Full Text'
tweetsAndLabels['Full Text'] = tweetsAndLabels['Full Text'] + ' ' + tweetsAndLabels['Quote Full Text']

#Getting the four HBM Label Categories
categories = list(tweetsAndLabels.columns.values)
categories = categories[7:12]


#def clean_text(text):
  #  text = "".join([word.lower() for word in text if word not in string.punctuation])
   # tokens = re.split('\W+', text)
  #  text = [ps.stem(word) for word in tokens if word not in stopwords]
  #  return text

#cl#ean_text(tweetsA)




In [227]:
#defining clean_text function
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import regexp_tokenize

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~'
tknzr = TweetTokenizer()

def clean_textTwo(text):
    text = re.sub(r"http\S+", "", text)
    text = "".join([word.lower() for word in text if word not in punctuation])
    tokens = tknzr.tokenize(text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [None]:
#instantiating functions that generates new features
DS = DiversityScore()
MCPW = MeanCharactersPerWord()
Pol = PolarityScore()
PC = PunctuationCount()
SC = StopwordCount()
TWC = TitleWordCount()
UCC = UpperCaseCount()
USE = UniversalSentenceEncoder()

#defining the function that checks for free standing numbers
def hasNumbers(inputString):
    return bool(re.search(r'\s\d+\s', inputString))


#adding new features to the dataframe
tweetsAndLabels['hasNum'] = tweetsAndLabels['Full Text'].apply(lambda x:hasNumbers(x))
NewFeaturesFunct = [DS,MCPW,Pol,PC,SC,TWC,UCC]
featureNames = ['DS','MCPW','Pol','PC','SC','TWC','UCC']
for featureName, featureFunct in zip(featureNames,NewFeaturesFunct):
    tweetsAndLabels[featureName] = featureFunct(tweetsAndLabels['Full Text'])
tweetsAndLabels = tweetsAndLabels.dropna()


In [229]:
#Train Test Split
HBMRelatedTweets = tweetsAndLabels.loc[tweetsAndLabels['HBM Related']==1.0]
from sklearn.model_selection import train_test_split

train, test = train_test_split(tweetsAndLabels, random_state=42, test_size=0.3, shuffle=True)

print(train.shape)
print(test.shape)
train_text = train['Full Text']
test_text = test['Full Text']

print(train.shape)
print(test.shape)


(2448, 21)
(1050, 21)
(2448, 21)
(1050, 21)


In [230]:
#TF-IDF vectorization
vectorizer = TfidfVectorizer(analyzer=clean_textTwo)
#vectorizer = CountVectorizer(analyzer=clean_textTwo)
vectorizer.fit(train_text)
vectorizer.fit(test_text)
newFeatures_train = train[['DS','MCPW','Pol','PC','SC','TWC','UCC','hasNum']]
newFeatures_test = test[['DS','MCPW','Pol','PC','SC','TWC','UCC','hasNum']]

#converting sparse matrix to dataframe and preping the vectorized matrix to be concatenated with the new features
X_tfidf_train = vectorizer.transform(train_text)
X_tfidf_train = pd.DataFrame(X_tfidf_train.toarray())
X_tfidf_test = vectorizer.transform(test_text)
X_tfidf_test = pd.DataFrame(X_tfidf_test.toarray())
X_tfidf_test.reset_index(drop=True, inplace=True)
X_tfidf_train.reset_index(drop=True, inplace=True)
newFeatures_test.reset_index(drop=True, inplace=True)
newFeatures_train.reset_index(drop=True, inplace=True)
print(newFeatures_test.shape)
print(X_tfidf_test.shape)
print(newFeatures_train.shape)
print(X_tfidf_train.shape)

(1050, 8)
(1050, 5711)
(2448, 8)
(2448, 5711)


In [231]:
# combining new features to the vectorized data frame
#x_train = vectorizer.transform(train_text)
x_train = pd.concat([newFeatures_train,X_tfidf_train],axis = 1)
y_train = train.drop(labels = ['Created', 'Day', 'Tweet ID', 'Full Text', 'Quote Full Text',
       'Hashtags', 'Not mask or face covering related','HBM Related AutoFill','HBM Related','DS','MCPW','Pol','PC','SC','TWC','UCC','hasNum'], axis=1)

#x_test = vectorizer.transform(test_text)
x_test = pd.concat([newFeatures_test,X_tfidf_test],axis = 1)
y_test = test.drop(labels = ['Created', 'Day', 'Tweet ID', 'Full Text', 'Quote Full Text',
       'Hashtags', 'Not mask or face covering related','HBM Related AutoFill','HBM Related','DS','MCPW','Pol','PC','SC','TWC','UCC','hasNum'], axis=1)

In [232]:
#ONE VS REST Classifier aka. restricting each tweet to have only one label
#Using Gradient Boost 
#Baseline model with no resampling of minority class

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from IPython.display import Markdown, display 


# Using pipeline for applying Gradient Boosting and one vs rest classifier
GB_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(GradientBoostingClassifier())),
            ])

for category in categories:
    printmd('**Processing {} Tweets...**'.format(category))
    
    # Training GB model on train data
    GB_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    predGB = GB_pipeline.predict(x_test)
    print(classification_report(test[category], predGB, labels=[0,1]))

**Processing HBM Related Tweets...**

              precision    recall  f1-score   support

           0       0.71      0.44      0.54       434
           1       0.69      0.88      0.77       616

    accuracy                           0.70      1050
   macro avg       0.70      0.66      0.66      1050
weighted avg       0.70      0.70      0.68      1050



**Processing Perceived Benefits Tweets...**

              precision    recall  f1-score   support

           0       0.76      0.96      0.85       741
           1       0.73      0.27      0.39       309

    accuracy                           0.76      1050
   macro avg       0.74      0.61      0.62      1050
weighted avg       0.75      0.76      0.71      1050



**Processing Perceived Barriers Tweets...**

              precision    recall  f1-score   support

           0       0.75      0.98      0.85       771
           1       0.60      0.10      0.17       279

    accuracy                           0.74      1050
   macro avg       0.67      0.54      0.51      1050
weighted avg       0.71      0.74      0.67      1050



**Processing Perceived Severity Tweets...**

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       999
           1       0.29      0.14      0.19        51

    accuracy                           0.94      1050
   macro avg       0.62      0.56      0.58      1050
weighted avg       0.92      0.94      0.93      1050



**Processing Perceived Susceptibility Tweets...**

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1007
           1       0.20      0.05      0.08        43

    accuracy                           0.95      1050
   macro avg       0.58      0.52      0.53      1050
weighted avg       0.93      0.95      0.94      1050



In [233]:
#ONE VS REST Classifier aka. restricting each tweet to have only one label
#Using logistic regression
#Baseline model with no resampling of minority class
log_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(max_iter=1000))),
            ])

for category in categories:
    printmd('**Processing {} Tweets...**'.format(category))
    # Training logistic regression model on train data
    log_pipeline.fit(x_train, train[category])
    # calculating test accuracy
    predLog = log_pipeline.predict(x_test)
    print(classification_report(test[category], predLog, labels=[0,1]))

**Processing HBM Related Tweets...**

              precision    recall  f1-score   support

           0       0.67      0.45      0.54       434
           1       0.69      0.85      0.76       616

    accuracy                           0.68      1050
   macro avg       0.68      0.65      0.65      1050
weighted avg       0.68      0.68      0.67      1050



**Processing Perceived Benefits Tweets...**

              precision    recall  f1-score   support

           0       0.74      0.96      0.84       741
           1       0.70      0.20      0.31       309

    accuracy                           0.74      1050
   macro avg       0.72      0.58      0.58      1050
weighted avg       0.73      0.74      0.68      1050



**Processing Perceived Barriers Tweets...**

              precision    recall  f1-score   support

           0       0.75      0.98      0.85       771
           1       0.62      0.11      0.18       279

    accuracy                           0.75      1050
   macro avg       0.69      0.54      0.52      1050
weighted avg       0.72      0.75      0.67      1050



**Processing Perceived Severity Tweets...**

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       999
           1       0.00      0.00      0.00        51

    accuracy                           0.95      1050
   macro avg       0.48      0.50      0.49      1050
weighted avg       0.91      0.95      0.93      1050



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Processing Perceived Susceptibility Tweets...**

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1007
           1       1.00      0.02      0.05        43

    accuracy                           0.96      1050
   macro avg       0.98      0.51      0.51      1050
weighted avg       0.96      0.96      0.94      1050



In [234]:
#oversampling of minority class and using gradient boosting 
GB_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(GradientBoostingClassifier())),
            ])

for category in categories:
    printmd('**Processing {} Tweets...**'.format(category))
    
    #Random Over Sampling
    if(category != 'HBM Related'):
        ros = RandomOverSampler(random_state=777)
        ros_xtrain_tfidf, ros_train_y = ros.fit_sample(x_train, train[category])
    else:
        ros_xtrain_tfidf = x_train
        ros_train_y = train[category]
    # Training logistic regression model on train data
    GB_pipeline.fit(ros_xtrain_tfidf, ros_train_y)
    
    # calculating test accuracy
    predGB = GB_pipeline.predict(x_test)
    print(classification_report(test[category], predGB, labels=[0,1]))

**Processing HBM Related Tweets...**

              precision    recall  f1-score   support

           0       0.74      0.42      0.53       434
           1       0.69      0.89      0.78       616

    accuracy                           0.70      1050
   macro avg       0.71      0.66      0.66      1050
weighted avg       0.71      0.70      0.68      1050



**Processing Perceived Benefits Tweets...**

              precision    recall  f1-score   support

           0       0.81      0.76      0.79       741
           1       0.51      0.58      0.54       309

    accuracy                           0.71      1050
   macro avg       0.66      0.67      0.67      1050
weighted avg       0.72      0.71      0.72      1050



**Processing Perceived Barriers Tweets...**

              precision    recall  f1-score   support

           0       0.81      0.76      0.78       771
           1       0.43      0.50      0.46       279

    accuracy                           0.69      1050
   macro avg       0.62      0.63      0.62      1050
weighted avg       0.71      0.69      0.70      1050



**Processing Perceived Severity Tweets...**

              precision    recall  f1-score   support

           0       0.97      0.96      0.97       999
           1       0.37      0.45      0.40        51

    accuracy                           0.94      1050
   macro avg       0.67      0.71      0.68      1050
weighted avg       0.94      0.94      0.94      1050



**Processing Perceived Susceptibility Tweets...**

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1007
           1       0.24      0.40      0.30        43

    accuracy                           0.92      1050
   macro avg       0.60      0.67      0.63      1050
weighted avg       0.94      0.92      0.93      1050



In [235]:
#logistic regression with oversampling of minority class
log_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(max_iter=1000))),
            ])

for category in categories:
    printmd('**Processing {} Tweets...**'.format(category))
    
    #Random Over Sampling For unbalanced categories
    if(category != 'HBM Related'):
        ros = RandomOverSampler(random_state=777)
        ros_xtrain_tfidf, ros_train_y = ros.fit_sample(x_train, train[category])
    else:
        ros_xtrain_tfidf = x_train
        ros_train_y = train[category]
    # Training logistic regression model on train data
    log_pipeline.fit(ros_xtrain_tfidf, ros_train_y)
    print(ros_train_y.shape)
    # calculating test accuracy
    predLog = log_pipeline.predict(x_test)
    print(classification_report(test[category], predLog, labels=[0,1]))

**Processing HBM Related Tweets...**

(2448,)
              precision    recall  f1-score   support

           0       0.67      0.45      0.54       434
           1       0.69      0.85      0.76       616

    accuracy                           0.68      1050
   macro avg       0.68      0.65      0.65      1050
weighted avg       0.68      0.68      0.67      1050



**Processing Perceived Benefits Tweets...**

(3360,)
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       741
           1       0.58      0.55      0.57       309

    accuracy                           0.75      1050
   macro avg       0.70      0.69      0.70      1050
weighted avg       0.75      0.75      0.75      1050



**Processing Perceived Barriers Tweets...**

(3534,)
              precision    recall  f1-score   support

           0       0.81      0.82      0.82       771
           1       0.49      0.46      0.48       279

    accuracy                           0.73      1050
   macro avg       0.65      0.64      0.65      1050
weighted avg       0.72      0.73      0.73      1050



**Processing Perceived Severity Tweets...**

(4660,)
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       999
           1       0.36      0.31      0.34        51

    accuracy                           0.94      1050
   macro avg       0.66      0.64      0.65      1050
weighted avg       0.94      0.94      0.94      1050



**Processing Perceived Susceptibility Tweets...**

(4674,)
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1007
           1       0.37      0.40      0.38        43

    accuracy                           0.95      1050
   macro avg       0.67      0.68      0.68      1050
weighted avg       0.95      0.95      0.95      1050



In [176]:
#a test checking for feature importances
rf = RandomForestClassifier(n_jobs=-1)
ros = RandomOverSampler(random_state=777)
ros_xtrain_tfidf, ros_train_y = ros.fit_sample(x_train, train['Perceived Susceptibility'])
rf_model = rf.fit(ros_xtrain_tfidf, ros_train_y)
sorted(zip(rf_model.feature_importances_, x_train.columns), reverse=True)[0:10]

[(0.04700022803666276, 2099),
 (0.02165290384386741, 3711),
 (0.017560750780973078, 'MCPW'),
 (0.017559941844709766, 'UCC'),
 (0.017352601318697772, 3944),
 (0.01705589809579206, 'hasNum'),
 (0.013732540649165324, 'Pol'),
 (0.01308159696645004, 'DS'),
 (0.013031913418364384, 2325),
 (0.012212217417145019, 'PC')]