# Building Classifier to Label Tweets as hate/non-hate

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

### Load dataset:

In [2]:
tweets = pd.read_csv("/Users/mbp/Documents/NYU/Statistical Consulting/Project/tweets_withsent.csv")

### Create clean datasets for modeling:

In [3]:
df_clean = pd.DataFrame({"text": tweets['text'], "label": tweets['label']})
df_clean.shape

(5547, 2)

### Modeling:

In [4]:
# Create function to vectorize each tweet into separate words, using the term-freq, inverse document frequency method:
# Function requires column with text to be labeled as 'text':

def tfidfvec(data, label):
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    tf_vec = TfidfVectorizer(stop_words = 'english')
    tweets_label = [row['text'] for index,row in data.iterrows() if row['label'] == label]
    words_tweetslabel = tf_vec.fit_transform(tweets_label)
    tdm_label = pd.DataFrame(words_tweetslabel.toarray(), columns=tf_vec.get_feature_names())
    tdm_label['label'] = label
    return(tdm_label)

In [5]:
# Create function to vectorize each tweet into separate words, and create corpus of words for each class:
# Function requires column with text to be labeled as 'text':

def countvec(data, label):
    from sklearn.feature_extraction.text import CountVectorizer
    
    count_vec = CountVectorizer(stop_words = 'english')
    tweets_label = [row['text'] for index,row in data.iterrows() if row['label'] == label]
    words_tweetslabel = count_vec.fit_transform(pd.DataFrame(tweets_label)[0].values.astype(str))
    tdm_label = pd.DataFrame(words_tweetslabel.toarray(), columns=count_vec.get_feature_names())
    tdm_label['label'] = label
    return(tdm_label)

In [6]:
# Create term-document matricies for both classes (hate and non-hate):
tdm_hate = countvec(df_clean, 1)
tdm_nonhate = countvec(df_clean,0)

In [7]:
# Create data frame to be used for modeling:
model_df = pd.concat([tdm_hate, tdm_nonhate])
model_df = model_df.fillna(0)

In [8]:
model_df.shape

(5104, 10753)

### Training and Validation Sets:

In [9]:
# Create training (to be split into train-test splits), 
# and validation data (to test on completely out of sample data)

In [10]:
df_train = model_df.sample(n = 4000)
X = df_train.drop('label', axis = 1)
y = df_train.loc[:,'label']

val = model_df.drop(df_train.index)
X_val = val.drop('label', axis = 1)
y_val = val.loc[:,'label']

# Create train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify = y)
print('Train/Test Sizes : ', X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Train/Test Sizes :  (2800, 10752) (1200, 10752) (2800,) (1200,)


In [11]:
# Use SMOTE to oversample positive (hate) cases: 

from imblearn.over_sampling import SMOTE
sm = SMOTE(k_neighbors = 10)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

### Logistic Regression

In [12]:
# Fit logistic regression:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train_res, y_train_res)
log_preds = log_reg.predict(X_test)

cm_log = confusion_matrix(y_test, log_preds)
print(cm_log)
print(classification_report(y_test, log_preds))

[[848 195]
 [ 47 110]]
              precision    recall  f1-score   support

           0       0.95      0.81      0.88      1043
           1       0.36      0.70      0.48       157

    accuracy                           0.80      1200
   macro avg       0.65      0.76      0.68      1200
weighted avg       0.87      0.80      0.82      1200



In [13]:
roc_auc_score(y_test, log_preds)

0.7568381261793821

In [68]:
# Check performance on validation set:
preds_val = log_reg.predict(X_val)
cm_logval = confusion_matrix(y_val, preds_val)
print(cm_logval)
print(classification_report(y_val, preds_val))

[[719 158]
 [ 11  20]]
              precision    recall  f1-score   support

           0       0.98      0.82      0.89       877
           1       0.11      0.65      0.19        31

    accuracy                           0.81       908
   macro avg       0.55      0.73      0.54       908
weighted avg       0.96      0.81      0.87       908



In [69]:
roc_auc_score(y_val, preds_val)

0.7325008276014271

### Multinomial NB

In [16]:
# Fit Multinomial NB with laplace correction:
mnb1 = MultinomialNB(alpha=1)
mnb1.fit(X_train_res,y_train_res)
mnb_preds = mnb1.predict(X_test)

cm_mnb = confusion_matrix(y_test, mnb_preds)
print(cm_mnb)
print(classification_report(y_test, mnb_preds))

[[885 163]
 [ 56  96]]
              precision    recall  f1-score   support

           0       0.94      0.84      0.89      1048
           1       0.37      0.63      0.47       152

    accuracy                           0.82      1200
   macro avg       0.66      0.74      0.68      1200
weighted avg       0.87      0.82      0.84      1200



In [17]:
roc_auc_score(y_test, mnb_preds)

0.7380222981116914

In [18]:
mnb_val = mnb1.predict(X_val)
confusion_matrix(y_val, mnb_val)

array([[725, 140],
       [ 11,  14]])

In [19]:
roc_auc_score(y_val, mnb_val)

0.6990751445086705

### Complement NB

In [None]:
# Fit Complement NB with laplace correction to account for imbalanced data:
cnb1 = ComplementNB(alpha = 1)
cnb1.fit(X_train_res,y_train_res)
cnb_preds = cnb1.predict(X_test)

cm_cnb = confusion_matrix(y_test, cnb_preds)
print(cm_cnb)
print(classification_report(y_test, cnb_preds))

In [None]:
roc_auc_score(y_test, cnb_preds)

### Grid-search Complement NB

In [None]:
# Gridsearch CV ComplementNB 
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ],
         }

cnb_grid = GridSearchCV(ComplementNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
cnb_grid.fit(X_train_res, y_train_res)
cnb_gridpreds = cnb_grid.best_estimator_.predict(X_test)

print('Train Accuracy : %.3f'%cnb_grid.best_estimator_.score(X_train_res, y_train_res))
print('Test Accuracy : %.3f'%cnb_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%cnb_grid.best_score_)
print('Best Parameters : ',cnb_grid.best_params_)

In [None]:
cm_cnbgrid = confusion_matrix(y_test, cnb_gridpreds)
print(cm_cnbgrid)
print(classification_report(y_test, cnb_gridpreds))

In [None]:
roc_auc_score(y_test, cnb_gridpreds)

### Grid-search Multinomial NB

In [None]:
# Gridsearch CV MultinomialNB 

mnb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
mnb_grid.fit(X_train_res, y_train_res)
mnb_gridpreds = mnb_grid.best_estimator_.predict(X_test)

print('Train Accuracy : %.3f'%mnb_grid.best_estimator_.score(X_train_res, y_train_res))
print('Test Accuracy : %.3f'%mnb_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%mnb_grid.best_score_)
print('Best Parameters : ',mnb_grid.best_params_)

In [None]:
cm_mnbgrid = confusion_matrix(y_test, mnb_gridpreds)
print(cm_mnbgrid)
print(classification_report(y_test, mnb_gridpreds))

In [None]:
roc_auc_score(y_test, mnb_gridpreds)

### Forcing balanced dataset:

In [None]:
df1 = df_train[df_train['label']==0].sample(n=len(df_train[df_train['label']==1]))
df2 = df_train[df_train['label']==1]

In [None]:
balanced_df = pd.concat([df1,df2])
del df1, df2

In [None]:
# Create train/test split
X_bal = balanced_df.drop('label', axis = 1)
y_bal = balanced_df.loc[:,'label']
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_bal, y_bal, test_size=0.30, stratify = y_bal)
print('Train/Test Sizes : ', X_train_bal.shape, X_test_bal.shape, y_train_bal.shape, y_test_bal.shape)

In [None]:
log_reg2 = LogisticRegression()
log_reg2.fit(X_train_bal ,y_train_bal)
log_preds2 = log_reg2.predict(X_test_bal)
cm_log2 = confusion_matrix(y_test_bal, log_preds2)
print(cm_log2)
print(classification_report(y_test_bal, log_preds2))

In [None]:
roc_auc_score(y_test_bal, log_preds2)

In [None]:
logpreds2_val = log_reg2.predict(X_val)
confusion_matrix(y_val, logpreds2_val)

In [None]:
roc_auc_score(y_val,logpreds2_val)

In [None]:
cnb_grid2 = GridSearchCV(ComplementNB(), param_grid=params, n_jobs=-1, cv=10, verbose=5)
cnb_grid2.fit(X_train, y_train)
cnb_gridpreds2 = cnb_grid2.best_estimator_.predict(X_test)

print('Train Accuracy : %.3f'%cnb_grid2.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%cnb_grid2.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%cnb_grid2.best_score_)
print('Best Parameters : ',cnb_grid2.best_params_)

In [None]:
cm_cnbgrid2 = confusion_matrix(y_test, cnb_gridpreds2)
print(cm_cnbgrid2)
print(classification_report(y_test, cnb_gridpreds2))

In [None]:
roc_auc_score(y_test, cnb_gridpreds2)

In [None]:
# Plot AUC/ROC for Complement NB:

cnb2_roc_auc = roc_auc_score(y_test, cnb_gridpreds2)
fpr, tpr, thresholds = roc_curve(y_test, cnb_grid2.best_estimator_.predict_proba(X_test)[:,1])

plt.figure()
plt.plot(fpr, tpr, label='CNB (area = %0.2f)' % cnb2_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()