# **NLP Over a SXSW Twitter Data Set**

## Package and Data Import

*Package Imports*

In [1]:
### Packages to Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report
from sklearn.svm import SVC
from nltk.tokenize import RegexpTokenizer
import warnings
nltk.download('wordnet')
nltk.download('vader_lexicon')

pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package wordnet to /Users/diego/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/diego/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


*Importing CSV File*

In [2]:
### Import Data and Change Column Names

df = pd.read_csv("data/tweets.csv")
df.columns = ['text', 'device', 'emotion']

## Pre-Data Cleaning Column Addition

### Create column that counts the amount of mentions in a tweet
0.0 --> 4173 | 1.0 --> 3251 | 2.0 --> 1218 | 3.0 --> 339 | 4.0 --> 80 | 5.0 --> 22 | 6.0 --> 5 | 7.0 --> 2 | 8.0 --> 2

In [3]:
df['mentions'] = df.text.str.count('@')

### Create a column that counts the amount of links in a tweet

We are also considering anything with '.com', 'http', 'bit.ly', '.co', and '{link}' as a link

0 --> 4807 | 1 --> 4086 | 2 --> 186 | 3 --> 186 | 4 --> 11 

In [4]:
df['links'] = 0

url_like_strings = ['{link}', '.com', 'http', 'bit.ly', '.co']
for s in url_like_strings:
    df['links'] = df.links + list(map(lambda x: str(x).count(s), df['text']))

In [5]:
df.links.value_counts()

0    4807
1    4068
2     186
3      21
4      11
Name: links, dtype: int64

## Data Cleaning

**Dropping non-significant values and rows**

In [6]:
### Dropping 'I can't tell' and 'Other' rows

df = df[df['emotion'] != "I can't tell"]

### Dropping blank 'text' rows

df = df.dropna(subset=['text'])

**Creating functions that clean the text data**

In [7]:
### Creating a function that makes all text lowercase for further analysis

def lower_case(text):
    text = text.lower()
    return text

### Creating a function that removes the use of via in the context of via hashtag or via mention (removes 80% of vias)

def remove_via(text):
    
    if 'via @' in text or 'via #' in text:
        text = text.replace('via', '')
    return text
    
### Creating a function that removes errant html syntax from the tweet (e.g. &amp; and &quot;)

def remove_html(text):
    
    text = text.replace('&amp;', '')
    text = text.replace('&quot;', '')
    return text

### Creating a function that removes urls or instances of '{link}' from the tweet

def remove_url(text):
    
    url_like_strings = ['{link}', '.com', 'http', 'bit.ly', '.co']
    text = text.split()
    for s in url_like_strings:
        text = [word for word in text if s not in word]
    text = ' '.join(text)
    return text

### Creating a function that removes words that contain a @ and rt (retweet) 
### as mentions would not be important in determining the emotion of a tweet

def remove_at_and_rt(text):
    text = text.split()
    text = [word for word in text if '@' not in word]
    text = [word for word in text if word != 'rt']
    text = ' '.join(text)
    return text

### Creating a function that removes '#SXSW' of any case type from the text

def remove_sxsw(text):
    text = text.split()
    text = [word for word in text if '#sxsw' not in word]
    text = ' '.join(text)
    return text

### Creating a function that uses a regex tokenizer to remove punctuation but ignores contraction apostrophes

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+\'?\w+')
    text = tokenizer.tokenize(text)
    text = ' '.join(text)
    return text

### Creating a function that removes stopwords from a specified list of stopwords

custom_stop_words = ['in','of','at','a','the']

def remove_stopwords(text, stop_words_list = set(stopwords.words('english'))):
    text = text.split()
    text = [word for word in text if word not in stop_words_list]
    text = ' '.join(text)
    return text

### Creating a function that removes non-ASCII characters

def remove_characters(text):
    text = text.encode('ascii', 'ignore').decode('ascii')
    return text

### Creating a function that lemmatizes words

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

### Creating a function that combines all of the above functions

def clean_text(text):
    text = lower_case(text)
    text = remove_via(text)
    text = remove_html(text)
    text = remove_url(text)
    text = remove_at_and_rt(text)
    text = remove_sxsw(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = remove_characters(text)
    text = lemmatize(text)
    return text

In [8]:
df.text[60]

"&quot;via @mention : {link} Guy Kawasaki talks 'Enchanted' at SXSW - HE knows his stuff! #books #internet #Apple #sxsw  &quot;"

In [9]:
print(clean_text(df['text'][60]))

guy kawasaki talk enchanted sxsw know stuff book internet apple


In [10]:
### Applying the clean_text function to the 'text' column and creating a santiy check csv

df_clean = df.copy()
df_clean['text'] = df_clean['text'].apply(clean_text)
df_clean.to_csv('data/cleaned_tweets.csv')

## Feature Engineering

**Creating 'device_type'**

In [13]:
### Deliniating between Google and Apple

google_tweets = ['Google', 'Other Google product or service', 'Andriod App', 'Andriod']
apple_tweets = ['Apple', 'Other Apple product or service', 'Apple App', 'iPhone', 'iPad', 'iPad or iPhone App']

### Creating a new column for google vs. apple vs. unknown

df['device_type'] = np.where(df['device'].isin(google_tweets), 'Google', 
                    np.where(df['device'].isin(apple_tweets), 'Apple', 
                             'Unknown'))

**Approximating 'Company' Values based off of 'text' and 'device_type'**

In [14]:
### Creating a new column for 'Google' and 'Apple' based on device type and key words in the 'text' column

google_key_words = ["Google", "Android", "Pixel", "Circles", "Droid", "Galaxy S", "Realtime", "Maps", "Google Maps", "Circle" ]

apple_key_words = ["Apple", "iPhone", "iPad", "Mac", "iMac", "iPod", "iTunes", "iWatch", "iMessage", "iCloud", "iBook", "iMac", 
                   "app_store", "app store", "ios", "ios4", "ios4.1", "ios4.2", "iphone app", "3g", "ios"]
                  
df['Google'] = np.where(df['device_type'] == 'Google', True, 
               np.where(df['text'].str.lower().str.contains('|'.join(google_key_words), case=False), True, 
               False))

df['Apple'] = np.where(df['device_type'] == 'Apple', True,
              np.where(df['text'].str.lower().str.contains('|'.join(apple_key_words), case=False), True,
              False))

### Create new column 'both' that is true if both Google and Apple are true

df['both'] = np.where((df['Google'] == True) & (df['Apple'] == True), True, False)

### Dropping rows where both Google and Apple are true and where Google and Apple are both false

df = df[df['both'] == False]
df = df[df['Google'] != df['Apple']]
df = df.drop(columns=['both'])


**Establishing a VADER Sentiment Score**

In [15]:
### Creating new columns in the dataframe which append 'pos', 'neg', and 'neu' using VADER sentiment analysis

sid = SentimentIntensityAnalyzer()
df['sentiment'] = df['text'].apply(lambda x: sid.polarity_scores(x))
df = pd.concat([df.drop(['sentiment'], axis=1), df['sentiment'].apply(pd.Series)], axis=1)

**Creating an 'emphasis' column that scores how many exclamation points, question marks, and capital letters are in the text**

In [16]:
### Creating an 'punc_emphasis' column that scores how many exclamation points and question marks are in the text

df['punc_emphasis'] = df['text'].apply(lambda x: sum([1 for char in x if char in ['!', '?']]))

### Creating a 'capt_emphasis' column that scores how many capitalized words are in the text

df['capt_emphasis'] = df['text'].apply(lambda x: sum([1 for word in x.split() if word.isupper()]))

## Modeling

**First Simple Model - Count Vectorizer / Decision Tree / No Features**

In [17]:
### Performing a train test split on the data, only including the 'text' and 'emotion' columns

X1 = df['text']
y1 = df['emotion']

### Adding the tokenizer to the 'text' column in the X features

X1 = X1.apply(clean_text)

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X1, y1, test_size=0.2, random_state=1337)


### Fitting the pipeline to the training data and printing the training and validation accuracy scores

baseline.fit(X_train_1, y_train_1)
print('Training Accuracy Score:', baseline.score(X_train_1, y_train_1))
print('Validation Accuracy Score:', cross_val_score(baseline, X_train_1, y_train_1, cv=5).mean())

### Generate a confusion matrix on the validation data and plot with seaborn

y_pred_1 = baseline.predict(X_test_1)
cm = confusion_matrix(y_test_1, y_pred_1)
cm_df = pd.DataFrame(cm, columns=['Predicted Negative', 'Predicted Neutral', 'Predicted Positive'], index=['Actual Negative', 'Actual Neutral', 'Actual Positive'])
plt.figure(figsize=(, 10))
sns.heatmap(cm_df, annot=True, fmt='g', cmap='Blues')
plt.title('Decision Tree Confusion Matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

SyntaxError: invalid syntax (3727480503.py, line 24)

**Second Model - Count Vectorizer / Support Vector Machine / Added Sentiment and Emphasis Score**

In [None]:
### Performing a train test split on the data, including 'text', VADER scores, and 'emphasis' columns for the X features

X2 = df.drop(columns=['emotion', 'device', 'device_type', 'Google', 'Apple'])
y2 = df['emotion']

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X2, y2, test_size=0.2, random_state=1337)

### Applying the tokenizer to the 'text' column in the X features

X_train_2['text'] = X_train_2['text'].apply(clean_text)

### Creating an imbpalance-learn pipeline that uses SMOTE to oversample the minority classes, vectorizes the data, and then uses a Support Vector Machine to predict the emotion of a tweet

baseline2 = imbpipeline([
    ('smote', SMOTE(sampling_strategy='minority', random_state=1337)),
    ('cvec', CountVectorizer(encoding = 'iso-8859-1', lowercase = False)),
    ('svm', SVC(random_state=1337))
])

### Fitting the pipeline to the training data and printing the training and validation accuracy scores

baseline2.fit(X_train_2, y_train_2)
print('Training Accuracy Score:', baseline2.score(X_train_2, y_train_2))
print('Validation Accuracy Score:', cross_val_score(baseline2, X_train_2, y_train_2, cv=5).mean())

### Generate a confusion matrix on the validation data and plot with seaborn

y_pred_2 = baseline2.predict(X_test_2)
cm = confusion_matrix(y_test_2, y_pred_2)
cm_df = pd.DataFrame(cm, columns=['Predicted Negative', 'Predicted Neutral', 'Predicted Positive'], index=['Actual Negative', 'Actual Neutral', 'Actual Positive'])
plt.figure(figsize=(8, 8))
sns.heatmap(cm_df, annot=True, fmt='g', cmap='Blues')
plt.title('SVM Confusion Matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()





In [None]:
print(type(X_train_2))
print(type(y_train_2))

**Test Data - Transforming**

In [None]:
### Applying preprocessing to the test data

X_test['text'] = X_test['text'].apply(clean_text)
X_test['text'] = X_test['text'].apply(lambda x: ' '.join(x))

### Transforming the test data

X_test_vectorized = tfidf.transform(X_test['text'])

**Test Data - Comparing Model Scores**

In [None]:
### Fitting the test data to all models developed above

rfc_pred = rfc.predict(X_test_vectorized)
nb_pred = nb.predict(X_test_vectorized)
lr_pred = lr.predict(X_test_vectorized)

### Print the accuracy, precision, recall, and f1 score for each model

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Random Forest Classifier')
print('Accuracy: ', accuracy_score(y_test, rfc_pred))
print('Precision: ', precision_score(y_test, rfc_pred, average='weighted'))
print('Recall: ', recall_score(y_test, rfc_pred, average='weighted'))
print('F1 Score: ', f1_score(y_test, rfc_pred, average='weighted'))
print('-------------------------------------------------')
print('Naive Bayes Classifier')
print('Accuracy: ', accuracy_score(y_test, nb_pred))
print('Precision: ', precision_score(y_test, nb_pred, average='weighted'))
print('Recall: ', recall_score(y_test, nb_pred, average='weighted'))
print('F1 Score: ', f1_score(y_test, nb_pred, average='weighted'))
print('-------------------------------------------------')
print('Logistic Regression Classifier')
print('Accuracy: ', accuracy_score(y_test, lr_pred))
print('Precision: ', precision_score(y_test, lr_pred, average='weighted'))
print('Recall: ', recall_score(y_test, lr_pred, average='weighted'))
print('F1 Score: ', f1_score(y_test, lr_pred, average='weighted'))

### Print the confusion matrix for each model using seaborn's heatmap

fig, ax = plt.subplots(1, 3, figsize=(15, 5))
sns.heatmap(confusion_matrix(y_test, rfc_pred), annot=True, ax=ax[0], fmt='d')
ax[0].set_title('Random Forest Classifier')
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(confusion_matrix(y_test, nb_pred), annot=True, ax=ax[1], fmt='d')
ax[1].set_title('Naive Bayes Classifier')
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
sns.heatmap(confusion_matrix(y_test, lr_pred), annot=True, ax=ax[2], fmt='d')
ax[2].set_title('Logistic Regression Classifier')
ax[2].set_xlabel('Predicted')
ax[2].set_ylabel('Actual')
plt.show()
