# **NLP Over a SXSW Twitter Data Set**

## Package and Data Import

*Package Imports*

In [34]:
### Packages to Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.compose import ColumnTransformer
import warnings
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package wordnet to /Users/diego/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/diego/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

*Importing CSV File*

In [35]:
### Import Data and Change Column Names

df = pd.read_csv("data/tweets.csv")
df.columns = ['text', 'device', 'emotion']

## Data Cleaning

**Dropping non-significant values and rows**

In [36]:
### Dropping 'I can't tell' and 'Other' rows

df = df[df['emotion'] != "I can't tell"]

### Dropping blank 'text' rows

df = df.dropna(subset=['text'])

**Creating functions that clean the text data**

In [37]:
### Creating a function that removes words that begin with @, as mentions would not be important in determining the emotion of a tweet

def remove_at(text):
    text = text.split()
    text = [word for word in text if not word.startswith('@')]
    text = ' '.join(text)
    return text

### Creating a function that makes all text lowercase for further analysis

def lower_case(text):
    text = text.lower()
    return text

### Creating a function that removes all punctuation with the exception of ! and ? as they may be important in determining the emotion of a tweet

def remove_punctuation(text):
    text = text.split()
    text = [word for word in text if not word.startswith('!') and not word.startswith('?')]
    text = ' '.join(text)
    return text

### Creating a function that removes stopwords from a specified list of stopwords

custom_stop_words = ['in','of','at','a','the']

def remove_stopwords(text):
    text = text.split()
    text = [word for word in text if word not in custom_stop_words]
    text = ' '.join(text)
    return text

### Creating a function that removes non-ASCII characters

def remove_characters(text):
    text = text.encode('ascii', 'ignore').decode('ascii')
    return text

### Creating a function that lemmatizes words

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

### Creating a function takes the tokenized text and returns a string of words

def tokenize_to_string(text):
    tknzr = TweetTokenizer()
    text = tknzr.tokenize(text)
    text = ' '.join(text)
    return text

### Creating a function that removes '#SXSW' of any case type from the text

def remove_sxsw(text):
    text = text.split()
    text = [word for word in text if not word.startswith('#sxsw') and not word.startswith('#SXSW')]
    text = ' '.join(text)
    return text

### Creating a function that combines all of the above functions

def clean_text(text):
    text = remove_at(text)
    text = lower_case(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = remove_characters(text)
    text = lemmatize(text)
    text = tokenize_to_string(text)
    text = remove_sxsw(text)
    return text

## Feature Engineering

**Creating 'device_type'**

In [38]:
### Deliniating between Google and Apple

google_tweets = ['Google', 'Other Google product or service', 'Andriod App', 'Andriod']
apple_tweets = ['Apple', 'Other Apple product or service', 'Apple App', 'iPhone', 'iPad', 'iPad or iPhone App']

### Creating a new column for google vs. apple vs. unknown

df['device_type'] = np.where(df['device'].isin(google_tweets), 'Google', 
                    np.where(df['device'].isin(apple_tweets), 'Apple', 
                             'Unknown'))

**Approximating 'Company' Values based off of 'text' and 'device_type'**

New dataFrame has 7943 rows

Apple: 5241

Google: 2702

In [39]:
### Creating a new column for 'Google' and 'Apple' based on device type and key words in the 'text' column

google_key_words = ["Google", "Android", "Pixel", "Circles", "Droid", "Galaxy S", "Realtime", "Maps", "Google Maps", "Circle" ]

apple_key_words = ["Apple", "iPhone", "iPad", "Mac", "iMac", "iPod", "iTunes", "iWatch", "iMessage", "iCloud", "iBook", "iMac", "app_store", "app store", "ios", "ios4", "ios4.1", "ios4.2", "iphone app", "3g", "ios"]
                  
df['Google'] = np.where(df['device_type'] == 'Google', True, 
               np.where(df['text'].str.lower().str.contains('|'.join(google_key_words), case=False), True, 
               False))

df['Apple'] = np.where(df['device_type'] == 'Apple', True,
              np.where(df['text'].str.lower().str.contains('|'.join(apple_key_words), case=False), True,
              False))

### Create new column 'both' that is true if both Google and Apple are true

df['both'] = np.where((df['Google'] == True) & (df['Apple'] == True), True, False)

### Dropping rows where both Google and Apple are true and where Google and Apple are both false

df = df[df['both'] == False]
df = df[df['Google'] != df['Apple']]
df = df.drop(columns=['both'])


**Establishing a VADER Sentiment Score**

In [40]:
### Creating new columns in the dataframe which append 'pos', 'neg', and 'neu' using VADER sentiment analysis

sid = SentimentIntensityAnalyzer()
df['sentiment'] = df['text'].apply(lambda x: sid.polarity_scores(x))
df = pd.concat([df.drop(['sentiment'], axis=1), df['sentiment'].apply(pd.Series)], axis=1)

In [41]:
df.head()

Unnamed: 0,text,device,emotion,device_type,Google,Apple,neg,neu,pos,compound
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion,Apple,False,True,0.203,0.797,0.0,-0.68
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion,Apple,False,True,0.0,0.576,0.424,0.91
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion,Apple,False,True,0.0,1.0,0.0,0.0
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion,Apple,False,True,0.0,0.663,0.337,0.7269
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion,Google,True,False,0.0,0.796,0.204,0.6249


**Creating an 'emphasis' column that scores how many exclamation points, question marks, and capital letters are in the text**

In [42]:
### Creating an 'emphasis' column that scores how many exclamation points, question marks, and capital letters are in the text

df['emphasis'] = df['text'].apply(lambda x: sum([1 for char in x if char in ['!', '?']])) + \
                 df['text'].apply(lambda x: sum([1 for char in x if char.isupper()]))

In [43]:
pd.set_option('display.max_colwidth', None)

## Modeling

**First Simple Model - Count Vectorizer / Decision Tree / No Features**

In [47]:
### Performing a train test split on the data, only including the 'text' and 'emotion' columns

X1 = df['text']
y1 = df['emotion']

### Adding the tokenizer to the 'text' column in the X features

X1 = X1.apply(clean_text)

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X1, y1, test_size=0.2, random_state=1337)

### Creating an imbpalance-learn pipeline that uses SMOTE to oversample the minority classes and then uses a Decision Tree to predict the emotion of a tweet

baseline = imbpipeline([
    ('cvec', CountVectorizer(encoding = 'iso-8859-1', lowercase = False)),
    ('smote', SMOTE(sampling_strategy='minority', random_state=1337)),
    ('dt', DecisionTreeClassifier(random_state=1337, max_depth=5))
])

### Fitting the pipeline to the training data and printing the training and validation accuracy scores

baseline.fit(X_train_1, y_train_1)
print('Training Accuracy Score:', baseline.score(X_train_1, y_train_1))
print('Validation Accuracy Score:', cross_val_score(baseline, X_train_1, y_train_1, cv=5).mean())
print('Test Accuracy Score:', cross_val_score(baseline, X_test_1, y_test_1, cv=5).mean())

### Import confusion_matrix and print the confusion matrix for the validation data

from sklearn.metrics import confusion_matrix

y_pred_1 = baseline.predict(X_test_1)
confusion_matrix(y_test_1, y_pred_1)



Training Accuracy Score: 0.43877872206484103
Validation Accuracy Score: 0.46113271836299774
Test Accuracy Score: 0.45938733805527454


array([[ 78,  38,   0],
       [312, 565,  12],
       [237, 325,  22]])

In [62]:
cvec = CountVectorizer(encoding = 'iso-8859-1', lowercase = False)
t_fit = cvec.fit_transform(df.text)
t_fit

<1x11225 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [45]:
df.head()

Unnamed: 0,text,device,emotion,device_type,Google,Apple,neg,neu,pos,compound,emphasis
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion,Apple,False,True,0.203,0.797,0.0,-0.68,16
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion,Apple,False,True,0.0,0.576,0.424,0.91,11
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion,Apple,False,True,0.0,1.0,0.0,0.0,7
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion,Apple,False,True,0.0,0.663,0.337,0.7269,2
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion,Google,True,False,0.0,0.796,0.204,0.6249,14


**Second Model - Count Vectorizer / Logistic Regression / Added Sentiment and Emphasis Score**

In [50]:
### Performing a train test split on the data, including 'text', VADER scores, and 'emphasis' columns
X2 = df.drop(columns=['emotion', 'device', 'device_type', 'Google', 'Apple'])
y2 = df['emotion']

### Applying the tokenizer to the 'text' column in the X features

X2['text'] = X2['text'].apply(clean_text)

### Performing a train test split on the X2 and Y2 data

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X2, y2, test_size=0.2, random_state=1337)

### Creating an imbpalance-learn pipeline that uses SMOTE to oversample the minority classes and then uses a Logistic Regression to predict the emotion of a tweet

# baseline2 = imbpipeline([
#     ('cvec', CountVectorizer(encoding = 'iso-8859-1', lowercase = False)),
#     ('smote', SMOTE(sampling_strategy='minority', random_state=1337)),
#     ('lr', LogisticRegression(random_state=1337, max_iter=1000))
# ])



### Fitting the pipeline to the training data and printing the training and validation accuracy scores

# baseline2.fit(X_train_2, y_train_2)
# print('Training Accuracy Score:', baseline2.score(X_train_2, y_train_2))
# print('Validation Accuracy Score:', cross_val_score(baseline2, X_train_2, y_train_2, cv=5).mean())



In [51]:
print(X_train_2.shape)
print(y_train_2.shape)

(6354, 6)
(6354,)


In [52]:
X_train_2

Unnamed: 0,text,neg,neu,pos,compound,emphasis
2687,google map app save my life on regular basis rt map ha 150 million users . 40 % user are mobile,0.000,1.000,0.000,0.0000,4
5028,"rt google to launch major new social network called circles , possibly today { link }",0.000,1.000,0.000,0.0000,12
3215,nice being able to use just usb charging cord with out plug to add more juice to my iphone . #dfw,0.000,0.882,0.118,0.4215,11
7888,"so jealous who is team #android event #androidsxsw . get some swag , girl ! ! !",0.208,0.792,0.000,-0.6663,7
796,"google to launch major new social network called circles , possibly today - { link } via",0.000,1.000,0.000,0.0000,10
...,...,...,...,...,...,...
3739,"tried to initiate carpooling ridonkulous taxi line , geek all silent . i bet everyone would do it if there wa an iphone app for it .",0.055,0.945,0.000,-0.1027,3
1458,oh snap ! ! ! an 80 party hosted by google !,0.000,0.699,0.301,0.5951,13
992,brown paper on window a line grows popup apple store by gold's gym sixth & congress . { link },0.000,1.000,0.000,0.0000,13
218,hobo with shotgun iphone game { link },0.000,1.000,0.000,0.0000,17


**Fitting the training data to a Logistic Regressions Classifier**

In [13]:
### Fitting the training data to a logistic regression classifier with {'C': 100, 'penalty': 'l2'}

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=100, penalty='l2', random_state=1337, max_iter=1000)
lr.fit(X_train_vectorized, y_train)

### Cross validating the model

cross_val_score(lr, X_train_vectorized, y_train, cv=5)

NameError: name 'X_train_vectorized' is not defined

**Test Data - Transforming**

In [63]:
### Applying preprocessing to the test data

X_test['text'] = X_test['text'].apply(clean_text)
X_test['text'] = X_test['text'].apply(lambda x: ' '.join(x))

### Transforming the test data

X_test_vectorized = tfidf.transform(X_test['text'])

NameError: name 'X_test' is not defined

**Test Data - Comparing Model Scores**

In [None]:
### Fitting the test data to all models developed above

rfc_pred = rfc.predict(X_test_vectorized)
nb_pred = nb.predict(X_test_vectorized)
lr_pred = lr.predict(X_test_vectorized)

### Print the accuracy, precision, recall, and f1 score for each model

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Random Forest Classifier')
print('Accuracy: ', accuracy_score(y_test, rfc_pred))
print('Precision: ', precision_score(y_test, rfc_pred, average='weighted'))
print('Recall: ', recall_score(y_test, rfc_pred, average='weighted'))
print('F1 Score: ', f1_score(y_test, rfc_pred, average='weighted'))
print('-------------------------------------------------')
print('Naive Bayes Classifier')
print('Accuracy: ', accuracy_score(y_test, nb_pred))
print('Precision: ', precision_score(y_test, nb_pred, average='weighted'))
print('Recall: ', recall_score(y_test, nb_pred, average='weighted'))
print('F1 Score: ', f1_score(y_test, nb_pred, average='weighted'))
print('-------------------------------------------------')
print('Logistic Regression Classifier')
print('Accuracy: ', accuracy_score(y_test, lr_pred))
print('Precision: ', precision_score(y_test, lr_pred, average='weighted'))
print('Recall: ', recall_score(y_test, lr_pred, average='weighted'))
print('F1 Score: ', f1_score(y_test, lr_pred, average='weighted'))

### Print the confusion matrix for each model using seaborn's heatmap

fig, ax = plt.subplots(1, 3, figsize=(15, 5))
sns.heatmap(confusion_matrix(y_test, rfc_pred), annot=True, ax=ax[0], fmt='d')
ax[0].set_title('Random Forest Classifier')
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(confusion_matrix(y_test, nb_pred), annot=True, ax=ax[1], fmt='d')
ax[1].set_title('Naive Bayes Classifier')
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
sns.heatmap(confusion_matrix(y_test, lr_pred), annot=True, ax=ax[2], fmt='d')
ax[2].set_title('Logistic Regression Classifier')
ax[2].set_xlabel('Predicted')
ax[2].set_ylabel('Actual')
plt.show()
