## Obtaining Data

In [13]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [14]:
data = pd.read_csv('data.csv',encoding='utf-8')
data = data.astype(str)
tweets = pd.DataFrame(data['tweet_text'])

## Scrubbing/Cleaning Data

### DataFrame treatment

In [15]:
#preview of data
data.head(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion


In [16]:
#stats on data
data.describe()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
count,9288.0,9288.0,9288
unique,9168.0,10.0,5
top,,,No emotion toward brand or product
freq,27.0,5997.0,5389


In [24]:
# Get names of indexes for which column Age has value 30
indexNames = data[data['emotion_in_tweet_is_directed_at'] == 'nan' ].index
# Delete these row indexes from dataFrame
data.drop(indexNames , inplace=True)

data.dropna()


Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9272,@mention your PR guy just convinced me to swit...,iPhone,Positive emotion
9274,&quot;papyrus...sort of like the ipad&quot; - ...,iPad,Positive emotion
9275,Diller says Google TV &quot;might be run over ...,Other Google product or service,Negative emotion
9280,I've always used Camera+ for my iPhone b/c it ...,iPad or iPhone App,Positive emotion


In [55]:
data.rename(columns={'emotion_in_tweet_is_directed_at':'Brand','is_there_an_emotion_directed_at_a_brand_or_product':'Sentiment'})

Unnamed: 0,tweet_text,Brand,Sentiment
0,wesley g iphon hr tweet rise austin dead need ...,iPhone,Negative emotion
1,jessed know fludapp awesom ipad iphon app like...,iPad or iPhone App,Positive emotion
2,swonderlin wait ipad also sale sxsw,iPad,Positive emotion
3,sxsw hope year festiv crashi year iphon app sxsw,iPad or iPhone App,Negative emotion
4,sxtxstate great stuff fri sxsw marissa mayer g...,Google,Positive emotion
...,...,...,...
9272,mention pr guy convinc switch back iphon great...,iPhone,Positive emotion
9274,quot papyru sort like ipad quot nice lol sxsw ...,iPad,Positive emotion
9275,diller say googl tv quot might run playstat xb...,Other Google product or service,Negative emotion
9280,alway use camera iphon b c imag stabil mode su...,iPad or iPhone App,Positive emotion


In [56]:
#stats without nan in tweet column
data.describe()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
count,3291,3291,3291
unique,3247,9,4
top,rt mention marissa mayer googl connect digit a...,iPad,Positive emotion
freq,4,946,2672


In [57]:
#Changing column info to assist with encoding later
product_dict = {"iPhone":'Apple','iPad or iPhone App': 'Apple','iPad':'Apple','nan':'none','Android':'Google','Android App':'Google','Other Google product or service':'Google','Other Apple product or service':'Apple'}
sentiment_dict = {'Negative emotion': 'Negative','Positive emotion':'Positive','No emotion toward brand or product':'Neutral', "I can't tell": 'Neutral'}
data = data.replace({"Brand": product_dict})
data = data.replace({'Sentiment':sentiment_dict})


In [58]:
data.head(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,wesley g iphon hr tweet rise austin dead need ...,iPhone,Negative emotion
1,jessed know fludapp awesom ipad iphon app like...,iPad or iPhone App,Positive emotion
2,swonderlin wait ipad also sale sxsw,iPad,Positive emotion
3,sxsw hope year festiv crashi year iphon app sxsw,iPad or iPhone App,Negative emotion
4,sxtxstate great stuff fri sxsw marissa mayer g...,Google,Positive emotion
7,sxsw start ctia around corner googleio hop ski...,Android,Positive emotion
8,beauti smart simpl idea rt madebymani thenextw...,iPad or iPhone App,Positive emotion
9,count day sxsw plu strong canadian dollar mean...,Apple,Positive emotion
10,excit meet samsungmobileu sxsw show sprint gal...,Android,Positive emotion
11,find amp start impromptu parti sxsw hurricanep...,Android App,Positive emotion


In [59]:
data.describe()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
count,3291,3291,3291
unique,3247,9,4
top,rt mention marissa mayer googl connect digit a...,iPad,Positive emotion
freq,4,946,2672


### Twitter data

In [60]:
data.head(10) #preview of what data looks like
tweets = data['tweet_text']
len(tweets)

3291

In [61]:
data.describe()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
count,3291,3291,3291
unique,3247,9,4
top,rt mention marissa mayer googl connect digit a...,iPad,Positive emotion
freq,4,946,2672


In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for tweet in tweets:
    tweet = re.sub('[^a-zA-Z]', ' ', tweet)
    tweet = tweet.lower()
    tweet = tweet.split()
    ps = PorterStemmer()
    tweet = [ps.stem(word) for word in tweet if not word in set(stopwords.words('english'))]
    tweet = ' '.join(tweet)
    corpus.append(tweet)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jamaalsmith/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
data['tweet_text'] = corpus #so that dataframe has cleaned tweets

In [None]:
#Creation of Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 3000)
X = cv.fit_transform(corpus).toarray()
y = data.iloc[:, 1].values

In [None]:
##Tokenize data and then generate FreqDist
from nltk import word_tokenize
tokens = word_tokenize(','.join(map(str,corpus)))

In [None]:
#FreqDist
from nltk import FreqDist
freq = FreqDist(tokens)
most_used_words = freq.most_common(100)
most_used_words = pd.DataFrame(most_used_words, columns=['word','count'])
most_used_words.reset_index(drop=True,inplace=True)


## Exploratory Data Analysis

In [None]:
top10_words = most_used_words[:10]
top10_words.head()

plt.figure(figsize=(10,5))
sns.barplot(x='word',y='count',data=top10_words)
plt.title('Top 10 Common Phrases')

Based on the top twenty words found in our corpus, I conducted googles on some of these phrases to better understand when this data was collected. When I googled sxtxstate, I [found the following information](https://twitter.com/sxtxstate). With the discovery that Texas State's grad school program for communications has a hashtag dedicated to its coverage of the annual SXSW festival, we can assume that most of the tweets are from individuals that can be considered heavy users of technology. 

With the knowledge that these tweets were centered around the SXSW festival, I then noticed that what appeared to be two sets of names were in the top twenty list. One name appears to be Marissa Mayer and the other was Tim Reilli. I performed another Google and [found information about the following event in 2011.](https://www.mediabullseye.com/2011/03/sxsw-update-chasing-the-ideas-of-innovation/) This information further assists us with learning about our the author of the tweets we are reviewing because both presenters spoke to advances that their respective organizations had made with location based services at this forum.

I then examined the events that Google had planned for the 2011 SXSW festival. I undertook this query because I noticed that Google was in the top 20 most frequently used words. During the 2011 SXSW event, [Google had a plethora of events that touched on subjects such as recommendation engines and hybrid marketing.](https://googleblog.blogspot.com/2011/03/google-at-sxsw-2011-austin-here-we-come.html) Based on this quick research, one can assume that tweets related to Google will be commenting on the new technologies the firm was presenting at this conference.

The other name that appeared frequently was Matt Mullenwerg, creator of Wordpress. During the 2011 festival, this gentleman spoke about a new venture called JetPack.

Finally, it is worth noting some initial impressions about the tweet's larger context. After reading some of the tweets before preprocessing, I noticed that some of the tweets related to Apple appeared to be focused on the user experience people were having with apple products at SXSW. I then noticed that like Google, at this time, Apple was launching its iPad2. Thus, it is safe to assume that tweets would be a good reflection the sentiment that festival goers had related to these launches.

***

This analyis will not simply report back whether more individuals favored one company to another. Instead, its findings can provide insight into how users of their product's found their latest offerings when first presented with them at a technology conference. 


### Popularity of the Two Brands

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x="Brand",data=data)
plt.title('Popularity of Company based on Tweets')
plt.xlabel('Company')

Despite only having two inputs (Apple & Other Apple product or service) when compared to Google that had four inputs (Google,Android, Android App, Other Google product or service), Apple was discussed more in the tweets according to the data.

One might conclude that the launch of the iPad2 was a major event that Apple marketed well and built suspense amongst the tech community.

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='emotion_in_tweet_is_directed_at',hue='is_there_an_emotion_directed_at_a_brand_or_product',data=data)

In [None]:
data.columns

In [None]:
#from textblob import textblob
#data['polarity'] = data['tweet_text'].map(lambda text: TextBlob(text).sentiment.polarity)
data['review_len'] = data['tweet_text'].astype(str).apply(len)
data['word_count'] = data['tweet_text'].apply(lambda x: len(str(x).split()))

In [None]:
data.head()

## Modeling