## Obtaining Data

In [81]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [82]:
data = pd.read_csv('data.csv',encoding='utf-8')
data = data.astype(str)
tweets = pd.DataFrame(data['tweet_text'])

## Scrubbing/Cleaning Data

### DataFrame treatment

In [104]:
#preview of data
data.head(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Apple,Negative
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Apple,Positive
2,@swonderlin Can not wait for #iPad 2 also. The...,Apple,Positive
3,@sxsw I hope this year's festival isn't as cra...,Apple,Negative
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive
5,@teachntech00 New iPad Apps For #SpeechTherapy...,none,Neutral
6,,none,Neutral
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive
8,Beautifully smart and simple idea RT @madebyma...,Apple,Positive
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive


In [105]:
#stats on data
data.describe()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
count,9288.0,9288,9288
unique,9168.0,7,5
top,,none,Neutral
freq,27.0,5997,5389


In [106]:
# Get names of indexes for which column Age has value 30
indexNames = data[data['tweet_text'] == 'nan' ].index
# Delete these row indexes from dataFrame
data.drop(indexNames , inplace=True)


In [107]:
#stats without nan in tweet column
data.describe()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
count,9261,9261,9261
unique,9167,7,5
top,{link},none,Neutral
freq,20,5970,5388


In [108]:
#Changing column info to assist with encoding later
product_dict = {"iPhone":'Apple','iPad or iPhone App': 'Apple','iPad':'Apple','nan':'none'}
sentiment_dict = {'Negative emotion': 'Negative','Positive emotion':'Positive','No emotion toward brand or product':'Neutral'}
data = data.replace({"emotion_in_tweet_is_directed_at": product_dict})
data = data.replace({'is_there_an_emotion_directed_at_a_brand_or_product':sentiment_dict})


In [103]:
data.head(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Apple,Negative
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Apple,Positive
2,@swonderlin Can not wait for #iPad 2 also. The...,Apple,Positive
3,@sxsw I hope this year's festival isn't as cra...,Apple,Negative
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive
5,@teachntech00 New iPad Apps For #SpeechTherapy...,none,Neutral
6,,none,Neutral
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive
8,Beautifully smart and simple idea RT @madebyma...,Apple,Positive
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive


### Twitter data

In [87]:
data.head(10) #preview of what data looks like

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Apple,Negative
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Apple,Positive
2,@swonderlin Can not wait for #iPad 2 also. The...,Apple,Positive
3,@sxsw I hope this year's festival isn't as cra...,Apple,Negative
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive
8,Beautifully smart and simple idea RT @madebyma...,Apple,Positive
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive


In [93]:
# Approach influenced by Machine Learning Course taken on Udemy
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, len(data['tweet_text'])):
    review = re.sub('[^a-zA-Z]', ' ', data['tweet_text'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jamaalsmith/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [110]:
data['tweet_text'] = corpus #so that dataframe has cleaned tweets

In [95]:
#Creation of Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 3000)
X = cv.fit_transform(corpus).toarray()
y = data.iloc[:, 1].values

In [96]:
##Tokenize data and then generate FreqDist
from nltk import word_tokenize
tokens = word_tokenize(','.join(map(str,corpus)))

In [97]:
#FreqDist
from nltk import FreqDist
freq = FreqDist(tokens)
freq.most_common(100)

[('sxsw', 9665),
 (',', 9285),
 ('mention', 7134),
 ('link', 4329),
 ('ipad', 3099),
 ('rt', 2970),
 ('googl', 2673),
 ('appl', 2346),
 ('quot', 1702),
 ('iphon', 1602),
 ('store', 1530),
 ('new', 1091),
 ('app', 1053),
 ('austin', 976),
 ('launch', 838),
 ('amp', 836),
 ('circl', 692),
 ('social', 674),
 ('pop', 612),
 ('android', 600),
 ('today', 584),
 ('get', 538),
 ('open', 538),
 ('network', 489),
 ('line', 461),
 ('go', 442),
 ('via', 436),
 ('parti', 407),
 ('call', 404),
 ('free', 390),
 ('mobil', 359),
 ('sxswi', 343),
 ('come', 335),
 ('like', 333),
 ('use', 323),
 ('one', 321),
 ('time', 320),
 ('win', 316),
 ('check', 309),
 ('major', 309),
 ('day', 294),
 ('map', 280),
 ('w', 272),
 ('temporari', 266),
 ('possibl', 259),
 ('see', 258),
 ('need', 249),
 ('design', 240),
 ('look', 236),
 ('peopl', 231),
 ('make', 228),
 ('downtown', 225),
 ('great', 222),
 ('mayer', 219),
 ('popup', 212),
 ('know', 203),
 ('set', 197),
 ('marissa', 193),
 ('talk', 193),
 ('think', 191),
 ('

## Exploratory Data Analysis

## Modeling