## Final Project Submission

Please fill out:
* Student name: 
* Student pace: self paced / part time / full time
* Scheduled project review date/time: 
* Instructor name: 
* Blog post URL:


In [98]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from nltk.stem import PorterStemmer

In [54]:
data = pd.read_csv('data/tweets.csv')

In [55]:
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [56]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [57]:
data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [74]:
data['emotion_in_tweet_is_directed_at'].value_counts(normalize=True)

iPad                               0.287451
Apple                              0.200851
iPad or iPhone App                 0.142814
Google                             0.130659
iPhone                             0.090246
Other Google product or service    0.089031
Android App                        0.024613
Android                            0.023701
Other Apple product or service     0.010635
Name: emotion_in_tweet_is_directed_at, dtype: float64

### Let's make an emotion column so we don't have to type out where the tweet is directed

In [59]:
data['emotion'] = data['emotion_in_tweet_is_directed_at']

In [60]:
data

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,iPhone
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,iPad or iPhone App
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,iPad
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,iPad or iPhone App
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,Google
...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,iPad
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,


### Make a list of items in the directed at so we can focus on the company

In [76]:
apple = ['iPad', 'Apple', 'iPad or iPhone App', 'iPhone', 'Other Apple product or service']
google = ['Google', 'Other Google product or service', 'Android App', 'Android']

In [77]:
data['company'] = data['emotion']

In [78]:
data['company'] = data['company'].replace(apple, 'Apple')

In [79]:
data['company'] = data['company'].replace(google, 'Google')

In [80]:
data['company'].value_counts(normalize=True)

Apple     0.731996
Google    0.268004
Name: company, dtype: float64

In [81]:
data

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,emotion,company
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,iPhone,Apple
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,iPad or iPhone App,Apple
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,iPad,Apple
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,iPad or iPhone App,Apple
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,Google,Google
...,...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,iPad,Apple
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,,
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,,
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,,


In [86]:
data[data['company']=='Apple']['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

Positive emotion                      1949
Negative emotion                       388
No emotion toward brand or product      65
I can't tell                             7
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [87]:
data[data['company']=='Google']['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

Positive emotion                      723
Negative emotion                      131
No emotion toward brand or product     26
I can't tell                            2
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [104]:
# Tweet Tokenizer, stopwords, stemmer
tokenizer = TweetTokenizer()
stopwords_list = stopwords.words('english')
stemmer = PorterStemmer()

In [108]:
# Define preprocess_text
def preprocess_text(text, tokenizer, stopwords_list, stemmer):
    # Standardize case (lowercase the text)
    lowered = text.lower()
    # Tokenize text using `tokenizer`
    tokens = tokenizer.tokenize(lowered)
    # Remove stopwords using `stopwords_list`
    stopped_tokens = [word for word in tokens if word not in stopwords_list]
    # Stem the tokenized text using `stemmer`
    stems = [stemmer.stem(token) for token in stopped_tokens]
    # Return the preprocessed text
    return stems
preprocess_text("This is an example sentence for preprocessing.", tokenizer, stopwords_list, stemmer)

['exampl', 'sentenc', 'preprocess', '.']

In [109]:
text_data = data.desc.apply(lambda x: preprocess_text(x, tokenizer, stopwords_list, stemmer))

AttributeError: 'DataFrame' object has no attribute 'desc'