# Phase 4 project

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords

%matplotlib inline

In [64]:
data = pd.read_csv('data/judge-1377884607_tweet_product_company.csv', encoding='latin1')

In [65]:
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [66]:
# Change the column names so it can be more readable

data.columns = ['tweet', 'product', 'sentiment']

In [67]:
data.head()

Unnamed: 0,tweet,product,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [68]:
data.sentiment.value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: sentiment, dtype: int64

In [69]:
df = data[(data['sentiment'] == "Negative emotion") | (data['sentiment'] == "Positive emotion")]

In [70]:
df = data.dropna(axis=0)

In [71]:
df

Unnamed: 0,tweet,product,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9077,@mention your PR guy just convinced me to swit...,iPhone,Positive emotion
9079,&quot;papyrus...sort of like the ipad&quot; - ...,iPad,Positive emotion
9080,Diller says Google TV &quot;might be run over ...,Other Google product or service,Negative emotion
9085,I've always used Camera+ for my iPhone b/c it ...,iPad or iPhone App,Positive emotion


In [72]:
df['product'].value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: product, dtype: int64

In [73]:
df.isnull().sum()

tweet        0
product      0
sentiment    0
dtype: int64

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3291 entries, 0 to 9088
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet      3291 non-null   object
 1   product    3291 non-null   object
 2   sentiment  3291 non-null   object
dtypes: object(3)
memory usage: 102.8+ KB


Split the data to prevent data leakage

In [84]:
# Train test split
X = df['tweet']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.7, random_state=1)


Try a Random Forest Classifier.  Try both the Count Vectorizer and the TFidfVectorizer

In [100]:
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
sw = stopwords.words('english')

vectorizer = CountVectorizer(token_pattern=pattern, stop_words=sw)
vectorizer.fit(X_train)
X_train_vect = vectorizer.transform(X_train)
X_test_vect = vectorizer.transform(X_test)
model = RandomForestClassifier(random_state=42)
model.fit(X_train_vect, y_train)
model.score(X_test_vect, y_test)

0.8272569444444444

In [101]:
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
sw = stopwords.words('english')

vectorizer = TfidfVectorizer(token_pattern=pattern, stop_words=sw)
vectorizer.fit(X_train)
X_train_vect = vectorizer.transform(X_train)
X_test_vect = vectorizer.transform(X_test)
model = RandomForestClassifier(random_state=42)
model.fit(X_train_vect, y_train)
model.score(X_test_vect, y_test)

0.8237847222222222

Try a Random Forest Classifier.  Try both the Count Vectorizer and the TFidfVectorizer

In [102]:
from sklearn.naive_bayes import MultinomialNB
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
sw = stopwords.words('english')

vectorizer = CountVectorizer(token_pattern=pattern, stop_words=sw)
vectorizer.fit(X_train)
X_train_vect = vectorizer.transform(X_train)
X_test_vect = vectorizer.transform(X_test)
model = MultinomialNB()
model.fit(X_train_vect, y_train)
model.score(X_test_vect, y_test)

0.8224826388888888

In [103]:
from sklearn.naive_bayes import MultinomialNB
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
sw = stopwords.words('english')

vectorizer = TfidfVectorizer(token_pattern=pattern, stop_words=sw)
vectorizer.fit(X_train)
X_train_vect = vectorizer.transform(X_train)
X_test_vect = vectorizer.transform(X_test)
model = MultinomialNB()
model.fit(X_train_vect, y_train)
model.score(X_test_vect, y_test)

0.8081597222222222