# Phase 4 project

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
import nltk
from nltk import pos_tag
from sklearn.naive_bayes import MultinomialNB

%matplotlib inline

In [2]:
data = pd.read_csv('data/judge-1377884607_tweet_product_company.csv', encoding='latin1')

In [3]:
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
# Change the column names so it can be more readable

data.columns = ['tweet', 'product', 'sentiment']

In [5]:
data.head()

Unnamed: 0,tweet,product,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [6]:
data.sentiment.value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: sentiment, dtype: int64

In [7]:
df = data[(data['sentiment'] == "Negative emotion") | (data['sentiment'] == "Positive emotion")]

In [8]:
df = data.dropna(axis=0)

In [9]:
df

Unnamed: 0,tweet,product,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9077,@mention your PR guy just convinced me to swit...,iPhone,Positive emotion
9079,&quot;papyrus...sort of like the ipad&quot; - ...,iPad,Positive emotion
9080,Diller says Google TV &quot;might be run over ...,Other Google product or service,Negative emotion
9085,I've always used Camera+ for my iPhone b/c it ...,iPad or iPhone App,Positive emotion


In [10]:
df['product'].value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: product, dtype: int64

In [11]:
df.isnull().sum()

tweet        0
product      0
sentiment    0
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3291 entries, 0 to 9088
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet      3291 non-null   object
 1   product    3291 non-null   object
 2   sentiment  3291 non-null   object
dtypes: object(3)
memory usage: 102.8+ KB


Pre-processing

In [13]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [14]:
nltk.download('wordnet')
nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\natek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\natek\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\natek\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [15]:
def get_wordnet_pos(treebank_tag):
    '''
    Translate nltk POS to wordnet tags
    '''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [16]:
df

Unnamed: 0,tweet,product,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9077,@mention your PR guy just convinced me to swit...,iPhone,Positive emotion
9079,&quot;papyrus...sort of like the ipad&quot; - ...,iPad,Positive emotion
9080,Diller says Google TV &quot;might be run over ...,Other Google product or service,Negative emotion
9085,I've always used Camera+ for my iPhone b/c it ...,iPad or iPhone App,Positive emotion


In [17]:
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
tokenizer = RegexpTokenizer(pattern)
lemmatizer = nltk.stem.WordNetLemmatizer()

df['tweet_edited'] = df['tweet']
df['tweet_edited'] = df['tweet_edited'].apply(str.lower)
df['tweet_edited'] = df['tweet_edited'].apply(str.split)
df['tweet_edited'] = df['tweet_edited'].apply(lambda x: " ".join([n for n in x if '@' not in n]))
df['tweet_edited'] = df['tweet_edited'].apply(tokenizer.tokenize)
df['tweet_edited'] = df['tweet_edited'].apply(pos_tag)
df['tweet_edited'] = df['tweet_edited'].apply(lambda x: [(n[0], get_wordnet_pos(n[1])) for n in x])
df['tweet_edited'] = df['tweet_edited'].apply(lambda x: " ".join([lemmatizer.lemmatize(n[0], (n[1])) for n in x]))

df['product'] = df['product'].apply(str.strip)
df['Company'] = df['product']
df['Company'] = df['Company'].replace({
    'iPad':'Apple',
    'iPad or iPhone App':'Apple',
    'iPhone':'Apple',
    'Other Apple product or service':'Apple',
    'Other Google product or service':'Google',
    'Android App':'Google',
    'Android':'Google'   
})



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweet_edited'] = df['tweet']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweet_edited'] = df['tweet_edited'].apply(str.lower)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweet_edited'] = df['tweet_edited'].apply(str.split)
A value is trying to be set on a copy of a slice from a Da

Split the data into training and test data

In [18]:
# Train test split
X = df['tweet_edited']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1)
X_train

4312    the new for the android be so great can't wait...
8592    sound intrigue rt lot of chatter around google...
3964    just win an html t shirt from google for say t...
2027    be there any way of delete an app that won't s...
4041    new social network may debut at sxsw google ci...
                              ...                        
7604    apple store sxsw line be move at the front the...
2392    on it second day in business the apple pop up ...
2851    sxsw i need the best android dev here html js ...
545     google be give free google lab laptop to open ...
2776        genius idea rt apple popup store at sxsw link
Name: tweet_edited, Length: 2303, dtype: object

Try a Random Forest Classifier.  Try both the Count Vectorizer and the TFidfVectorizer

In [29]:
sw = stopwords.words('english')

vectorizer = CountVectorizer(stop_words=sw, ngram_range=[1, 1])
vectorizer.fit(X_train)
X_train_vect = vectorizer.transform(X_train)
X_test_vect = vectorizer.transform(X_test)
model = RandomForestClassifier(random_state=42)
model.fit(X_train_vect, y_train)
model.score(X_test_vect, y_test)

0.8380566801619433

In [24]:
new_df = pd.DataFrame(X_train_vect.toarray(), columns=vectorizer.get_feature_names())
new_df['apple'].value_counts()

0    1652
1     579
2      68
3       4
Name: apple, dtype: int64

In [22]:
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
sw = stopwords.words('english')

vectorizer = TfidfVectorizer(stop_words=sw, ngram_range=[1, 1])
vectorizer.fit(X_train)
X_train_vect = vectorizer.transform(X_train)
X_test_vect = vectorizer.transform(X_test)
model = RandomForestClassifier(random_state=42)
model.fit(X_train_vect, y_train)
model.score(X_test_vect, y_test)

0.8319838056680162

Try a Random Forest Classifier.  Try both the Count Vectorizer and the TFidfVectorizer

In [None]:

sw = stopwords.words('english')

vectorizer = CountVectorizer(stop_words=sw)
vectorizer.fit(X_train)
X_train_vect = vectorizer.transform(X_train)
X_test_vect = vectorizer.transform(X_test)
model = MultinomialNB()
model.fit(X_train_vect, y_train)
model.score(X_test_vect, y_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
sw = stopwords.words('english')

vectorizer = TfidfVectorizer(token_pattern=pattern, stop_words=sw)
vectorizer.fit(X_train)
X_train_vect = vectorizer.transform(X_train)
X_test_vect = vectorizer.transform(X_test)
model = MultinomialNB()
model.fit(X_train_vect, y_train)
model.score(X_test_vect, y_test)