# Business Problem

**Build a model that can rate the sentiment of a Tweet based on its content.**

# Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('data/judge-1377884607_tweet_product_company.csv', encoding='latin1')

In [3]:
df.head(50)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion


**A plot for the most common hashtags**

**Don't need the product column at this time since we're focused on the sentiment of the tweet and not specifically the product**

In [4]:
df_1 = df.drop('emotion_in_tweet_is_directed_at', axis=1)

In [5]:
# df_1 = df.copy()

In [6]:
df_1.rename(columns={'is_there_an_emotion_directed_at_a_brand_or_product':'target'}, 
          inplace=True)
df_1.head()

Unnamed: 0,tweet_text,target
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [7]:
df_1['target'].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: target, dtype: int64

# Isolate certain targets

In [17]:
sub = "No emotion toward brand or product"
df1 = df_1[df_1['target'].str.contains(sub)]
df1

Unnamed: 0,tweet_text,target
5,@teachntech00 New iPad Apps For #SpeechTherapy...,No emotion toward brand or product
6,,No emotion toward brand or product
16,Holler Gram for iPad on the iTunes App Store -...,No emotion toward brand or product
32,"Attn: All #SXSW frineds, @mention Register fo...",No emotion toward brand or product
33,Anyone at #sxsw want to sell their old iPad?,No emotion toward brand or product
...,...,...
9087,"@mention Yup, but I don't have a third app yet...",No emotion toward brand or product
9089,"Wave, buzz... RT @mention We interrupt your re...",No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,No emotion toward brand or product


In [None]:
df1.info()

**Extracting product/brand from column**

In [None]:
pattern = r"(Apple)|(Google)|(iPad)|(App)"

In [None]:
for i in df1['tweet_text']:
    print(i)

**For now I won't dig into the 'I can't tell' rows at this time. Let's make them Neutral for now**

# Change in df

In [8]:
df_2 = df_1.copy()

In [9]:
df_2 = df_2.replace({'No emotion toward brand or product':'neutral'})

In [10]:
df_2 = df_2.replace({"I can't tell":'neutral'})

In [11]:
df_2 = df_2.replace({'Positive emotion':'positive'})

In [12]:
df_2 = df_2.replace({'Negative emotion':'negative'})

In [13]:
df_2.target.value_counts()

neutral     5545
positive    2978
negative     570
Name: target, dtype: int64

# Preprocess Data

Things to clean:
* Remove urls
* Remove html tags
* Remove numbers
* Remove punctuation/capitalization
* Remove mentions
* Remove hashtags
* Remove stopwords
* Stem words


In [14]:
import numpy as np
np.random.seed(0)
import nltk
from nltk import FreqDist, word_tokenize
from nltk.corpus import stopwords
import string
import re

# nltk.download('punkt')
# nltk.download('stopwords')

**Remove urls**

In [20]:
text = df_2['tweet_text'][14]
text

'Great #sxsw ipad app from @madebymany: http://tinyurl.com/4nqv92l'

In [21]:
# text = re.sub(r"http\S+", "", text)
text = re.sub(r'https?:\/\/\S*', '', text)
text

'Great #sxsw ipad app from @madebymany: '

* r - python raw string notation
* ? - https will match with either http or https
* \/\/ - with match with any http:// and https://
* \S - match where the string does not contain a white space
* '*' - zero or more occurrences

**Remove html tags**

In [35]:
text_2 = df_1['tweet_text'][32:35]
text_2

32    Attn: All  #SXSW frineds, @mention Register fo...
33        Anyone at  #sxsw want to sell their old iPad?
34    Anyone at  #SXSW who bought the new iPad want ...
Name: tweet_text, dtype: object

In [47]:
text_2 = df_1['tweet_text'].str.count(r"({link})").sum()
text_2

4292.0

In [None]:
text_2a = df_1['tweet_text'].str.count(r"({link})").sum()

In [50]:
text_2a = df_1['tweet_text'].str.replace(r"({link})", '')

In [53]:
text_2a[32]

'Attn: All  #SXSW frineds, @mention Register for #GDGTLive  and see Cobra iRadar for Android. '

In [54]:
text_3 = df_1['tweet_text'][43]
text_3

'Mashable! - The iPad 2 Takes Over SXSW [VIDEO] #ipad #sxsw #gadgets {link}'

**Remove hashtags and mentions**

In [60]:
text_4 = df_1['tweet_text'][0]
text_4

'.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'

In [62]:
# d matches all digits and . matches any one character behind the digits
text_4 = re.sub(r"\s?([@#][\w_-]+)", "", text_4)
text_4
# @[A-Za-z0-9_]+"

'. I have a 3G iPhone. After 3 hrs tweeting at, it was dead!  I need to upgrade. Plugin stations at.'

In [None]:
# "\s?([#]+)" just removes hashtag symbol
# ask abhi if I should get rid of the entire hashtags

**Remove punctuation/capitalization**

In [78]:
# d matches all digits and . matches any one character behind the digits
text_5 = re.sub(r"[,\'?\.!$%_]", "", text_4)
text_5.lower()

' i have a 3g iphone after 3 hrs tweeting at it was dead  i need to upgrade plugin stations at'

**Remove numbers**

In [77]:
text_6 = re.sub(r"\d.", "", text_5)
text_6

' I have a  iPhone After hrs tweeting at it was dead  I need to upgrade Plugin stations at'

**Expand contractions**

In [None]:
link_p = "({link})"
matches = re.extract(link_p, text_3)
matches

In [None]:
len(matches)

In [None]:
print(str(df_1['tweet_text']))

In [88]:
# possible function to fully clean tweets
def clean_text(row, df):

    if df['lowercase']:
        row = row.lower()

#     if df['remove_url']:
#         txt = re.sub(r'https?:\/\/\S*', '')
#         row = txt.get_text()

    if df['remove_html']:
        row = re.sub(r"({link})", '', str(row))

    if df['remove_mentions']:
        row = re.sub('@[A-Za-z0-9]+', '', str(row))
    
    if df['remove_numbers']:
        row = re.sub(r"\d.", "", str(row))

    return row

# clean_config = {
# #     'remove_url': True,
#     'remove_mentions': True,
#     'remove_html': True,
#     'lowercase': True,
#     'remove_numbers': True
#     }


In [93]:
df_2['tweet_text'] = df_2['tweet_text'].apply(clean_text, args=(clean_config,))

AttributeError: 'float' object has no attribute 'lower'

**Remove stopwords**

**Lemmization**

In [None]:
# patterns for cleaning
pattern = "([a-zA-Z]+(?:'[a-z]+)?)" #for apostrophe
pattern1 = ""

In [None]:
data = df_1['tweet_text'].map(word_tokenize)
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
data = nltk.regexp_tokenize(df_1['tweet_text'], pattern)
data[:100]

In [None]:
df_1['tweet_text'][9090]

In [None]:
stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)

In [None]:
# pattern = '\d'
# p = re.compile(pattern)
# digits = p.findall(str(df['tweet_text']))
# digits

In [None]:
stopped_tokens = [str(w).lower() for w in data if w not in stopwords_list]

In [None]:
stopped_tokens

In [None]:
tags_remove = re.compile(r'<[^>]+>')

In [None]:
# pattern = '\d'
# p = re.compile(pattern)
# digits = p.findall(str(df['tweet_text']))
# digits