# Business Problem

**Build a model that can rate the sentiment of a Tweet based on its content.**

# Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the labelled tweets; the file is decoded as latin1 instead of pandas' default UTF-8.
df = pd.read_csv(
    'data/judge-1377884607_tweet_product_company.csv',
    encoding='latin1',
)

In [3]:
# Preview the first 50 rows of the raw frame.
df.head(n=50)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion


**The product column is not needed for now: the model targets the sentiment of the tweet itself, not the specific product the emotion is directed at.**

In [4]:
# Keep only the tweet text and the sentiment label; `df` itself is left unchanged
# because drop() returns a new frame.
df_1 = df.drop(columns=['emotion_in_tweet_is_directed_at'])

In [5]:
# Shorten the verbose label column name to `emotion`.
# The renamed frame is re-assigned instead of using inplace=True, so the cell
# produces its result from its input rather than mutating an existing object.
df_1 = df_1.rename(
    columns={'is_there_an_emotion_directed_at_a_brand_or_product': 'emotion'}
)
df_1.head()

Unnamed: 0,tweet_text,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


# Preprocess Data

**A quick look at the data: missing values and the distribution of the emotion labels**

In [6]:
# Column dtypes and non-null counts. The output below shows tweet_text has 9092
# non-null values out of 9093 rows, i.e. one tweet is missing.
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  9092 non-null   object
 1   emotion     9093 non-null   object
dtypes: object(2)
memory usage: 142.2+ KB


In [7]:
# Share of each label (normalize=True gives proportions rather than raw counts).
# The output below shows imbalanced classes: ~59% no emotion, ~33% positive,
# ~6% negative and ~2% "I can't tell".
df_1['emotion'].value_counts(normalize=True)

No emotion toward brand or product    0.592654
Positive emotion                      0.327505
Negative emotion                      0.062686
I can't tell                          0.017156
Name: emotion, dtype: float64

In [8]:
# Text-processing imports used by the preprocessing cells that follow.
import numpy as np  # already imported in the first cell; repeating it is harmless
np.random.seed(0)  # seed NumPy's global random number generator
import nltk
from nltk import FreqDist, word_tokenize
from nltk.corpus import stopwords
import string
import re

# NOTE(review): presumably these downloads are needed once per environment so that
# word_tokenize and stopwords.words can find their data -- uncomment if they are missing.
# nltk.download('punkt')
# nltk.download('stopwords')

In [9]:
# Make every entry a string so the regex/tokenizer steps can process it.
# The one missing tweet (NaN) is replaced with '' first: a bare astype(str) would turn
# it into the literal text 'nan', which would later be tokenised as if it were a word.
df_1['tweet_text'] = df_1['tweet_text'].fillna('').astype(str)

In [10]:
# Preview the text column after the string conversion (pandas shortens the display).
# NOTE(review): the last row in the output shows garbled characters -- possibly a side
# effect of reading the file as latin1; confirm against the raw CSV.
df_1['tweet_text']

0       .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1       @jessedee Know about @fludapp ? Awesome iPad/i...
2       @swonderlin Can not wait for #iPad 2 also. The...
3       @sxsw I hope this year's festival isn't as cra...
4       @sxtxstate great stuff on Fri #SXSW: Marissa M...
                              ...                        
9088                        Ipad everywhere. #SXSW {link}
9089    Wave, buzz... RT @mention We interrupt your re...
9090    Google's Zeiger, a physician never reported po...
9091    Some Verizon iPhone customers complained their...
9092    Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...
Name: tweet_text, Length: 9093, dtype: object

In [None]:
# data = df_1['tweet_text'].map(word_tokenize)
# Word pattern: a run of ASCII letters, optionally followed by an apostrophe and
# lowercase letters (e.g. "isn't"). Digits, '@', '#' and other symbols never match.
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
# NOTE: `data` is a second name for the whole df_1 DataFrame (no copy is made),
# not just the tweet_text column.
data = df_1
# data[:100]

In [16]:
# Show one tweet in full (row label 9090); its text contains HTML entities such as &quot;.
df_1.loc[9090, 'tweet_text']

"Google's Zeiger, a physician never reported potential AE. Yet FDA relies on physicians. &quot;We're operating w/out data.&quot; #sxsw #health2dev"

In [None]:
# Tokens to discard: NLTK's English stopwords followed by every ASCII punctuation character.
stopwords_list = [*stopwords.words('english'), *string.punctuation]

In [None]:
# pattern = '\d'
# p = re.compile(pattern)
# digits = p.findall(str(df['tweet_text']))
# digits

In [None]:
# Build one flat list of lowercase word tokens from all tweets, with stopwords removed.
# Iterating a DataFrame directly yields its column labels, so the tweets are read from
# the 'tweet_text' column and split into words with `pattern`.
# Tokens are lowercased *before* the stopword test so capitalised stopwords
# (e.g. "The") are removed as well.
stopwords_set = set(stopwords_list)  # set lookup keeps the per-token membership test cheap
stopped_tokens = [
    token
    for tweet in data['tweet_text']
    for token in map(str.lower, re.findall(pattern, str(tweet)))
    if token not in stopwords_set
]

In [None]:
# Preview only the first tokens; displaying the whole corpus-wide list would flood the output.
stopped_tokens[:50]

In [None]:
# Pre-compiled matcher for angle-bracket markup: '<', one or more characters other than
# '>', then '>'. It does not match HTML entities such as &quot;.
tags_remove = re.compile(pattern=r'<[^>]+>', flags=0)

In [None]:
# pattern = '\d'
# p = re.compile(pattern)
# digits = p.findall(str(df['tweet_text']))
# digits