## Final Project Submission

Please fill out:
* Student name: 
* Student pace: self paced / part time / full time
* Scheduled project review date/time: 
* Instructor name: 
* Blog post URL:


In [1]:
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Standard Packages
import pandas as pd
import numpy as np
import string
import ssl
import re

# Statsmodels
import statsmodels.api as sm

# NLTK
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, TweetTokenizer         # Tweet Tokenizer!
from nltk import FreqDist
from nltk.corpus import stopwords
import nltk
# nltk.download('wordnet')

# Scikit-Learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [2]:
df = pd.read_csv("data/tweets.csv")
df.columns = ['text', 'device', 'emotion']

In [3]:
df.head()

Unnamed: 0,text,device,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [5]:
### Deliniating between Google and Apple

google_tweets = ['Google', 'Other Google product or service', 'Andriod App', 'Andriod']
apple_tweets = ['Apple', 'Other Apple product or service', 'Apple App', 'iPhone', 'iPad', 'iPad or iPhone App']

### Creating a new column for google vs. apple vs. unknown

df['device_type'] = np.where(df['device'].isin(google_tweets), 'Google', np.where(df['device'].isin(apple_tweets), 'Apple', 'Unknown'))

### Dropping 'I can't tell' and 'Other' rows

# df = df[df['emotion'] != "I can't tell"]

### Dropping blank 'text' rows

df = df.dropna(subset=['text'])


In [13]:
pd.set_option('display.max_colwidth', None)

In [33]:
df[df.text.str.contains('#')].shape

(9086, 4)

In [34]:
df[~df.text.str.lower().str.contains('sxsw')]

Unnamed: 0,text,device,emotion,device_type
770,"Google to Launch Major New Social Network Called Circles, Possibly Today {link}",Google,Positive emotion,Google
794,"Google to Launch Major New Social Network Called Circles, Possibly Today @mention {link} via @mention",,No emotion toward brand or product,Unknown
1142,‰ÛÏ@mention The 10 most dangerous IPhone apps. {link} @mention,,No emotion toward brand or product,Unknown
2431,Extra iPad 2 for sale in Austin convention center who wants one? {link},,No emotion toward brand or product,Unknown
2713,Get in the season with Clover Touch:\r{link} #Games #STPATRICK,,No emotion toward brand or product,Unknown
3482,GENIUS behind Google Circles.\r¡á¾_Î¾Ð¡__Š___ÔÈÏ_ãŒöÝÇÜŠ¼¼\rReal Life Social Graph Network v2 \rView documents by Paul Adams {link},Other Google product or service,Positive emotion,Google
4920,"Download of the Day: Lonely Planet Austin, Free For a Limited Time {link} #iPhone",,No emotion toward brand or product,Unknown
5025,RT @mention,,No emotion toward brand or product,Unknown


In [35]:
df.emotion.value_counts()

No emotion toward brand or product    5388
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: emotion, dtype: int64

In [20]:
df.shape

(9092, 4)

In [25]:
r = r'[^\w\s](?:\s*#[a-zA-Z]+)+(?:\s*[^\w\s])?'
r2 = r'#.*?(?=\s|$)'
df.text.str.findall(r2)

0          [#RISE_Austin,, #SXSW.]
1                          [#SXSW]
2                  [#iPad, #SXSW.]
3                          [#sxsw]
4                         [#SXSW:]
                   ...            
9088                       [#SXSW]
9089    [#sxsw, #google, #circles]
9090          [#sxsw, #health2dev]
9091                      [#SXSW.]
9092                       [#SXSW]
Name: text, Length: 9092, dtype: object

In [15]:
df[df.device_type == 'Unknown']

Unnamed: 0,text,device,emotion,device_type
5,@teachntech00 New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http://ht.ly/49n4M #iear #edchat #asd,,No emotion toward brand or product,Unknown
7,"#SXSW is just starting, #CTIA is around the corner and #googleio is only a hop skip and a jump from there, good time to be an #android fan",Android,Positive emotion,Unknown
10,Excited to meet the @samsungmobileus at #sxsw so I can show them my Sprint Galaxy S still running Android 2.1. #fail,Android,Positive emotion,Unknown
11,Find &amp; Start Impromptu Parties at #SXSW With @HurricaneParty http://bit.ly/gVLrIn I can't wait til the Android app comes out.,Android App,Positive emotion,Unknown
12,"Foursquare ups the game, just in time for #SXSW http://j.mp/grN7pK) - Still prefer @Gowalla by far, best looking Android app to date.",Android App,Positive emotion,Unknown
...,...,...,...,...
9087,"@mention Yup, but I don't have a third app yet. I'm on Android, any suggestions? #SXSW CC: @mention",,No emotion toward brand or product,Unknown
9089,"Wave, buzz... RT @mention We interrupt your regularly scheduled #sxsw geek programming with big news {link} #google #circles",,No emotion toward brand or product,Unknown
9090,"Google's Zeiger, a physician never reported potential AE. Yet FDA relies on physicians. &quot;We're operating w/out data.&quot; #sxsw #health2dev",,No emotion toward brand or product,Unknown
9091,Some Verizon iPhone customers complained their time fell back an hour this weekend. Of course they were the New Yorkers who attended #SXSW.,,No emotion toward brand or product,Unknown


### Making X and y

In [6]:
X = df.drop('emotion', axis=1)
y = df['emotion']

### Train, Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

In [10]:
X.device_type.value_counts()

Unknown    5813
Apple      2402
Google      721
Name: device_type, dtype: int64

In [None]:
### Creating a function that removes words that begin with @

def remove_at(text):
    text = text.split()
    text = [word for word in text if not word.startswith('@')]
    text = ' '.join(text)
    return text

### Creating a function that makes all text lowercase

def lower_case(text):
    text = text.lower()
    return text

### Creating a function that removes all punctuation

def remove_punctuation(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

### Creating a function that removes all stopwords

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    text = text.split()
    text = [word for word in text if not word in stop_words]
    text = ' '.join(text)
    return text

### Creating a function that removes words that contain characters like ‰ÛÏ or ‰ÛÒ or ‰ÛÓ or ‰ÛÒ

def remove_characters(text):
    text = text.split()
    text = [word for word in text if not word.startswith('‰ÛÏ') and not word.startswith('‰ÛÒ') and not word.startswith('‰ÛÓ') and not word.startswith('‰ÛÒ')]
    text = ' '.join(text)
    return text

### Creating a function that lemmatizes words

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

### Creating a function that removes all numbers

def remove_numbers(text):
    text = ''.join([i for i in text if not i.isdigit()])
    return text

### Creating a function takes the tokenized text and returns a string of words

def tokenize_to_string(text):
    tknzr = TweetTokenizer()
    text = tknzr.tokenize(text)
    text = ' '.join(text)
    return text

### Creating a function that combines all of the above functions

def clean_text(text):
    text = remove_at(text)
    text = lower_case(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = remove_characters(text)
    text = lemmatize(text)
    text = remove_numbers(text)
    text = tokenize_to_string(text)
    return text

In [None]:
### Applying the 'lemmatize_text' function to the 'text' column

X_train['text'] = X_train['text'].apply(lemmatize_text)

In [None]:
apple_terms = ['apple', 'ipad', 'iphone']

google_terms = ['']

In [40]:
df[200:250]

Unnamed: 0,text,device,emotion,device_type
201,"U = smart 4 packing an extra :) RT @mention On my way to sunny Austin for #sxsw! Extra iPhone in stow, just in case. @mention @mention",,No emotion toward brand or product,Unknown
202,"Wonder if 4sqwill attempt to block GoWalla cross-checkins, the way Facebook blocked Google's contacts connections. #sxsw #lbsWars",,No emotion toward brand or product,Unknown
203,#SXSW day 5 at the #apple store and there's still a line...and growing {link},Apple,Positive emotion,Apple
204,Check out 5 Steps to Bulletproof UX Strategy at SXSW\r{link} #SXSW #rhjr_ux5 {link},,No emotion toward brand or product,Unknown
205,"Wandering 6th street with the girls. Stopped for ceviche, mojitos, apple juice &amp; ice cream. #sxsw",,No emotion toward brand or product,Unknown
206,"If you're a band at #SXSW and want to share a track with your Audience from stage, use FrostWire for Android if there's Wi-Fi available",Android App,Positive emotion,Unknown
207,We've got a busy week ahead: Our new book on #Google Apps will be available on Amazon &amp; we're attending #SXSW,,No emotion toward brand or product,Unknown
208,"I believe a few went to Aussies at #sxsw RT @mention iPad 2 Sold Out, 70% Went to New Buyers [REPORT] - {link}",,No emotion toward brand or product,Unknown
209,‰ÛÏ@mention A Google Spreadsheet of over 270 parties at #SXSW this year O_O {link} @mention Plan on going to #SXSW Party on,,No emotion toward brand or product,Unknown
210,"There was a guy in line for Apple's pop-up store on Sixth Street in Austin. &quot;I'm that guy,&quot; he told me. #sxsw",,No emotion toward brand or product,Unknown
