# Stakeholder

Apple as stakeholder and compare sentiment of its release vs. Google's at SXSW

# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import re

df = pd.read_csv('../data/tweets.csv', encoding = 'iso-8859-1')
sw = stopwords.words('english')

In [2]:
#Gets rid of company column
tweets = df.drop('emotion_in_tweet_is_directed_at', axis = 1)

In [3]:
tweets['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [4]:
#Drops rows with unknown sentiment
tweets = tweets[tweets['is_there_an_emotion_directed_at_a_brand_or_product'] != 'I can\'t tell']

In [5]:
#Renames column with sentiments as label and drops single nan row
tweets['label'] = tweets['is_there_an_emotion_directed_at_a_brand_or_product']
tweets = tweets.drop('is_there_an_emotion_directed_at_a_brand_or_product', axis = 1)
tweets = tweets.dropna()

In [6]:
tweets.label.value_counts(normalize = True)

No emotion toward brand or product    0.602954
Positive emotion                      0.333259
Negative emotion                      0.063787
Name: label, dtype: float64

In [7]:
#Reassigns label
tweets.label = tweets.label.map({'Negative emotion' : 0, 'Positive emotion': 1, 
                                 'No emotion toward brand or product': 2})

tweets.label.value_counts(normalize = True)

2    0.602954
1    0.333259
0    0.063787
Name: label, dtype: float64

In [8]:
#Functions to tokenize text
import string

def pos_replace(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
punct = list(string.punctuation)
#Keep periods for ellipses? They're the most common "word" though
#Keep numbers or get rid of them?
keep_punct = ['#', '?', '!', ',', '@']
punct = [p for p in punct if p not in keep_punct]
    
def tweet_tokenizer(doc, stop_words = sw):
    tweet_token = TweetTokenizer(strip_handles = True)
    doc = tweet_token.tokenize(doc)
    doc = [w for w in doc if w not in sw]
    doc = [w for w in doc if w not in punct]
    doc = pos_tag(doc)
    doc = [(w[0], pos_replace(w[1])) for w in doc]
    lemmatizer = WordNetLemmatizer() 
    doc = [lemmatizer.lemmatize(word[0], word[1]) for word in doc]
    return doc

# Data Exploration

In [9]:
from nltk.probability import FreqDist

all_words = []
for tweet in tweets['tweet_text']:
    all_words.extend(tweet_tokenizer(tweet))


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - 'C:\\Users\\Theodore Brandon/nltk_data'
    - 'F:\\Flatiron\\Anaconda2\\envs\\learn-env\\nltk_data'
    - 'F:\\Flatiron\\Anaconda2\\envs\\learn-env\\share\\nltk_data'
    - 'F:\\Flatiron\\Anaconda2\\envs\\learn-env\\lib\\nltk_data'
    - 'C:\\Users\\Theodore Brandon\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
fdist = FreqDist()
for word in all_words:
    fdist[word] += 1
fdist.plot(20, title = 'Frequency of Top 20 Words in Tweets')
plt.show()

# Get rid of bad data

In [None]:
tweets['word_count'] = tweets['tweet_text'].apply(word_count)
tweets['word_count'].describe()

In [None]:
tweets[tweets['word_count'] == 2]

In [None]:
tweets = tweets[tweets['word_count'] != 2]

# Feature Creation

In [None]:
#Look for excessive puntuation
def punc_count(tweet):
    punctuations = '!$%&()*+,-./:;<=>?[\]^_`{|}~'
    count = 0
    for p in punctuations:
        count += tweet.count(p)
    return count

#Flag if at-ed the company

#Ratio capital to length tweet
def capital_letter_ratio(tweet):
    capital_count = 0
    for c in tweet:
        if c.isupper():
            capital_count += 1
    return capital_count / len(tweet)

#Repeating words
def any_repeats(tweet):
    if len(set(tweet.split())) < len(tweet.split()):
        return 1
    else:
        return 0 
    
#Hashtag count
def count_hash(tweet):
    hashtag = re.findall(r'(#w[A-Za-z0-9]*)', tweet)
    return len(hashtag)

#Average word length
def avg_length(tweet):
    char = len(tweet)
    word = len(tweet.split())
    return char / word

#Number of words
def word_count(tweet):
    return len(tweet.split())

In [None]:
tweets['punc_count'] = tweets['tweet_text'].apply(punc_count)
tweets['avg_word_len'] = tweets['tweet_text'].apply(avg_length)

In [None]:
tweets.avg_word_len.value_counts()

# Different Vectorizers

In [None]:
#Train test split
y = tweets['label']
X = tweets.drop('label', axis = 1)
col_labels = list(X_train.columns)
col_labels.remove('tweet_text')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 213)

### CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(encoding = 'iso-8859-1', lowercase = False, tokenizer = tweet_tokenizer)
X_train_cv = cv.fit_transform(X_train['tweet_text'])

#SCALE NON-VECTORIZED COLUMNS

X_train_cv_df = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names(), index = X_train.index)
X_train_final = pd.concat([X_train_cv_df, X_train[col_labels]], axis=1)

### TF_IDF Vec

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(encoding = 'iso-8859-1', lowercase = False, tokenizer = tweet_tokenizer)
X_train_tfidf = tfidf.fit_transform(X_train['tweet_text'])

#SCALE NON-VECTORIZED COLUMNS

X_train_tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), columns = tfidf.get_feature_names(), index = X_train.index)
X_train_final = pd.concat([X_train_tfidf_df, X_train[col_labels]], axis=1)

# Models

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

first_pass = MultinomialNB()
first_pass.fit(X_train_final, y_train)
print("Training Score:", first_pass.score(X_train_final, y_train))
scores = np.mean(cross_val_score(first_pass, X_train_final, y_train, cv=5))
print("Validation Score:" + str(scores))

# IGNORE/TESTING AREA

## IF WANT TRY PIPELINE/ADD FEATURE THINGY

In [None]:
"""
from sklearn.pipeline import FeatureUnion, Pipeline
count_vec = FeatureUnion([
        ('cv', CountVectorizer(encoding = 'iso-8859-1', lowercase = False, tokenizer = tweet_tokenizer))
        #, add any feature creation things here
    ])

tfidf_vec = FeatureUnion([
        ('tfidf', TfidfVectorizer(encoding = 'iso-8859-1', lowercase = False, tokenizer = tweet_tokenizer))
        #, add any feature creation things here
    ])

first_pass = Pipeline(steps=[
    ('vec', count_vec),
    ('mnb', MultinomialNB())
])
"""

In [None]:
def punc_count(tweet):
    punctuations = '!$%&()*+,-./:;<=>?[\]^_`{|}~'
    count = 0
    for p in punctuations:
        count += tweet.count(p)
    return count

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class PuncCount(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, data):
        return data.apply(punc_count)

In [None]:
y = tweets['label']
X = tweets['tweet_text']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 213)

In [None]:
from sklearn.pipeline import FeatureUnion, Pipeline
count_vec = FeatureUnion([
        ('punc', PuncCount()),
        ('cv', CountVectorizer(encoding = 'iso-8859-1', lowercase = False, tokenizer = tweet_tokenizer))
    ])

first_pass = Pipeline(steps=[
    ('vec', count_vec),
    ('mnb', MultinomialNB())
])

first_pass.fit(X_train, y_train)
print("Training Score:", first_pass.score(X_train, y_train))
scores = np.mean(cross_val_score(first_pass, X_train, y_train, cv=5))
print("Validation Score:" + str(scores))

In [None]:
pc = PuncCount()
pc.fit(X_train)
test = pc.transform(X_train)

# Getting only Apple Tweets

In [None]:
df['emotion_in_tweet_is_directed_at'].value_counts()

In [None]:
no_labels = df[df['emotion_in_tweet_is_directed_at'].isna()]
no_labels = no_labels.dropna(subset = ['tweet_text'])

In [None]:
tweet_list = []
for tweet in no_labels['tweet_text']:
    tweet_check = tweet.lower()
    if ('iphone' in tweet_check) or ('ipad' in tweet_check) or ('apple' in tweet_check):
        tweet_list.append(tweet)

In [None]:
ipad = df[df['emotion_in_tweet_is_directed_at'] == 'iPad']
apple = df[df['emotion_in_tweet_is_directed_at'] == 'Apple']
mix = df[df['emotion_in_tweet_is_directed_at'] == 'iPad or iPhone App']
iphone = df[df['emotion_in_tweet_is_directed_at'] == 'iPhone']
apps = df[df['emotion_in_tweet_is_directed_at'] == 'Other Apple product or service']

unlabeled_apple = df[df['tweet_text'].isin(tweet_list)]
unlabeled_apple = unlabeled_apple.drop_duplicates(subset = 'tweet_text')

final_df = pd.concat([ipad, apple, mix, iphone, apps, unlabeled_apple], axis = 0)

final_df.head()

In [None]:
final_df['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts(normalize = True)

In [None]:
len(final_df)

In [None]:
len(tweets) - len(final_df)