In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('data/judge-1377884607_tweet_product_company.csv', encoding='latin1')

In [3]:
df1 = df.drop('emotion_in_tweet_is_directed_at', axis=1)

In [4]:
df1.rename(columns={'is_there_an_emotion_directed_at_a_brand_or_product':'target'}, 
          inplace=True)
df1.head()

Unnamed: 0,tweet_text,target
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [5]:
import numpy as np
np.random.seed(0)
import nltk
from nltk import FreqDist, word_tokenize
from nltk.corpus import stopwords
from nltk.collocations import *
import string
import re
from nltk.stem import WordNetLemmatizer 
# nltk.download('punkt')
# nltk.download('stopwords')

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.util import ngrams
import seaborn as sns

In [7]:
df1['tweet_text'] = df1.tweet_text.astype(str)

In [8]:
df1['clean_tweet'] = df1['tweet_text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df1['clean_tweet'].head()

0    .@wesley83 i have a 3g iphone. after 3 hrs twe...
1    @jessedee know about @fludapp ? awesome ipad/i...
2    @swonderlin can not wait for #ipad 2 also. the...
3    @sxsw i hope this year's festival isn't as cra...
4    @sxtxstate great stuff on fri #sxsw: marissa m...
Name: clean_tweet, dtype: object

In [9]:
def remove_urls(corpus):
#     pattern = re.compile(r'https?:\/\/\S*')
    pattern = re.compile(r"(https?:\/\/)?(www\.)[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|(https?:\/\/)?(www\.)?(?!ww)[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)") 
    # captures six different groups of websites
    return pattern.sub(r'', corpus)

df1['clean_tweet'] = df1['clean_tweet'].apply(lambda x:remove_urls(x))

In [10]:
# df1['tweet_text'] = df1['tweet_text'].str.replace(r"({link})", '')

In [11]:
def remove_link(corpus):
    pattern = re.compile(r"({link})")
    return pattern.sub(r'', corpus)

df1['clean_tweet'] = df1['clean_tweet'].apply(lambda x:remove_urls(x))

In [12]:
def remove_hashtags(corpus):
    pattern = re.compile(r"\s?([@#][\w_-]+)")
    return pattern.sub(r'', corpus)

df1['clean_tweet'] = df1['clean_tweet'].apply(lambda x:remove_hashtags(x))

In [13]:
def remove_punct(corpus):
    pattern = re.compile(r"[\|\^\+\[\]\(\),~\'?\.\/{}=!$%&:;_-]") # Match any character in set
    return pattern.sub(r'', corpus)

df1['clean_tweet'] = df1['clean_tweet'].apply(lambda x:remove_punct(x))

In [14]:
def remove_numbers(corpus):
    pattern = re.compile(r"\d")
    return pattern.sub(r'', corpus)

df1['clean_tweet'] = df1['clean_tweet'].apply(lambda x: remove_numbers(x))

In [15]:
stopwords_list = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# remove stopwords, tokenize and lemmatize the text
# def preprocess(text, lemma=False):
#     text = str(text).lower().strip()
#     tokens = []
#     for token in text.split():
#         if token not in stopwords_list:
#             if lemma:
#                 tokens.append(lemmatizer.lemmatize(token))
#             else:
#                 tokens.append(token)
#     return " ".join(tokens)

# df1['tweet_text']=df1['tweet_text'].apply(lambda x:preprocess(x))

In [16]:
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [17]:
df1['char_count'] = df1['tweet_text'].str.len() #how many characters do we have in description? 
print(df1[['tweet_text','char_count']].head())
print(df1['char_count'].mean())

                                          tweet_text  char_count
0  .@wesley83 I have a 3G iPhone. After 3 hrs twe...         127
1  @jessedee Know about @fludapp ? Awesome iPad/i...         139
2  @swonderlin Can not wait for #iPad 2 also. The...          79
3  @sxsw I hope this year's festival isn't as cra...          82
4  @sxtxstate great stuff on Fri #SXSW: Marissa M...         131
104.95106125591114


In [18]:
df1['stopwords'] = df1['tweet_text'].apply(lambda x: len([x for x in x.split() if x in stopwords_list]))
df1[['tweet_text','stopwords']].head(10)

Unnamed: 0,tweet_text,stopwords
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,7
1,@jessedee Know about @fludapp ? Awesome iPad/i...,6
2,@swonderlin Can not wait for #iPad 2 also. The...,6
3,@sxsw I hope this year's festival isn't as cra...,5
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,1
5,@teachntech00 New iPad Apps For #SpeechTherapy...,0
6,,0
7,"#SXSW is just starting, #CTIA is around the co...",14
8,Beautifully smart and simple idea RT @madebyma...,4
9,Counting down the days to #sxsw plus strong Ca...,5


In [19]:
# remove stopwords
df1['clean_tweet'] = df1['clean_tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stopwords_list))
df1['clean_tweet'].head()

0    g iphone hrs tweeting dead need upgrade plugin...
1    know awesome ipadiphone app youll likely appre...
2                                       wait also sale
3     hope years festival isnt crashy years iphone app
4    great stuff fri marissa mayer google tim oreil...
Name: clean_tweet, dtype: object

In [None]:
joined_tweet = ' '.join(df1['tweet_text'].tolist())
joined_tweet

In [None]:
bigram_measure = nltk.collocations.BigramAssocMeasures()

In [None]:
tweet_finder = BigramCollocationFinder.from_words(tweet)

In [None]:
# def get_tweet_bigram(corpus,n):
#     vec=CountVectorizer(ngram_range=(2,2))
#     bow=vec.fit_transform(corpus)
#     sum_word=bow.sum(axis=0)
#     word_freq=[(x,sum_word[0,i]) for x,i in vec.vocabulary_.items()]
#     word_freq=sorted(word_freq,key=lambda x:x[1],reverse=True)
#     return word_freq[:n]

In [None]:
plt.figure(figsize=(10,5))
top_tweet_bigram=get_tweet_bigram(df1['tweet_text'],10)
x,y=map(list,zip(*top_tweet_bigram))
sns.barplot(y,x)

In [None]:
df1['tweet_text']

In [None]:
nltk.download('wordnet')

In [None]:
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('googles')