### From database of companies, pick 3 different retailers to use for testing purposes

In [1]:
#load data from mongodb's collection of tweets_by_companies into pandas. Run mongod to start mongodb
import pandas as pd
import pymongo
from pymongo import MongoClient
mc = MongoClient()
db = mc.twitter_db
input_data = db.tweets_by_companies
data = pd.DataFrame(list(input_data.find()))


In [2]:
data['company'] = data['user'].apply(lambda x: x['screen_name'].encode('ascii'))

In [3]:
data['company'].value_counts()

eBay               6496
ChipotleTweets     6259
netflix            5342
hm                 3738
SUBWAY             3586
AppStore           3579
Nike               3560
BedBathBeyond      3442
Disney             3343
Sephora            3263
pizzahut           3251
OldNavy            3250
Staples            3249
IHOP               3248
Walgreens          3247
nikestore          3247
lushcosmetics      3245
Sears              3245
Fandango           3245
HomeDepot          3244
Target             3242
Macys              3241
Gap                3241
Forever21          3241
QVC                3237
Walmart            3236
jcpenney           3234
tjmaxx             3234
Lowes              3233
Ticketmaster       3230
                   ... 
PetSmart           3224
Kohls              3223
zappos             3223
DunkinDonuts       3218
IKEAUSA            3216
bathbodyworks      3214
hotelsdotcom       3213
UrbanOutfitters    3213
CVS_Extra          3212
Nordstrom          3212
BNBuzz          

### filter data to only show company name and tweet. Also drop any duplicate tweets

In [4]:
important_columns = ['company', 'text']
company_tweets = data.loc[: , important_columns]
company_tweets = company_tweets.drop_duplicates()
company_tweets['company'].value_counts()

BedBathBeyond      3419
Disney             3341
AppStore           3259
Sephora            3257
OldNavy            3250
Staples            3249
pizzahut           3247
Walgreens          3246
eBay               3246
lushcosmetics      3245
IHOP               3245
netflix            3242
Target             3242
Sears              3241
Macys              3241
Gap                3241
Nike               3240
Fandango           3240
Forever21          3240
nikestore          3238
HomeDepot          3234
Walmart            3233
Lowes              3231
BestBuy            3228
SUBWAY             3228
lululemon          3225
BananaRepublic     3225
ChickfilA          3224
QVC                3224
zappos             3223
                   ... 
bathbodyworks      3214
UrbanOutfitters    3213
Kohls              3212
Nordstrom          3212
CVS_Extra          3212
tjmaxx             3210
DunkinDonuts       3208
BNBuzz             3208
ContainerStore     3207
Cinemark           3207
hotelsdotcom    

In [3]:
ls

README.md  [34mdata[m[m/      [34mtwitter[m[m/


In [4]:
import pandas as pd
import cPickle as pickle
company_tweets = pd.read_pickle('data/company_tweets.pkl')

In [5]:
mini_retailers_df = company_tweets[(company_tweets['company'] == 'Disney') | (company_tweets['company'] == 'Sephora') | (company_tweets['company'] == 'HomeDepot')]
mini_retailers_df

Unnamed: 0,company,text
31131,Disney,Start your day with BB-8. Full recipe here: ht...
31132,Disney,On a roll. #TheForceAwakens https://t.co/aiLFS...
31133,Disney,Ready the lightsabers. @StarWars: #TheForceAwa...
31134,Disney,Get lost inside The #JungleBook.\nhttps://t.co...
31135,Disney,"""I do not like the cone of shame."" https://t.c..."
31136,Disney,RT @starwars: Incoming transmission from Londo...
31137,Disney,RT @DisneyStudios: See a new extended version ...
31138,Disney,Zzz... https://t.co/RbZoTP8SPK
31139,Disney,The Force is strong with John Boyega. #TheForc...
31140,Disney,"""It's my birthday gift to me. I'm so happy."" h..."


### Stemming each tweet

In [6]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

snowball = SnowballStemmer('english')

### Python 2 unicode values, translate() only takes a dictionary

In [7]:
translation_table = dict.fromkeys(map(ord, ')(][:.",!#&;$?'), None)
mini_retailers_df['stemmed_text'] = mini_retailers_df['text'].apply(lambda sent: ' '.join([snowball.stem(word) for \
                                    word in sent.translate(translation_table).split()]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [8]:
mini_retailers_df

Unnamed: 0,company,text,stemmed_text
31131,Disney,Start your day with BB-8. Full recipe here: ht...,start your day with bb-8 full recip here https...
31132,Disney,On a roll. #TheForceAwakens https://t.co/aiLFS...,on a roll theforceawaken https//tco/ailfsrvs6u
31133,Disney,Ready the lightsabers. @StarWars: #TheForceAwa...,readi the lightsab @starwar theforceawaken is ...
31134,Disney,Get lost inside The #JungleBook.\nhttps://t.co...,get lost insid the junglebook https//tco/aiyeb...
31135,Disney,"""I do not like the cone of shame."" https://t.c...",i do not like the cone of shame https//tco/wwu...
31136,Disney,RT @starwars: Incoming transmission from Londo...,rt @starwar incom transmiss from london join u...
31137,Disney,RT @DisneyStudios: See a new extended version ...,rt @disneystudio see a new extend version of t...
31138,Disney,Zzz... https://t.co/RbZoTP8SPK,zzz https//tco/rbzotp8spk
31139,Disney,The Force is strong with John Boyega. #TheForc...,the forc is strong with john boyega theforceaw...
31140,Disney,"""It's my birthday gift to me. I'm so happy."" h...",it my birthday gift to me i'm so happi https//...


### Import sklearn's TfidfVectorizer and create a vector for each company

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict

In [10]:
company_list = mini_retailers_df['company'].value_counts().index
company_list

Index([u'Disney', u'Sephora', u'HomeDepot'], dtype='object')

In [11]:
tfidf = TfidfVectorizer(stop_words='english')

In [12]:
vects = defaultdict(list)
for company in company_list:
    documents = mini_retailers_df[mini_retailers_df['company'] == company].stemmed_text
    vect_name = 'tfidf_'+company
    print documents.values[0]
    vects[vect_name] = tfidf.fit_transform(documents)

start your day with bb-8 full recip here https//tco/ccwo1nkpop theforceawaken https//tco/fs8u3zb8td
@mirahoward all item on our canadian site should be abl to be ship to canada which product did you tri
make sure your car is road trip readi befor you hit the road this holiday see how https//tco/d0fzwmbezf bfs https//tco/mxrcrkriw5


In [13]:
vects['tfidf_Disney']

<3341x7540 sparse matrix of type '<type 'numpy.float64'>'
	with 28666 stored elements in Compressed Sparse Row format>

### Take a shard of users and their tweets 

In [9]:
open('~/tweets_user.json')

IOError: [Errno 2] No such file or directory: '~/tweets_user.json'

In [8]:
import json
tweets = json.load(open('~/tweets_user.json'))

IOError: [Errno 2] No such file or directory: '~/tweets_user.json'

In [None]:
#Take shards of 10000 from file

from itertools import izip_longest

def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)

for i, group in enumerate(grouper(input_tweets, 10000)):
    with open('userbatch_{}.json'.format(i), 'w') as outputfile:
        json.dump(list(group), outputfile)

In [2]:
# input_data = db.tweets_by_users
import pandas as pd
users = pd.read_csv('~/tweets_user.csv', nrows=25000, error_bad_lines=False)

Skipping line 4: expected 117 fields, saw 154
Skipping line 8: expected 117 fields, saw 150
Skipping line 13: expected 117 fields, saw 152
Skipping line 14: expected 117 fields, saw 149
Skipping line 20: expected 117 fields, saw 161
Skipping line 23: expected 117 fields, saw 138
Skipping line 46: expected 117 fields, saw 138
Skipping line 55: expected 117 fields, saw 145
Skipping line 62: expected 117 fields, saw 157
Skipping line 78: expected 117 fields, saw 125
Skipping line 95: expected 117 fields, saw 253
Skipping line 114: expected 117 fields, saw 143
Skipping line 127: expected 117 fields, saw 216
Skipping line 131: expected 117 fields, saw 144
Skipping line 136: expected 117 fields, saw 245
Skipping line 140: expected 117 fields, saw 139
Skipping line 141: expected 117 fields, saw 237
Skipping line 143: expected 117 fields, saw 154
Skipping line 156: expected 117 fields, saw 143
Skipping line 157: expected 117 fields, saw 138
Skipping line 165: expected 117 fields, saw 144
Skipp

In [3]:
users.head()

Unnamed: 0,"{ ""_id"" : { ""$oid"" : ""567494cd60327009c32227c2"" }","""contributors"" : null","""truncated"" : false","""text"" : ""@Cheesecake for dinner 💯 https://t.co/Ce3hzP8F0s""","""is_quote_status"" : false","""in_reply_to_status_id"" : null","""id"" : 677976514831499264","""favorite_count"" : 13","""source"" : ""<a href=\""http://twitter.com/download/iphone\"" rel=\""nofollow\"">Twitter for iPhone</a>""","""retweeted"" : false",...,"""resize"" : ""fit"".5","""w"" : 600 }.1","""thumb"" : { ""h"" : 150.1","""resize"" : ""crop"".1","""w"" : 150 } }.1","""indices"" : [ 25.1",48 ].1,"""type"" : ""photo"".1","""id"" : 677976504031035393.1","""media_url"" : ""http://pbs.twimg.com/media/CWioCdDUwAEjpum.jpg"" } ] } }"
0,"{ ""_id"" : { ""$oid"" : ""567494cd60327009c32227c3"" }","""contributors"" : null","""truncated"" : false","""text"" : ""@thetenspot awesome!!!""","""is_quote_status"" : false","""in_reply_to_status_id"" : 677950676618596352","""id"" : 677975726956617728","""favorite_count"" : 0","""source"" : ""<a href=\""http://twitter.com/down...","""retweeted"" : false",...,,,,,,,,,,
1,"{ ""_id"" : { ""$oid"" : ""567494cd60327009c32227c4"" }","""contributors"" : null","""truncated"" : false","""text"" : ""@stajans_girl @pkumaaar_ LMAO YESSSSS""","""is_quote_status"" : false","""in_reply_to_status_id"" : 677933423894372356","""id"" : 677934558076121088","""favorite_count"" : 2","""source"" : ""<a href=\""http://twitter.com/down...","""retweeted"" : false",...,,,,,,,,,,
2,"{ ""_id"" : { ""$oid"" : ""567494cd60327009c32227c6"" }","""contributors"" : null","""truncated"" : false","""text"" : ""@stajans_girl I bought Pooj chocola...","""is_quote_status"" : false","""in_reply_to_status_id"" : 677929295596134400","""id"" : 677929425212735488","""favorite_count"" : 0","""source"" : ""<a href=\""http://twitter.com/down...","""retweeted"" : false",...,,,,,,,,,,
3,"{ ""_id"" : { ""$oid"" : ""567494cd60327009c32227c7"" }","""contributors"" : null","""truncated"" : false","""text"" : ""@stajans_girl YO I need to see you ...","""is_quote_status"" : false","""in_reply_to_status_id"" : null","""id"" : 677929173416091648","""favorite_count"" : 0","""source"" : ""<a href=\""http://twitter.com/down...","""retweeted"" : false",...,,,,,,,,,,
4,"{ ""_id"" : { ""$oid"" : ""567494cd60327009c32227c8"" }","""contributors"" : null","""truncated"" : false","""text"" : ""Mulberry Cafe has the cutest tree!!...","""is_quote_status"" : false","""in_reply_to_status_id"" : null","""id"" : 677928806229876737","""favorite_count"" : 0","""source"" : ""<a href=\""http://twitter.com/down...","""retweeted"" : false",...,"""indices"" : [ 36",59 ],"""type"" : ""photo""","""id"" : 677928796419395585","""media_url"" : ""http://pbs.twimg.com/media/CWh...",,,,,


In [14]:
from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [15]:
sentiment = SentimentAnalyzer()

In [16]:
sid = SentimentIntensityAnalyzer()
score = sid.polarity_scores('It was a decent movie')


### Use Cosine Similarity to compare documents of user tweets to company tweets

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
openssl req -x509 -nodes -days 365 -newkey rsa:1024 -keyout mycert.pem -out mycert.pem