### From database of companies, pick 3 different retailers to use for testing purposes

In [None]:
#load data from mongodb's collection of tweets_by_companies into pandas. Run mongod to start mongodb
import pandas as pd
import pymongo
from pymongo import MongoClient
mc = MongoClient()
db = mc.twitter_db
input_data = db.tweets_by_companies
data = pd.DataFrame(list(input_data.find()))


In [None]:
data['company'] = data['user'].apply(lambda x: x['screen_name'].encode('ascii'))

In [None]:
data['company'].value_counts()

### filter data to only show company name and tweet. Also drop any duplicate tweets

In [None]:
important_columns = ['company', 'text']
company_tweets = data.loc[: , important_columns]
company_tweets = company_tweets.drop_duplicates()
company_tweets['company'].value_counts()

## Start here ------------------------------------------------------------------------- 

In [1]:
import pandas as pd
import cPickle as pickle
company_tweets = pd.read_pickle('../data/company_tweets.pkl')

In [2]:
#mini_retailers_df = company_tweets[(company_tweets['company'] == 'Disney') | (company_tweets['company'] == 'Sephora') | (company_tweets['company'] == 'HomeDepot')]
mini_retailers_df = company_tweets
mini_retailers_df

Unnamed: 0,company,text
0,aeriagames,RT @MMOsdotcom: Latest Dragomon Hunter Update ...
1,aeriagames,RT @DragomonHunter: Swear allegiance to the Fr...
2,aeriagames,We just went live with our new corporate websi...
3,aeriagames,RT @EOS_game: Congratulations @Meruna_ for you...
4,aeriagames,Congratulations @Meruna_ for the awesome @EOS_...
5,aeriagames,"RT @bgfcon: ""Leveraging human buying behaviors..."
6,aeriagames,RT @GIBiz: Aeria Games opens new San Francisco...
7,aeriagames,The hunt is on! Dragomon Hunter's Open Beta is...
8,aeriagames,The closed beta of Dragomon Hunter is now live...
9,aeriagames,RT @GamesMarkt: Verstärkung für das @aeriagame...


### Remove all bit.ly and @ tags from stores' tweets

In [3]:
import re

def remove_links_and_tags(words):
    if 'http' in words or '@' in words:
        return re.sub('(RT)|(RT @[_A-Za-z0-9]+)|(@[_A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', '', words)
    return words 

def remove_links(words):
    if 'http' in words:
        return re.sub('([^0-9A-Za-z \t])|(\w+:\/\/\S+)', '', words)
    return words 

In [4]:
mini_retailers_df['simple_text'] = mini_retailers_df['text'].apply(lambda x: remove_links_and_tags(x))

In [5]:
mini_retailers_df[mini_retailers_df['company'] =='lululemon'].simple_text

185414                              The cuddle game is real
185415                        Wild thing I think I love you
185416                            Howd they keep you toasty
185417                      Which pair is on your wish list
185418                 For the win Which jacket did you get
185419                                      Way to go Coach
185420                                       Oh you know us
185421                                The old baitandswitch
185422     Wow what a hook up But it must be for all the...
185423                      Hard to keep it on the down low
185424                      Sounds like you scored big time
185425         Which pair is giving you those good feelings
185426                             Happy to be on your team
185427                                    Thats how we roll
185428     Its the fuel for your Christmas happiness fue...
185429     Good luck on the race this weekend and enjoy ...
185430                          Hard wor

### Stemming each tweet

In [6]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

snowball = SnowballStemmer('english')

### Python 2 unicode values, translate() only takes a dictionary

In [7]:
translation_table = dict.fromkeys(map(ord, ')(][:.",!#&;$?'), None)
mini_retailers_df['stemmed_text'] = mini_retailers_df['simple_text'].apply(lambda sent: ' '.join([snowball.stem(word) for \
                                    word in sent.translate(translation_table).split()]))

In [8]:
mini_retailers_df[mini_retailers_df['company']=='aeriagames'].values[5]

array(['aeriagames',
       u'RT @bgfcon: "Leveraging human buying behaviors for F2P games success - A devil\'s handbook" by Sebastian Voigt (@aeriagames) https://t.co/4X\u2026',
       u'  Leveraging human buying behaviors for F2P games success  A devils handbook by Sebastian Voigt  ',
       u'leverag human buy behavior for f2p game success a devil handbook by sebastian voigt'], dtype=object)

### Import sklearn's TfidfVectorizer and create a vector for each company

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict

In [10]:
company_list = mini_retailers_df['company'].value_counts().index
company_list

Index([u'BedBathBeyond', u'Disney', u'AppStore', u'Sephora', u'OldNavy',
       u'Staples', u'pizzahut', u'Walgreens', u'eBay', u'lushcosmetics',
       u'IHOP', u'netflix', u'Target', u'Sears', u'Macys', u'Gap', u'Nike',
       u'Fandango', u'Forever21', u'nikestore', u'HomeDepot', u'Walmart',
       u'Lowes', u'BestBuy', u'SUBWAY', u'lululemon', u'BananaRepublic',
       u'ChickfilA', u'QVC', u'zappos', u'Ticketmaster', u'VictoriasSecret',
       u'ChipotleTweets', u'jcpenney', u'PetSmart', u'hm', u'IKEAUSA',
       u'bathbodyworks', u'UrbanOutfitters', u'Kohls', u'Nordstrom',
       u'CVS_Extra', u'tjmaxx', u'DunkinDonuts', u'BNBuzz', u'ContainerStore',
       u'Cinemark', u'hotelsdotcom', u'Starbucks', u'Aeropostale',
       u'Cheesecake', u'ULTA_Beauty', u'REI', u'McDonalds', u'SportsAuthority',
       u'MichaelsStores', u'ToysRUs', u'amazon', u'Etsy', u'redlobster',
       u'BabiesRUs', u'aeriagames', u'iTunes', u'WholeFoods', u'GameStop',
       u'tacobell', u'olivegarden'],
   

In [None]:
mini_retailers_df['company'].value_counts()

In [41]:
#turn each company's tweet history into a document
com_docs = []
for company in company_list:
    a = ' '.join([text for text in mini_retailers_df[mini_retailers_df['company']==company].stemmed_text.values])
    com_docs.append(a)

In [None]:
' '.join([text for text in mini_retailers_df[mini_retailers_df['company']=='Disney'].stemmed_text.values])

In [None]:
#TFIDF for each company
company_vects = defaultdict(list)
for company in company_list:
    documents = mini_retailers_df[mini_retailers_df['company'] == company].stemmed_text
    vect_name = 'tfidf_'+company
    print documents.values[0]
    company_vects[vect_name] = tfidf.fit_transform(documents)

In [None]:
company_vects['tfidf_Disney']

### Take a shard of users and their tweets 

In [11]:
import pymongo
from pymongo import MongoClient
mc = MongoClient()
db = mc.twitter_db

input_data = db.tweets_by_users
users = pd.DataFrame(list(input_data.find().limit(50000)))

In [12]:
users['user_name'] = users['user'].apply(lambda x: x['screen_name'].encode('ascii'))

In [13]:
users['user_name'].value_counts()

USUAggie1990      3232
chuck_haze        3229
emiiistyles       3225
swnseanews        3224
chunkypots        3221
ChristinaAAAHD    3215
quiwop            3213
nakiyarussell     3199
lovecoutureee_    3194
__Ganjaaa         3175
_justolentino     3166
matt_brownlee8    3142
YZAKALLO          3080
trippnationn      3017
Itss_angel_cx     2782
yungjosey         2388
JULIANAMEGGERS     197
RadtkeKristin      100
royray0              1
Name: user_name, dtype: int64

In [14]:
important_columns = ['user_name', 'text']
user_df = users.loc[: , important_columns]

In [15]:
user_df

Unnamed: 0,user_name,text
0,chunkypots,@Cheesecake for dinner 💯 https://t.co/Ce3hzP8F0s
1,chunkypots,@thetenspot awesome!!!
2,chunkypots,@stajans_girl @pkumaaar_ LMAO YESSSSS
3,chunkypots,💖 https://t.co/cOLa2JYvUC
4,chunkypots,@stajans_girl I bought Pooj chocolate dipped O...
5,chunkypots,@stajans_girl YO I need to see you soon! Just ...
6,chunkypots,Mulberry Cafe has the cutest tree!! https://t....
7,chunkypots,Any bobs burger reference makes the fam strong...
8,chunkypots,@stajans_girl I want to knit just so I can say...
9,chunkypots,@thetenspot what's the name of the quick dry s...


In [16]:
# should rename to stripped_text

user_df['simple_text'] = user_df['text'].apply(lambda x: remove_links(x)) 

In [17]:
user_df

Unnamed: 0,user_name,text,simple_text
0,chunkypots,@Cheesecake for dinner 💯 https://t.co/Ce3hzP8F0s,Cheesecake for dinner
1,chunkypots,@thetenspot awesome!!!,@thetenspot awesome!!!
2,chunkypots,@stajans_girl @pkumaaar_ LMAO YESSSSS,@stajans_girl @pkumaaar_ LMAO YESSSSS
3,chunkypots,💖 https://t.co/cOLa2JYvUC,
4,chunkypots,@stajans_girl I bought Pooj chocolate dipped O...,@stajans_girl I bought Pooj chocolate dipped O...
5,chunkypots,@stajans_girl YO I need to see you soon! Just ...,@stajans_girl YO I need to see you soon! Just ...
6,chunkypots,Mulberry Cafe has the cutest tree!! https://t....,Mulberry Cafe has the cutest tree
7,chunkypots,Any bobs burger reference makes the fam strong...,Any bobs burger reference makes the fam strong...
8,chunkypots,@stajans_girl I want to knit just so I can say...,@stajans_girl I want to knit just so I can say...
9,chunkypots,@thetenspot what's the name of the quick dry s...,@thetenspot what's the name of the quick dry s...


### Analyze sentiment for each tweet

#### nltk.sentiment.vader is used because it takes into consideration special case idioms, capitalization, punctuation, and deals specifically with social media texts

VADER (Valence Aware Dictionary and sEntiment Reasoner) 

source code: http://www.nltk.org/_modules/nltk/sentiment/vader.html

Github: https://github.com/cjhutto/vaderSentiment

VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text 
(by C.J. Hutto and Eric Gilbert) 
Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.  http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf

In [18]:
from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [19]:
sid = SentimentIntensityAnalyzer()
user_df['sentiment'] = user_df['text'].apply(lambda x: sid.polarity_scores(x))

In [20]:
user_df['sentiment']

0        {u'neg': 0.0, u'neu': 1.0, u'pos': 0.0, u'comp...
1        {u'neg': 0.0, u'neu': 0.167, u'pos': 0.833, u'...
2        {u'neg': 0.0, u'neu': 0.393, u'pos': 0.607, u'...
3        {u'neg': 0.0, u'neu': 1.0, u'pos': 0.0, u'comp...
4        {u'neg': 0.0, u'neu': 1.0, u'pos': 0.0, u'comp...
5        {u'neg': 0.0, u'neu': 1.0, u'pos': 0.0, u'comp...
6        {u'neg': 0.0, u'neu': 0.578, u'pos': 0.422, u'...
7        {u'neg': 0.0, u'neu': 0.755, u'pos': 0.245, u'...
8        {u'neg': 0.0, u'neu': 0.92, u'pos': 0.08, u'co...
9        {u'neg': 0.0, u'neu': 1.0, u'pos': 0.0, u'comp...
10       {u'neg': 0.0, u'neu': 1.0, u'pos': 0.0, u'comp...
11       {u'neg': 0.0, u'neu': 0.602, u'pos': 0.398, u'...
12       {u'neg': 0.0, u'neu': 0.74, u'pos': 0.26, u'co...
13       {u'neg': 0.0, u'neu': 0.93, u'pos': 0.07, u'co...
14       {u'neg': 0.0, u'neu': 0.828, u'pos': 0.172, u'...
15       {u'neg': 0.0, u'neu': 1.0, u'pos': 0.0, u'comp...
16       {u'neg': 0.0, u'neu': 1.0, u'pos': 0.0, u'comp.

#### Keeping only tweets classified as positive

In [21]:
pos_user_tweets = user_df[user_df['sentiment'].apply(lambda x: x['pos']) > .49]

In [22]:
pos_user_tweets

Unnamed: 0,user_name,text,simple_text,sentiment
1,chunkypots,@thetenspot awesome!!!,@thetenspot awesome!!!,"{u'neg': 0.0, u'neu': 0.167, u'pos': 0.833, u'..."
2,chunkypots,@stajans_girl @pkumaaar_ LMAO YESSSSS,@stajans_girl @pkumaaar_ LMAO YESSSSS,"{u'neg': 0.0, u'neu': 0.393, u'pos': 0.607, u'..."
20,chunkypots,Always the BEST service and amazing manicure a...,Always the BEST service and amazing manicure a...,"{u'neg': 0.0, u'neu': 0.458, u'pos': 0.542, u'..."
28,chunkypots,@rohinghai friendship over.,@rohinghai friendship over.,"{u'neg': 0.0, u'neu': 0.408, u'pos': 0.592, u'..."
41,chunkypots,@JovanHeer lol!! Haha okay that's fair,@JovanHeer lol!! Haha okay that's fair,"{u'neg': 0.0, u'neu': 0.159, u'pos': 0.841, u'..."
50,chunkypots,@rohinghai LMAO oh god,@rohinghai LMAO oh god,"{u'neg': 0.0, u'neu': 0.229, u'pos': 0.771, u'..."
51,chunkypots,@rohinghai LMAO how did you just have that on ...,@rohinghai LMAO how did you just have that on ...,"{u'neg': 0.0, u'neu': 0.505, u'pos': 0.495, u'..."
53,chunkypots,@rohinghai you'll need fair n lovely,@rohinghai you'll need fair n lovely,"{u'neg': 0.0, u'neu': 0.33, u'pos': 0.67, u'co..."
73,chunkypots,Drinks n burgers after work with my best frien...,Drinks n burgers after work with my best frien...,"{u'neg': 0.0, u'neu': 0.444, u'pos': 0.556, u'..."
90,chunkypots,@rohinghai lmao I clicked reply,@rohinghai lmao I clicked reply,"{u'neg': 0.0, u'neu': 0.435, u'pos': 0.565, u'..."


In [23]:
pos_user_tweets['user_name'].value_counts()

_justolentino     324
nakiyarussell     241
emiiistyles       218
quiwop            171
chunkypots        160
trippnationn      141
lovecoutureee_    132
Itss_angel_cx     121
USUAggie1990      114
ChristinaAAAHD    102
yungjosey          99
matt_brownlee8     78
YZAKALLO           77
chuck_haze         72
__Ganjaaa          56
JULIANAMEGGERS      6
swnseanews          2
RadtkeKristin       2
Name: user_name, dtype: int64

#### Keeping only tweets classified as non negative

In [24]:
nneg_user_tweets = user_df[user_df['sentiment'].apply(lambda x: x['neg']) < .49]

In [25]:
nneg_user_tweets['user_name'].value_counts()

swnseanews        3224
emiiistyles       3203
ChristinaAAAHD    3189
USUAggie1990      3187
chunkypots        3153
chuck_haze        3151
nakiyarussell     3127
__Ganjaaa         3117
lovecoutureee_    3115
_justolentino     3100
quiwop            3081
matt_brownlee8    3063
YZAKALLO          2984
trippnationn      2907
Itss_angel_cx     2697
yungjosey         2242
JULIANAMEGGERS     196
RadtkeKristin      100
royray0              1
Name: user_name, dtype: int64

### Time to stem each tweet using snowball

In [26]:
pos_user_tweets['stemmed_text'] = pos_user_tweets['simple_text'].apply(lambda sent: ' '.join([snowball.stem(word) for \
                                    word in sent.translate(translation_table).split()]))
nneg_user_tweets['stemmed_text'] = nneg_user_tweets['simple_text'].apply(lambda sent: ' '.join([snowball.stem(word) \
                                   for word in sent.translate(translation_table).split()]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


### Create a list of users

In [27]:
user_list = nneg_user_tweets['user_name'].value_counts().index
user_list

Index([u'swnseanews', u'emiiistyles', u'ChristinaAAAHD', u'USUAggie1990',
       u'chunkypots', u'chuck_haze', u'nakiyarussell', u'__Ganjaaa',
       u'lovecoutureee_', u'_justolentino', u'quiwop', u'matt_brownlee8',
       u'YZAKALLO', u'trippnationn', u'Itss_angel_cx', u'yungjosey',
       u'JULIANAMEGGERS', u'RadtkeKristin', u'royray0'],
      dtype='object')

In [None]:
# user_vects = defaultdict(list)
# for user in user_list:
#     documents = pos_user_tweets[pos_user_tweets['user_name'] == user].stemmed_text
#     vect_name = 'tfidf_'+user
#     #print documents.values[0]
#     user_vects[vect_name] = tfidf.fit_transform(documents)

In [28]:
user_docs = []
for user in user_list:
    a = ' '.join([text for text in nneg_user_tweets[nneg_user_tweets['user_name']==user].stemmed_text.values])
    user_docs.append(a)

### Multinomial Naive Bayes

In [29]:
from sklearn.naive_bayes import MultinomialNB

In [57]:
user_docs_pos = []
for user in user_list:
    a = ' '.join([text for text in pos_user_tweets[pos_user_tweets['user_name']== user].text.values])
    user_docs_pos.append(a)

In [58]:
user_docs_nneg = []
for user in user_list:
    a = ' '.join([text for text in nneg_user_tweets[nneg_user_tweets['user_name']== user].text.values])
    user_docs_nneg.append(a)

In [38]:
mini_retailers_df[mini_retailers_df['company'] == 'BNBuzz']

Unnamed: 0,company,text,simple_text,stemmed_text
214464,BNBuzz,Getting pumped for our #BNGiftRec chat with @J...,Getting pumped for our BNGiftRec chat with it ...,get pump for our bngiftrec chat with it start ...
214465,BNBuzz,Shopping for a teen? We bet these page-and-scr...,Shopping for a teen We bet these pageandscreen...,shop for a teen we bet these pageandscreen boo...
214466,BNBuzz,Santa is NOT an easy character to catch! Bring...,Santa is NOT an easy character to catch Bring ...,santa is not an easi charact to catch bring th...
214467,BNBuzz,RT @BNTeens: Jane Austen's EMMA is 200 this mo...,Jane Austens EMMA is 200 this month Celebrat...,jane austen emma is 200 this month celebr with...
214468,BNBuzz,RT @tyleroakley: thanks Barnes &amp; Noble for...,thanks Barnes amp Noble for listing BINGE as...,thank barn amp nobl for list bing as one of no...
214469,BNBuzz,#StarWars fans! We have gift ideas for Jedi of...,StarWars fans We have gift ideas for Jedi of a...,starwar fan we have gift idea for jedi of all ...
214470,BNBuzz,RT @Italian_Movies: Don't forget Barnes and No...,Dont forget Barnes and Noble for great Itali...,dont forget barn and nobl for great italian dv...
214471,BNBuzz,RT @BNTeens: 15 of our most anticipated YA deb...,15 of our most anticipated YA debuts of 2016,15 of our most anticip ya debut of 2016
214472,BNBuzz,RT @bradtaylorbooks: If you're unable to make ...,If youre unable to make it to one of my stop...,if your unabl to make it to one of my stop on ...
214473,BNBuzz,RT @cabinporn: Looking for a last minute copy ...,Looking for a last minute copy of the book ...,look for a last minut copi of the book get fre...


In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
X = mini_retailers_df['stemmed_text']
y = mini_retailers_df['company']
count_vectorizer = CountVectorizer(stop_words='english')
tf = count_vectorizer.fit_transform(X)
clf = MultinomialNB()
clf.fit(tf, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [48]:
# test for a single tweet, should result in aeriagames
user1 = count_vectorizer.transform(["congratul for the awesom guardian cosplay"])

pred1 = clf.predict(user1)
pred1[0]

'aeriagames'

In [68]:
for user in user_docs_pos:
    X1 = count_vectorizer.transform([user])
    pred = clf.predict(X1)
    print pred[0]

amazon
BabiesRUs
BNBuzz
SportsAuthority
zappos
BNBuzz
BabiesRUs
BNBuzz
BNBuzz
BedBathBeyond
jcpenney
ToysRUs
BNBuzz
ToysRUs
ToysRUs
Staples
ToysRUs
IHOP
BedBathBeyond


### Use Cosine Similarity to compare documents of user tweets to company tweets

This kernel is a popular choice for computing the similarity of documents represented as tf-idf vectors. cosine_similarity accepts scipy.sparse matrices. (Note that the tf-idf functionality in sklearn.feature_extraction.text can produce normalized vectors, in which case cosine_similarity is equivalent to linear_kernel, only slower.)

In [39]:
from sklearn.metrics.pairwise import linear_kernel 
import numpy as np

In [62]:
# With positive tweets only

from copy import copy

vect = TfidfVectorizer(stop_words='english')
i = 1
for user_tweets in user_docs_pos:
    all_docs = copy(com_docs)
    all_docs.append(user_tweets)

    X = vect.fit_transform(all_docs)
    M = linear_kernel(X, X)
    sortedM = np.argsort(M[-1])[::-1][1:3]
    a, b = company_list[sortedM]
    numA, numB = M[-1][sortedM]
    print "{}) Top matches: {} ({}%) and {} ({}%)".format(i, a, round(numA * 100, 2), b, round(numB * 100, 2))
    i += 1

1) Top matches: Aeropostale (2.0%) and amazon (1.96%)
2) Top matches: lushcosmetics (11.41%) and BananaRepublic (9.38%)
3) Top matches: lushcosmetics (16.82%) and lululemon (13.29%)
4) Top matches: SportsAuthority (11.27%) and lululemon (9.92%)
5) Top matches: lululemon (5.5%) and lushcosmetics (5.23%)
6) Top matches: lululemon (6.09%) and SportsAuthority (5.45%)
7) Top matches: lushcosmetics (14.33%) and BananaRepublic (11.75%)
8) Top matches: lushcosmetics (6.38%) and BananaRepublic (5.31%)
9) Top matches: lushcosmetics (13.68%) and BananaRepublic (11.0%)
10) Top matches: BedBathBeyond (8.58%) and BananaRepublic (4.63%)
11) Top matches: lushcosmetics (11.82%) and BananaRepublic (9.9%)
12) Top matches: lushcosmetics (8.7%) and BananaRepublic (8.23%)
13) Top matches: lululemon (7.76%) and lushcosmetics (6.87%)
14) Top matches: lululemon (6.5%) and lushcosmetics (5.84%)
15) Top matches: lushcosmetics (5.12%) and BananaRepublic (4.79%)
16) Top matches: BedBathBeyond (3.55%) and lululemon

In [63]:
# With nonnegative tweets

from copy import copy

vect = TfidfVectorizer(stop_words='english')
i = 1
for user_tweets in user_docs_nneg:
    all_docs = copy(com_docs)
    all_docs.append(user_tweets)

    X = vect.fit_transform(all_docs)
    M = linear_kernel(X, X)
    sortedM = np.argsort(M[-1])[::-1][1:3]
    a, b = company_list[sortedM]
    numA, numB = M[-1][sortedM]
    print "{}) Top matches: {} ({}%) and {} ({}%)".format(i, a, round(numA * 100, 2), b, round(numB*100, 2))
    i += 1

1) Top matches: Disney (0.71%) and amazon (0.66%)
2) Top matches: hotelsdotcom (3.76%) and SportsAuthority (3.05%)
3) Top matches: lululemon (6.29%) and SportsAuthority (4.72%)
4) Top matches: SportsAuthority (7.41%) and AppStore (6.43%)
5) Top matches: lululemon (7.44%) and SportsAuthority (6.03%)
6) Top matches: lululemon (5.41%) and hotelsdotcom (4.96%)
7) Top matches: lululemon (5.99%) and SportsAuthority (5.28%)
8) Top matches: hotelsdotcom (3.46%) and lululemon (2.39%)
9) Top matches: lululemon (4.24%) and hotelsdotcom (4.2%)
10) Top matches: BedBathBeyond (4.67%) and BananaRepublic (2.66%)
11) Top matches: lululemon (7.45%) and SportsAuthority (6.87%)
12) Top matches: hotelsdotcom (3.93%) and lululemon (3.73%)
13) Top matches: lululemon (7.13%) and SportsAuthority (5.94%)
14) Top matches: lululemon (4.12%) and hotelsdotcom (4.09%)
15) Top matches: hotelsdotcom (3.5%) and lululemon (3.29%)
16) Top matches: lululemon (4.42%) and hotelsdotcom (4.03%)
17) Top matches: lululemon (6.4

In [None]:
# cosine similartity between stores & 1 user
test_doc = copy(com_docs)
test_doc.append(user_docs[0])

vect = TfidfVectorizer(stop_words='english')
X = vect.fit_transform(test_doc)
M = linear_kernel(X, X)
sortedM = np.argsort(M[-1])[::-1][1:3]
a, b = company_list[sortedM]
print a, b
print M[-1]

In [None]:
# top features

indices = np.argsort(vect.idf_)[::-1]
features= vect.get_feature_names()
top_n = 20
top_features = [features[i] for i in indices[:top_n]]
print top_features

In [None]:
company_list[sortedM2]

In [None]:
sortedM2 = np.argsort(M[-1])[::-1][1:3]

### Trying out OneVsRestClassifier

In [52]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC # using LinearSVC since it works well with a large number of features

In [53]:
X = mini_retailers_df['stemmed_text']
y = mini_retailers_df['company']
tfid_vectorizer = TfidfVectorizer(stop_words='english')
tf3 = tfid_vectorizer.fit_transform(X)
OvR = OneVsRestClassifier(LinearSVC(random_state=0))
OvR.fit(tf3, y)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0),
          n_jobs=1)

In [143]:
for user in user_docs_pos:
    X1 = tfid_vectorizer.transform([user])
    pred = OvR.predict(X1)
    print pred

['Ticketmaster']
['UrbanOutfitters']
['hotelsdotcom']
['SportsAuthority']
['zappos']
['hotelsdotcom']
['hotelsdotcom']
['hotelsdotcom']
['hotelsdotcom']
['BedBathBeyond']
['zappos']
['hotelsdotcom']
['hotelsdotcom']
['hotelsdotcom']
['UrbanOutfitters']
['BedBathBeyond']
['UrbanOutfitters']
['IHOP']
['ChickfilA']


In [140]:
user = "love music macbook album rap pop billboard"
X1 = tfid_vectorizer.transform([user])
pred = OvR.predict_proba(X1)
print pred

AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

### Assess accuracy of the models by splitting up training data (stores)

In [70]:
from sklearn.cross_validation import train_test_split

In [97]:
X = mini_retailers_df['stemmed_text']
y = mini_retailers_df['company']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [82]:
# MultinomialNB

count_vectorizer = CountVectorizer(stop_words='english')
tf_nb = count_vectorizer.fit_transform(X_train)
clf = MultinomialNB()
clf.fit(tf_nb, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [86]:
test_cv = count_vectorizer.transform(X_test)

clf.score(test_cv, y_test)

0.60408222350897511

In [87]:
# OneVsRest

tfid_vectorizer = TfidfVectorizer(stop_words='english')
tf_OvR = tfid_vectorizer.fit_transform(X_train)
OvR = OneVsRestClassifier(LinearSVC(random_state=0))
OvR.fit(tf_OvR, y_train)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0),
          n_jobs=1)

In [88]:
test_OvR = tfid_vectorizer.transform(X_test)

OvR.score(test_OvR, y_test)

0.69554622659718202

### Looks like OvR is better than NB, but lets test with KFolds CV

In [89]:
from sklearn.cross_validation import KFold

In [94]:
n = len(y) # 207237
kf = KFold(n, n_folds=7, shuffle=True, random_state=7)

In [137]:
# iloc is used because some indeces are skipped, can see difference with max(index) & len(index)
# can fix with X.index = range(len(X))

for train_index, test_index in kf:
    test = train_index
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    count_vectorizer = CountVectorizer(stop_words='english')
    tf_nb = count_vectorizer.fit_transform(X_train)
    clf = MultinomialNB()
    clf.fit(tf_nb, y_train)
    
    tfid_vectorizer = TfidfVectorizer(stop_words='english')
    tf_OvR = tfid_vectorizer.fit_transform(X_train)
    OvR = OneVsRestClassifier(LinearSVC(random_state=0))
    OvR.fit(tf_OvR, y_train)
    
    test_cv = count_vectorizer.transform(X_test)
    test_OvR = tfid_vectorizer.transform(X_test)
    
    print "MultinomialNB score: {} \nOvR score: {}\n".format(clf.score(test_cv, y_test), OvR.score(test_OvR, y_test))

MultinomialNB score: 0.602648111869 
OvR score: 0.69594001216

MultinomialNB score: 0.609977707222 
OvR score: 0.700094575424

MultinomialNB score: 0.604526262456 
OvR score: 0.698395541294

MultinomialNB score: 0.605100489782 
OvR score: 0.69863198784

MultinomialNB score: 0.604424928222 
OvR score: 0.700287113663

MultinomialNB score: 0.605269380172 
OvR score: 0.692788380341

MultinomialNB score: 0.607093396386 
OvR score: 0.69829420706



#### Conclusion: OvR is consistently more accurate than MultinomialNB by ~10%

In [113]:
len(X)

207237

In [132]:
X.iloc[[0, 1, 3, 207211]]

0         latest dragomon hunter updat introduc 300 vers...
1         swear allegi to the free trapper or warden fed...
3          congratul for your eo costum and prize so awesom
217648    when halloween came i assum that the whole fam...
Name: stemmed_text, dtype: object

In [133]:
X[[0, 1, 3, 207211]]

0         latest dragomon hunter updat introduc 300 vers...
1         swear allegi to the free trapper or warden fed...
3          congratul for your eo costum and prize so awesom
207211                                                     
Name: stemmed_text, dtype: object

In [135]:
max(X.index)

217673

In [136]:
len(X)

207237