In [4]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Strip non-alphanumeric from string
def strip_nonalnum(s):
    return "".join([c for c in s if c.isalnum() or c.isspace()])



In [5]:
sns.set(color_codes=True)

#Create a starting dataframe of review data.
reviews = pd.read_csv("data/yelp_academic_dataset_review.csv",nrows=4000)
reviews.describe()


# Drop all reviews without a score
reviews['stars'].dropna(axis=0,how='any',inplace=True)
# Newlines to spaces.
reviews['text'] = reviews['text'].apply(lambda x : re.sub('\n',' ',x) )
# Remove non-alphanumeric (or space) characters from all review texts
reviews['text'] = reviews['text'].apply(strip_nonalnum)
reviews['positivity'] = reviews['stars'].apply(lambda x : 1 if x > 3 else 0)





In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#vectorizer = CountVectorizer(max_features = 1000,
#                             ngram_range=(1, 2),
#                             stop_words='english',
#                             binary=True)
vectorizer = TfidfVectorizer()


In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

model = RandomForestClassifier(n_estimators = 20)

text = reviews['text'].fillna('')
vectorizer.fit(text)

#- Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(text)

features = vectorizer.get_feature_names()
reviews['positivity'] = reviews['stars'].apply(lambda x : 1 if x > 3 else 0)
y = reviews['positivity']



scores = cross_val_score(model, X, y, scoring='roc_auc')

print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))
print X





CV AUC [ 0.78394871  0.75358665  0.80180246], Average AUC 0.779779274628
  (0, 19754)	0.0914966332144
  (0, 19748)	0.214376364947
  (0, 19410)	0.0856966876108
  (0, 19408)	0.0661033809114
  (0, 19119)	0.12971508709
  (0, 19055)	0.206113410239
  (0, 18918)	0.0591095961859
  (0, 18740)	0.10154849922
  (0, 17941)	0.0645937249775
  (0, 17826)	0.200297420379
  (0, 17693)	0.0633754449272
  (0, 17685)	0.227999094047
  (0, 17093)	0.200297420379
  (0, 16504)	0.206113410239
  (0, 15539)	0.0908628294863
  (0, 15517)	0.119732817638
  (0, 14876)	0.140283086674
  (0, 12836)	0.184977411072
  (0, 12482)	0.103380439956
  (0, 12414)	0.0579550360933
  (0, 12398)	0.0703339155477
  (0, 12340)	0.0811703407166
  (0, 12181)	0.0978187340052
  (0, 12109)	0.107112081464
  (0, 12074)	0.1359691359
  :	:
  (3999, 10843)	0.164794056759
  (3999, 10116)	0.137685616249
  (3999, 9919)	0.0970967138043
  (3999, 9409)	0.118168853006
  (3999, 8534)	0.155674441906
  (3999, 8404)	0.0564377265488
  (3999, 8075)	0.200392529102


In [59]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    
    top_n_ids = np.argsort(row)[::-1][:top_n] 
    print top_n_ids.shape
    top_feats = [(features[i], row[i]) for i in top_n_ids]
    print top_n_ids
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df
    
foo =top_tfidf_feats(X[2], features)
print foo
    
def top_tfidf_feats_in_doc(Xtr, features, row_id, top_n=5):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(np.asarray(Xtr[row_id]).flatten())
    print row
    
    
    return top_tfidf_feats(row, features, top_n)


def top_mean_feats(Xtr, features, min_tfidf=0.01, top_n=5):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    D = Xtr.toarray()
    
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    
    return top_tfidf_feats_in_doc(tfidf_means, features, top_n)

#print top_tfidf_feats_in_doc(X,features,1)

#print top_mean_feats(X,features)



(1L,)
[0]
  feature                                              tfidf
0      00    (0, 19827)\t0.0833081581908\n  (0, 19777)\t0...
