In [None]:
%matlabplot 

In [None]:
lsmagic

In [None]:
%%latex
$e^{-(x-\mu)^2/2\sigma^2}$

In [1]:
import pandas as pd
import numpy as np
from numpy import linalg
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
import math
import matplotlib.pyplot as plt
from sklearn import metrics
from time import time
from scipy.spatial.distance import euclidean, hamming
from sklearn import cross_validation,linear_model

def logistic_regression(train_data, train_labels, test_data, test_labels):
   ##Logistic Regression
   cls = linear_model.LogisticRegression()
   cls.fit(train_data, train_labels)
   model_score = cls.score(test_data, test_labels)
   model_predictions = cls.predict(test_data)
   print("\nAccuracy of logistic regression in predicting sentiments: "+str(model_score))
   print("printing confusion matrix of Logistic Regression")
   print(metrics.confusion_matrix(test_labels, model_predictions))

stops = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

def finite_df(f):
    df = pd.read_table(f, header=None)
    return df[np.isfinite(df[1])]
def lwnl(y):
    res = []
    for x in y:
        try:
            res.append(wnl.lemmatize(x.encode('ascii', 'ignore'), 'v'))
        except UnicodeDecodeError:
            break
    return res

            
def preprocess(df):
    df[0] = df[0].str.strip().str.lower()  # simple cleanup
    lascii = lambda y : all(ord(c) < 128 for c in y)
    #lwnl = lambda y: [try wnl.lemmatize(x, 'v') for x in y] # lemmatize lambda
    lstop = lambda y : [x for x in y if x not in stops] # removing stops
    df[2] = df[0].map(tokenizer.tokenize)  # tokenize and storing in a new col
    df[2] = df[2].map(lstop)
    df[2] = df[2].map(lwnl)

def split_train_test(df):
    df1 = df.loc[df[1] == 1]
    df0 = df.loc[df[1] == 0]
    return pd.concat([df1[:400], df0[:400]]), pd.concat([df1[400:], df0[400:]])

# x is the row of df
def fv_update(x, words, test_words):
    for w in x[2]:
        try:
            i = words.index(w)
            x['fv'][i] = x['fv'][i] + 1
        except ValueError, KeyError:
            test_words.update([w])

            
def bow(df, words, test_words):
    df['fv'] = np.zeros((len(df), len(words)), dtype=np.int8).tolist()
    df.apply(lambda x : fv_update(x, words, test_words), axis=1)

        
def norm (df):
    df['log'] = df['fv']
    df['log'] = df['log'].apply(lambda x: [math.log(y+1) for y in x]) # copy the col to new index
    df['l1'] = df['fv']
    df['l1'] = df['l1'].apply(lambda x: np.array(x)/linalg.norm(x, ord=1))
    df['l2'] = df['fv']
    df['l2'] = df['l2'].apply(lambda x: np.array(x)/linalg.norm(x, ord=2))
    df['std'] = df['fv']
    df['std'] = df['std'].apply(lambda x: (np.array(x) - np.mean(x))/np.var(x))

def numpy_to_df(df, idx):
    npa = []
    df[idx].apply(lambda x: npa.append(x))
    return pd.DataFrame(npa)
    
def kmeans_score(estimator, data, labels, name):
    t0 = time()
    estimator.fit(data)
    # Taken from scikit-learn digits k-means classification
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=len(data))))
    print "Dissimilarity measure "
    print (hamming(np.array(labels), np.array(estimator.labels_))) 

if __name__ == '__main__':
    wnl = WordNetLemmatizer()
    amazon_df = finite_df('sentiment_data//amazon_cells_labelled.txt')
    imdb_df   = finite_df('sentiment_data//imdb_labelled.txt')
    yelp_df   = finite_df('sentiment_data//yelp_labelled.txt')
    df_dict = {'Amazon': amazon_df, 'IMDB':imdb_df, 'Yelp':yelp_df}
    dfs = [amazon_df, imdb_df, yelp_df]
    
    print "Ratio of labels by taking #0/#Total and #1/#total "
    print [df[1].value_counts(normalize=True, ascending=True) for df in dfs]

    [preprocess(x) for x in dfs]   
    
    # train_test_dfs : tuple of ((train, test)...)
    # creating train_dfs, test_dfs views of the data
    train_test_dfs = [split_train_test(x) for x in dfs]
    # collating the train and test together
    train_dfs, test_dfs = (pd.concat([train_test_dfs[0][0], train_test_dfs[1][0], train_test_dfs[0][0]]
                                     , copy=False, ignore_index=True),
                          pd.concat([train_test_dfs[0][1], train_test_dfs[1][1], train_test_dfs[0][1]]
                                    , copy=False, ignore_index=True))

    train_random_dfs = train_dfs.reindex(np.random.permutation(train_dfs.index))
    
    # Loop over all the training data and build a set of words which
    # represents the feature 
    dict_words = set()
    test_words = set()
    train_dfs[2].map(lambda x : dict_words.update(x))
    test_dfs[2].map(lambda x : test_words.update(x))
    #print "Train shape " + str(train_dfs.shape)
    #print "Test shape " + str(test_dfs.shape)
    
    # create feature vectors with word frequency using the index from words
    words = list(dict_words)
    #print "Rd d is " + str(len(words))
    check_test_words = set() # these are the unique words in test not in training set
    bow(train_dfs, words, check_test_words)
    bow(test_dfs, words, check_test_words)
    
    #print ((test_words - dict_words) >= check_test_words) and ((test_words - dict_words) <= check_test_words)
    print "Review 1"
    print [(i,e) for i, e in enumerate(train_dfs['fv'][0]) if e != 0]
    print "Review 2"
    print [(i,e) for i, e in enumerate(train_dfs['fv'][1]) if e != 0]
    
    norm(train_dfs)
    norm(test_dfs)
    
    print train_dfs[:3].head()


Ratio of labels by taking #0/#Total and #1/#total 
[0    0.5
1    0.5
Name: 1, dtype: float64, 0    0.483957
1    0.516043
Name: 1, dtype: float64, 0    0.5
1    0.5
Name: 1, dtype: float64]
Review 1
[(142, 1), (1669, 1), (1898, 1), (3072, 1)]
Review 2
[(2583, 1), (3357, 1)]
                             0  1                               2  \
0  good case, excellent value.  1  [good, case, excellent, value]   
1       great for the jawbone.  1                [great, jawbone]   
2            the mic is great.  1                    [mic, great]   

                                                  fv  \
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   

                                                 log  \
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  

In [2]:
    df_train_std = numpy_to_df(train_dfs, 'std')
    df_test_std = numpy_to_df(test_dfs, 'std')
    df_train_log = numpy_to_df(train_dfs, 'log')
    df_test_log = numpy_to_df(test_dfs, 'log')
    df_train_l1 = numpy_to_df(train_dfs, 'l1')
    df_test_l1 = numpy_to_df(test_dfs, 'l1')
    df_train_l2 = numpy_to_df(train_dfs, 'l2')
    df_test_l2 = numpy_to_df(test_dfs, 'l2')
    
    #kmeans_score(KMeans(n_clusters=2, n_init=100), df_train_std, train_dfs[1], 'scikit-kmeans')
    #kmeans_score(KMeans(n_clusters=2, n_init=100), df_train_log, train_dfs[1], 'scikit-kmeans')
    #kmeans_score(KMeans(n_clusters=2, n_init=100), df_train_l1, train_dfs[1], 'scikit-kmeans')
    #kmeans_score(KMeans(n_clusters=2, n_init=100), df_train_l2, train_dfs[1], 'scikit-kmeans')
    #logistic_regression(df_train_log, train_dfs[1], df_test_log, test_dfs[1])


In [26]:
def helper (x, clusters, means):
    idx = np.argmin([euclidean(x,y) for y in means])
    clusters[idx].append([x])
    return idx

def kmeans(data, labels, k):
    # Randomly select the cluster means
    means = data.sample(k).values
    clusters = {}
    cluster_assignments = pd.Series(labels)
    indices = pd.Series(np.zeros(len(data)))
    cnt = 10
    #while(not all(cluster_assignments.values == indices.values)):
    while (cnt > 0):
        cluster_assignments = indices
        for i,v in enumerate(means):
            clusters[i] = []
        indices = data.apply(
            lambda x : helper(x.values, clusters, means), axis = 1)
        means = [np.mean(np.array(v), axis=0) ]
        for i, v in clusters.iteritems():
            
        print len(means[0])
        cnt = cnt -1
    for i, v in clusters.iteritems():
        print i, len(v)
    print cluster_assignments
    print hamming(cluster_assignments.values, labels)
    
kmeans(df_train_std.sample(10), train_dfs[1].sample(10), 2)
#kmeans(pd.DataFrame([[1,2], [1,1], [2,1], [2,2]]), [0, 0, 1, 1], 2)

1
1


ValueError: ('array must not contain infs or NaNs', u'occurred at index 52')