In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import gzip
%matplotlib inline
from sklearn.svm import LinearSVC

In [2]:
class w2v():
    """Word-vector lookup table loaded from a GloVe-style text file.

    Each non-blank line of the file is read as a token followed by its
    space-separated float components. The vectors are held in the ``w2v``
    dict, keyed by token.
    """

    def __init__(self, path_to_vectors="/Users/joshuamalina/data/movies/glove.6B.50d.txt", dim=50):
        """Load the vectors at ``path_to_vectors`` and check their size.

        Args:
            path_to_vectors: path of the embedding text file.
            dim: expected number of components per vector; also the length
                of the zero vector returned by ``textToEmbMean`` when no
                token of a text is known.

        Raises:
            AssertionError: if a sampled vector does not have ``dim``
                components.
        """
        self.path = path_to_vectors
        self.dim = dim
        self.w2v = {}
        self.inflate()
        if self.w2v:
            # Sanity-check one vector. 'the' is used when available (as the
            # original check did); otherwise fall back to any loaded vector so
            # vocabularies without that token can still be validated.
            sample = self.w2v.get('the', next(iter(self.w2v.values())))
            assert dim == len(sample), \
                "expected %d-dimensional vectors, got %d" % (dim, len(sample))

    def inflate(self):
        """Populate ``self.w2v`` with ``token -> np.ndarray`` from ``self.path``."""
        with open(self.path, "r", encoding="utf-8") as f:
            # Iterate the handle directly so the whole file is never held in
            # memory as a list of lines.
            for line in f:
                parts = line.rstrip().split(' ')
                if len(parts) < 2:
                    # Blank line or a token with no components: nothing to store.
                    continue
                self.w2v[parts[0]] = np.array([float(x) for x in parts[1:]])

    def textToEmbMean(self, text):
        """Return the mean vector of the known tokens in ``text``.

        ``text`` is split on single spaces; tokens absent from the vocabulary
        are ignored. When no token is known, a zero vector of length
        ``self.dim`` is returned so the output size always matches the
        loaded embeddings.
        """
        tokens = text.split(' ')
        mapped = [self.w2v[w] for w in tokens if w in self.w2v]
        if mapped:
            return np.mean(mapped, axis=0)
        return np.zeros(self.dim)

In [7]:
class corpus():
    """Loads a file of one-record-per-line reviews into a DataFrame (``self.df``)."""

    def __init__(self, path='/Users/joshuamalina/data/movies/reviews.json'):
        """Read every record found at ``path`` and store the table as ``self.df``."""
        self.df = self.getDF(path)

    def parse(self, path):
        """Yield one evaluated record per line of the file at ``path``.

        The file is opened in a ``with`` block so the handle is closed once
        iteration finishes instead of being left to the garbage collector.
        """
        with open(path, 'r') as g:
            for l in g:
                # SECURITY: eval() executes whatever expression the line
                # contains. Only use this on files you trust; if the records
                # are plain literals, ast.literal_eval is a safer parser.
                yield eval(l)

    def getDF(self, path):
        """Build a DataFrame with one row per record, indexed 0..n-1."""
        records = dict(enumerate(self.parse(path)))
        return pd.DataFrame.from_dict(records, orient='index')

In [8]:
class data():
    """Pairs a review table with an embedder to expose features and labels.

    ``X`` holds one embedding per entry of the table's ``reviewText`` column
    and ``Y`` is the table's ``overall`` column.
    """

    def __init__(self, df, w2v_instance):
        """Store the inputs, then compute ``X`` and ``Y`` and check they align."""
        self.df = df
        self.w2v = w2v_instance
        self.X = self.getX()
        self.Y = self.getY()
        assert len(self.X) == len(self.Y)

    def getX(self, clean=lambda x: x):
        """Return a list with the embedder's output for each cleaned review.

        Args:
            clean: callable applied to every review text before it is passed
                to ``textToEmbMean``; the default leaves the text untouched.
        """
        embed = self.w2v.textToEmbMean
        return [embed(clean(review)) for review in self.df.reviewText]

    def getY(self):
        """Return the ``overall`` column of the table."""
        return self.df.overall

In [9]:
class model():
    """Bundles features ``X`` and labels ``Y`` with an estimator."""

    def __init__(self, X, Y, model=None):
        """Store the data and the estimator to train.

        Args:
            X: feature rows.
            Y: labels aligned with ``X``.
            model: object exposing ``fit(X, Y)``. When omitted, a fresh
                ``LinearSVC()`` is created for this instance. A default
                written as ``model=LinearSVC()`` in the signature would be
                evaluated once and shared by every wrapper.
        """
        self.X = X
        self.Y = Y
        self.model = LinearSVC() if model is None else model

    def trainTestSplit(self):
        """Create an 80/20 split (``random_state=0``) of ``X`` and ``Y``.

        The parts are stored as ``X_train``, ``X_test``, ``y_train`` and
        ``y_test``.
        """
        (self.X_train,
         self.X_test,
         self.y_train,
         self.y_test) = train_test_split(self.X, self.Y, test_size=0.2, random_state=0)

    def fit(self):
        """Fit the estimator on the full ``X`` and ``Y``.

        Note: this uses all of the data, whether or not ``trainTestSplit``
        has been called.
        """
        self.model.fit(self.X, self.Y)

In [10]:
# Load the embeddings and the reviews, then featurise the corpus.
w2v_ = w2v()
corpus_ = corpus()
data_ = data(corpus_.df, w2v_)
# data.__init__ has already computed the features and labels; reuse them
# instead of calling getX() again, which would re-embed every review.
X = data_.X
Y = data_.Y

In [11]:
# Wrap the data with the default estimator and fit it.
# model.fit() trains on all of X and Y; no train/test split is applied here.
model_ = model(X, Y)
model_.fit()

In [18]:
# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the
# standalone joblib package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [19]:
# Persist the estimator held by model_ to 'model.pkl' (a relative path).
joblib.dump(model_.model, 'model.pkl')

['model.pkl']

In [20]:
# Read the estimator back from 'model.pkl'.
# joblib.load unpickles its input, so only load files from a trusted source.
m = joblib.load('model.pkl')

In [21]:
# predict() expects a 2-D array of shape (n_samples, n_features); reshape the
# single embedding into a one-row matrix before passing it in.
m.predict(X[0].reshape(1, -1))



array([ 5.])