In [14]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

def review_to_words( review, remove_stopwords=False ):
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    words = review_text.lower().split()
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    stemmer = SnowballStemmer("english")
    words = [stemmer.stem(w) for w in words]
    
    return(words)

it is Abhishek's Evergreen model. It uses tfidf on the full dataset to vectorize the input words, then a Logisitic regression model to predict the output scores. CV/LB score ~ 0.95. If your computer doesn't have the RAM, limit the number of features in the TfidfVectorizer.

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import pandas as pd
import numpy as np

train = pd.read_csv('labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
test = pd.read_csv('testData.tsv', header=0, delimiter="\t", quoting=3 )
#unlabel = pd.read_csv('unlabeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

y = train["sentiment"]

print('vectorizing... ',)

tfv = TfidfVectorizer(min_df=100, analyzer=review_to_words)
tfv.fit(train['review']+test['review'])

X = tfv.transform(train['review'])
X_test = tfv.transform(test['review'])

print(len(tfv.get_feature_names()))

model = LogisticRegression(penalty='l2', dual=False, tol=0.0001,
                         C=1, fit_intercept=True, intercept_scaling=1.0, 
                         class_weight=None, random_state=None)
print ("20 Fold CV Score: ", np.mean(cross_validation.cross_val_score(model, X, y, cv=20, scoring='roc_auc')))

print ("Retrain on all training data, predicting test labels...\n")
model.fit(X,y)
result = model.predict(X_test)
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Use pandas to write the comma-separated output file
output.to_csv('LogisticRegression.csv', index=False, quoting=3)
print ("Wrote results to LogisticRegression.csv")

vectorizing... 
4933
20 Fold CV Score:  0.952827392
Retrain on all training data, predicting test labels...

Wrote results to LogisticRegression.csv
