In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import cross_validation
from keras.utils import np_utils

Using Theano backend.


In [2]:
from textblob import Word

In [3]:
import pandas as pd
import numpy as np
import pickle

In [4]:
# Read the pickled ans_clean_text object from disk (binary mode).
with open('../Data/ans_clean_text.pickle', 'rb') as pickle_file:
    ans_clean_text = pickle.load(pickle_file)

In [23]:
# Build scikit-learn's TF-IDF vectorizer over word tokens. Tokenizer,
# preprocessor and stop-word handling are all left at None.
# NOTE(review): a float min_df is read by scikit-learn as a fraction of
# documents, so 0.1 keeps only terms present in at least 10% of the texts --
# confirm this is intended.
vectorizer = TfidfVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    min_df=0.1,
    max_features=8000,
)

# fit_transform() learns the vocabulary from the texts and turns them into
# feature vectors in a single call; its input should be a list of strings.
# The result is converted to a NumPy array right away because arrays are
# easy to work with.
data_features = vectorizer.fit_transform(ans_clean_text).toarray()

In [24]:
# Show the dimensions of the feature matrix built above. The parenthesised
# print form is used so the cell runs on Python 2 and Python 3 alike and
# matches the print(...) calls used for the metrics further down.
print(data_features.shape)
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print(vocab)

(74331, 5000)


In [12]:
# Read the pickled ans_quality object from disk (binary mode).
with open('../Data/ans_quality.pickle', 'rb') as pickle_file:
    ans_quality = pickle.load(pickle_file)

In [15]:
# Preview the first few entries of the loaded labels.
ans_quality.head()

0    awesome
1       good
2    awesome
3    awesome
4       good
Name: AnsQuality, dtype: category
Categories (5, object): [bad < neutral < satisfactory < good < awesome]

In [16]:
# Turn the class labels into integer codes, fitting and applying the
# encoder in one step.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(ans_quality)
# One-hot encode the integer codes (dummy variables).
dummy_y = np_utils.to_categorical(encoded_Y)

In [20]:
# Check the dimensions of the one-hot encoded labels.
dummy_y.shape

(74331, 5)

In [25]:
# Hold out 20% of the samples for validation. The labels passed here are the
# raw ans_quality values, not the encoded ones. A fixed seed keeps the split
# repeatable.
# NOTE(review): the split is not stratified -- confirm that is acceptable,
# since the class counts are very uneven.
seed = 1234
X_train, X_validation, Y_train, Y_validation = cross_validation.train_test_split(
    data_features,
    ans_quality,
    test_size=0.2,
    random_state=seed,
)

In [28]:
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees. The forest is a
# randomized estimator, so the seed already defined for the
# train/validation split is passed as random_state; without it every run
# would produce a different model and different scores.
forest = RandomForestClassifier(n_estimators=100, random_state=seed)

Training the random forest...


In [29]:
# Fit the forest to the training set, using the TF-IDF features as
# inputs and the answer-quality labels as the response variable.
# The result is assigned back to `forest` (fit returns the estimator),
# which also keeps the cell from echoing the estimator repr.
#
# This may take a few minutes to run
forest = forest.fit(X_train, Y_train)

In [31]:
# Use the fitted random forest to predict an answer-quality label for
# every sample in the validation set
predictions = forest.predict(X_validation)

In [30]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [32]:
# Print, in order: overall accuracy, the confusion matrix, and the
# per-class precision/recall/F1 report for the validation predictions.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))

0.472388511468
[[   3    0  204    3  123]
 [   0    0   26    2  150]
 [   2    0 2099   10 3045]
 [   0    0  402   19 2298]
 [   0    0 1550   29 4902]]
             precision    recall  f1-score   support

    awesome       0.60      0.01      0.02       333
        bad       0.00      0.00      0.00       178
       good       0.49      0.41      0.44      5156
    neutral       0.30      0.01      0.01      2719
satisfactory       0.47      0.76      0.58      6481

avg / total       0.44      0.47      0.41     14867



  'precision', 'predicted', average, warn_for)


In [33]:
# Display the array of predicted labels.
predictions

array(['satisfactory', 'satisfactory', 'satisfactory', ..., 'satisfactory',
       'satisfactory', 'satisfactory'], dtype=object)

In [37]:
from collections import Counter
# Tally how many times each label was predicted; Counter accepts any
# iterable, so the array is passed directly.
Counter(predictions)

Counter({'satisfactory': 10518, 'good': 4281, 'neutral': 63, 'awesome': 5})

In [38]:
# Tally the true labels of the validation set for comparison with the
# predicted label counts.
Counter(Y_validation)

Counter({'satisfactory': 6481, 'good': 5156, 'neutral': 2719, 'awesome': 333, 'bad': 178})