### A simpple stentiment prototype

Source: https://github.com/williamsmj/sentiment/blob/master/sentiment.ipynb

In [None]:
import os #manipulate paths
import pandas as pd
import joblib

In [None]:
DataDir = 'Data/trainingandtestdata/'
training_csv_file = os.path.join(DataDir, 'testdata.manual.2009.06.14.csv')

In [None]:
# A peek at the data

In [None]:
names = ('polarity', 'id', 'date', 'query', 'author', 'text')
df = pd.read_csv(training_csv_file, encoding='latin1', names=names)

In [None]:
pd.options.display.max_colwidth = 140
df.head()

In [None]:
df.tail()

In [None]:
df['polarity'].replace({0:-1, 4:1}, inplace=True)
text = df['text']
target = df['polarity'].values

In [None]:
print(len(target), len(text))

### Traing the model
Set 20% of the data aside to test the trained model

In [None]:
from sklearn.cross_validation import train_test_split

text_train, text_validation, target_train, target_validation = (
    train_test_split(text, target, test_size=0.2, random_state=42)
)

Build a pipeline

In [62]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline

vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=100000)
feature_selector = SelectKBest(chi2, k=5000)
classifier = LogisticRegressionCV(n_jobs=4)

In [72]:
if os.path.exists('model.pkl'):
    sentiment_pipeline = joblib.load('model.pkl')
else:
    sentiment_pipeline = Pipeline((
        ('v', vectorizer),
        ('f', feature_selector),
        ('c', classifier)
    ))
    sentiment_pipeline.fit(X=text_train, y=target_train)
    joblib.dump(sentiment_pipeline, 'model.pkl');

TypeError: 'tuple' object does not support item assignment

Sklearn Pipeline Fix for this Bug: https://github.com/scikit-learn/scikit-learn/pull/9221

In [73]:
pd.__version__

'0.20.3'

In [65]:
vectorizer.dtype

numpy.int64

In [66]:
feature_selector.dtype

AttributeError: 'SelectKBest' object has no attribute 'dtype'

### Test the model

In [None]:
print(sentiment_pipeline.predict(['bad', 'good', "didnt like", "today was a good day", "i hate this product"]))

In [None]:
for text, target in zip(text_validation[:10], target_validation[:10]):
    print(sentiment_pipeline.predict([text])[0], target, '\t', text)

In [None]:
sentiment_pipeline.score(text_validation, target_validation)

### What did the model learn?

In [None]:
feature_names = sentiment_pipeline.steps[0][1].get_feature_names()
feature_names = [feature_names[i] for i in 
                 sentiment_pipeline.steps[1][1].get_support(indices=True)]

def show_most_informative_features(feature_names, clf, n=1000):
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [None]:
show_most_informative_features(feature_names, sentiment_pipeline.steps[2][1], n=500)