In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import model_selection

In [2]:
raw = pd.read_csv("data/tweets.csv")

In [3]:
raw.head(5)

Unnamed: 0,Topic,Sentiment,TweetId,TweetDate,TweetText
0,apple,positive,126415614616154112,Tue Oct 18 21:53:25 +0000 2011,Now all @Apple has to do is get swype on the i...
1,apple,positive,126402758403305474,Tue Oct 18 21:02:20 +0000 2011,Hilarious @youtube video - guy does a duet wit...
2,apple,positive,126397179614068736,Tue Oct 18 20:40:10 +0000 2011,@RIM you made it too easy for me to switch to ...
3,apple,positive,126379685453119488,Tue Oct 18 19:30:39 +0000 2011,The 16 strangest things Siri has said so far. ...
4,apple,positive,126377656416612353,Tue Oct 18 19:22:35 +0000 2011,Great up close & personal event @Apple tonight...


## TFIDF + MNB (using movie review data as trainning data)

In [4]:
# split data to training set and testing set
def review_series_to_list(review_series):
    review_list=[]
    n_review = len(review_series)
    for i in range(0,n_review):
        review_list.append(review_series[i])
    return review_list  

train_review_list = review_series_to_list(raw['TweetText'])

X_train, X_test, y_train, y_test = train_test_split(
    train_review_list, raw['Sentiment'], test_size=0.33, random_state=42)

In [5]:
nb_model = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

nb_fit = nb_model.fit(X_train, y_train)

In [6]:
nb_model_bow = Pipeline([('vect', CountVectorizer()),
                    
                     ('clf', MultinomialNB()),
])

nb_fit_bow = nb_model_bow.fit(X_train, y_train)

In [7]:
nb_predicted2 = nb_model_bow.predict(X_test)
nb_accuracy2 = np.mean(nb_predicted2 == y_test) 

In [8]:
###Prediction and evaluation
nb_predicted = nb_model.predict(X_test)
nb_accuracy = np.mean(nb_predicted == y_test) 
print (nb_accuracy)
print(metrics.classification_report(y_test, nb_predicted)) 
metrics.confusion_matrix(y_test, nb_predicted)


0.697032436163
             precision    recall  f1-score   support

 irrelevant       0.98      0.77      0.86       460
   negative       0.00      0.00      0.00       184
    neutral       0.60      0.99      0.75       661
   positive       0.00      0.00      0.00       144

avg / total       0.59      0.70      0.62      1449



  'precision', 'predicted', average, warn_for)


array([[354,   0, 106,   0],
       [  0,   0, 184,   0],
       [  5,   0, 656,   0],
       [  1,   0, 143,   0]])

## TFIDF + logistics (using movie review data as trainning data)

In [9]:
from sklearn.linear_model import LogisticRegression as LR
lr_model = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LR()),
])

lr_fit = lr_model.fit(X_train, y_train)


In [10]:
lr_predicted = lr_model.predict(X_test)
lr_accuracy = np.mean(lr_predicted == y_test) 
print (lr_accuracy)
print(metrics.classification_report(y_test, lr_predicted))
metrics.confusion_matrix(y_test, lr_predicted)

0.730158730159
             precision    recall  f1-score   support

 irrelevant       0.94      0.81      0.87       460
   negative       0.73      0.19      0.30       184
    neutral       0.64      0.96      0.77       661
   positive       0.88      0.10      0.19       144

avg / total       0.77      0.73      0.68      1449



array([[371,   2,  87,   0],
       [  5,  35, 144,   0],
       [ 13,   9, 637,   2],
       [  4,   2, 123,  15]])

In [11]:
lrl2_model = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LR(penalty = 'l2', dual = True, random_state = 0)),
])

lrl2_fit = lrl2_model.fit(X_train, y_train)


In [12]:
lrl2_predicted = lrl2_model.predict(X_test)
lrl2_accuracy = np.mean(lrl2_predicted == y_test) 
print (lrl2_accuracy)
print(metrics.classification_report(y_test, lr_predicted))
metrics.confusion_matrix(y_test, lr_predicted)

0.730158730159
             precision    recall  f1-score   support

 irrelevant       0.94      0.81      0.87       460
   negative       0.73      0.19      0.30       184
    neutral       0.64      0.96      0.77       661
   positive       0.88      0.10      0.19       144

avg / total       0.77      0.73      0.68      1449



array([[371,   2,  87,   0],
       [  5,  35, 144,   0],
       [ 13,   9, 637,   2],
       [  4,   2, 123,  15]])

In [13]:
scores5 = model_selection.cross_val_score(lrl2_fit, X_train, y_train, cv=5)
scores5   

array([ 0.73728814,  0.73174873,  0.71938776,  0.75298126,  0.76109215])

### Grid search 

In [14]:
grid_values = {'C':[30]}

In [15]:
from sklearn.grid_search import GridSearchCV
model_LR = GridSearchCV(LR(penalty = 'l2', dual = True, random_state = 0), 
                        grid_values, scoring = 'roc_auc', cv = 20) 



## BOF + MNB

In [16]:
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review) #revew_text 
    
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return(" ".join(meaningful_words ))   

In [17]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords # Import the stop word list
clean_review = review_to_words(raw["TweetText"][0] )



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [19]:
# Get the number of reviews based on the dataframe column size
num_reviews = raw["TweetText"].size

# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
for i in range( 0, num_reviews ):
    # Call our function for each one, and add the result to the list of
    # clean reviews
    clean_train_reviews.append( review_to_words( raw["TweetText"][i] ) )



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [20]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

In [21]:
train_data_features = vectorizer.fit_transform(clean_train_reviews)

In [22]:
train_data_features = train_data_features.toarray()

In [23]:
vocab = vectorizer.get_feature_names()

In [24]:
train_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [25]:
from sklearn.model_selection import cross_val_score
NB_bow = MultinomialNB()
NB_bow_fit = NB_bow.fit( train_data_features, raw["Sentiment"] )
scores5 = cross_val_score(NB_bow_fit, train_data_features, raw["Sentiment"], cv=5)
scores5   

array([ 0.42775882,  0.65642776,  0.71070615,  0.70125428,  0.67465753])

In [26]:
scores10 = cross_val_score(NB_bow_fit, train_data_features, raw["Sentiment"], cv=10)
scores10

array([ 0.54090909,  0.50681818,  0.71590909,  0.78181818,  0.69020501,
        0.74715262,  0.68792711,  0.73972603,  0.69107551,  0.75286041])

In [27]:
print("Accuracy 5cv : %0.2f (+/- %0.2f)" % (scores5.mean(), scores5.std() * 2))
print("Accuracy 10cv : %0.2f (+/- %0.2f)" % (scores10.mean(), scores10.std() * 2))

Accuracy 5cv : 0.63 (+/- 0.21)
Accuracy 10cv : 0.69 (+/- 0.17)


## BOF + Logistics

In [28]:
LR_bow_fit = LR()
LR_bow_fit = LR_bow_fit.fit( train_data_features, raw["Sentiment"] )
scores5 = cross_val_score(LR_bow_fit, train_data_features, raw["Sentiment"], cv=5)
scores5   

array([ 0.47554039,  0.67007964,  0.69362187,  0.69897377,  0.6826484 ])

In [29]:
scores10 = cross_val_score(LR_bow_fit, train_data_features, raw["Sentiment"], cv=10)
scores10

array([ 0.60681818,  0.58636364,  0.74090909,  0.71818182,  0.67653759,
        0.74943052,  0.71070615,  0.7283105 ,  0.73913043,  0.75057208])

In [30]:
print("Accuracy 5cv : %0.2f (+/- %0.2f)" % (scores5.mean(), scores5.std() * 2))
print("Accuracy 10cv : %0.2f (+/- %0.2f)" % (scores10.mean(), scores10.std() * 2))

Accuracy 5cv : 0.64 (+/- 0.17)
Accuracy 10cv : 0.70 (+/- 0.11)
