### Spacy Example

In [14]:
# Machine Learning imports
import nltk
import re
import pandas as pd
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

In [15]:
# Loading the dataset
df = pd.read_csv('../data/train_tweets.txt', sep="\t", header=None, names=["id", "tweet"])
df.head(3)

Unnamed: 0,id,tweet
0,8746,@handle Let's try and catch up live next week!
1,8746,Going to watch Grey's on the big screen - Thur...
2,8746,@handle My pleasure Patrick....hope you are well!


In [16]:
user_id = df['id'][df.id == 8746]
print('number of positve tagged sentences is:  {}'.format(len(user_id)))

number of positve tagged sentences is:  91


In [27]:
# get a word count per sentence column
def word_count(sentence):
    return len(sentence.split())
    
df['word_count'] = df['tweet'].apply(word_count)
df.head(3)

# # Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
    # stop_words = set(stopwords.words('english'))
# df['tweet_without_stopwords'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

Unnamed: 0,id,tweet,word_count,tokens
0,8746,@ let's try and catch up live next week!,9,"[@, let's, try, catch, live, next, week!]"
1,8746,going to watch grey's on the big screen - thur...,11,"[going, watch, grey's, big, screen, -, thursda..."
2,8746,@ my pleasure patrick....hope you are well!,7,"[@, pleasure, patrick....hope, well!]"


In [20]:
# Pre-processing the raw tweets
stop_words = set(stopwords.words('english'))
def processTweet(tweet):
    # Removing http links from the tweet
    tweet = re.sub(r"http\S+", "", tweet)
    # Removing 'handle' keyword from the tweets
    tweet = re.sub(r"handle", "", tweet)
    # To lowercase
    tweet = tweet.lower()
    # tweet = (lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    tweet = ''.join(c for c in tweet if c <= '\uFFFF') 
    return tweet

# clean dataframe's text column
df['tweet'] = df['tweet'].apply(processTweet)
# preview some cleaned tweets
df['tweet'].head()

0             @ let's try and catch up live next week!
1    going to watch grey's on the big screen - thur...
2          @ my pleasure patrick....hope you are well!
3    @ hi there! been traveling a lot and lots more...
4    rt @ looking to drink clean & go green? purcha...
Name: tweet, dtype: object

In [24]:
# most common words in twitter dataset
all_words = []
for line in list(df['tweet']):
    words = line.split()
    for word in words:
        all_words.append(word.lower())
Counter(all_words).most_common(10)

[('@', 141587),
 ('the', 125117),
 ('to', 98094),
 ('a', 77160),
 ('i', 70171),
 ('and', 54093),
 ('for', 52677),
 ('of', 51519),
 ('in', 50792),
 ('is', 45305)]

In [25]:
# tokenize helper function
def text_process(raw_text):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
#     # Check characters to see if they are in punctuation
#     nopunc = [char for char in list(raw_text) if char not in string.punctuation]
#     # Join the characters again to form the string.
#     nopunc = ''.join(nopunc)
    raw_text = ''.join(raw_text) 
    
    # Now just remove any stopwords
    return [word for word in raw_text.lower().split() if word.lower() not in stopwords.words('english')]

# def remove_words(word_list):
#     remove = ['paul','ryan','...','“','”','’','…','ryan’']
#     return [w for w in word_list if w not in remove]
# -------------------------------------------
# tokenize message column and create a column for tokens
df = df.copy()
df['tokens'] = df['tweet'].apply(text_process) # tokenize style 1
# df_paulry['no_pauls'] = df_paulry['tokens'].apply(remove_words) #tokenize style 2
df.head()

Unnamed: 0,id,tweet,word_count,tokens
0,8746,@ let's try and catch up live next week!,9,"[@, let's, try, catch, live, next, week!]"
1,8746,going to watch grey's on the big screen - thur...,11,"[going, watch, grey's, big, screen, -, thursda..."
2,8746,@ my pleasure patrick....hope you are well!,7,"[@, pleasure, patrick....hope, well!]"
3,8746,@ hi there! been traveling a lot and lots more...,27,"[@, hi, there!, traveling, lot, lots, come, ne..."
4,8746,rt @ looking to drink clean & go green? purcha...,19,"[rt, @, looking, drink, clean, &, go, green?, ..."


In [28]:
# vectorize
bow_transformer = CountVectorizer(analyzer=text_process).fit(df['tweet'])
# print total number of vocab words
print(len(bow_transformer.vocabulary_))

324183


In [31]:
# # example of vectorized text
# sample_tweet = df['tweet'][123]
# print(sample_tweet)
# print('\n')
# # vector representation
# bow_sample = bow_transformer.transform([sample_tweet])
# print(bow_sample)
# print('\n')

# transform the entire DataFrame of messages
messages_bow = bow_transformer.transform(df['tweet'])
# check out the bag-of-words counts for the entire corpus as a large sparse matrix
print('Shape of Sparse Matrix: ', messages_bow.shape)
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

Shape of Sparse Matrix:  (328195, 324183)
Amount of Non-Zero occurences:  2852457


In [32]:
# from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(messages_bow)

# to transform the entire bag-of-words corpus
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(328195, 324183)


In [1]:
# Run Train Data Through Pipeline analyzer=text_process
# uncomment below to train on a larger dataset but it's very slow for a slower machine.
#X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['id'], test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(df['tweet'][:50000], df['id'][:50000], test_size=0.2)

# create pipeline
pipeline = Pipeline([
    ('bow', CountVectorizer(strip_accents='ascii',
                            stop_words='english',
                            lowercase=True)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

# this is where we define the values for GridSearchCV to iterate over
parameters = {'bow__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'classifier__alpha': (1e-2, 1e-3),
             }

# do 10-fold cross validation for each of the 6 possible combinations of the above params
grid = GridSearchCV(pipeline, cv=10, param_grid=parameters, verbose=1)
grid.fit(X_train,y_train)

# summarize results
print("\nBest Model: %f using %s" % (grid.best_score_, grid.best_params_))
print('\n')
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("Mean: %f Stdev:(%f) with: %r" % (mean, stdev, param))

NameError: name 'train_test_split' is not defined

In [34]:
# save best model to current working directory
joblib.dump(grid, "twitter_sentiment.pkl")

# load from file and predict using the best configs found in the CV step
model_NB = joblib.load("twitter_sentiment.pkl" )

# get predictions from best model above
y_preds = model_NB.predict(X_test)
print('accuracy score: ',accuracy_score(y_test, y_preds))
print('\n')
print('confusion matrix: \n',confusion_matrix(y_test,y_preds))
print('\n')
print(classification_report(y_test, y_preds))

accuracy score:  0.443


confusion matrix: 
 [[8 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 6 0]
 [0 0 0 ... 0 0 9]]


              precision    recall  f1-score   support

          28       0.42      0.53      0.47        15
         483       1.00      0.20      0.33         5
         484       0.00      0.00      0.00         1
         564       0.33      0.30      0.32        10
         572       0.88      1.00      0.94        15
         578       1.00      0.40      0.57         5
         638       0.67      0.57      0.62         7
         643       1.00      0.22      0.36         9
         713       0.14      0.50      0.22         2
         766       0.33      0.17      0.22         6
         841       0.75      0.50      0.60         6
         852       0.31      0.36      0.33        14
         982       0.40      0.40      0.40         5
        1017       0.45      0.61      0.52        23
        1116       0.0

  'precision', 'predicted', average, warn_for)
