In [1]:
# use natural language toolkit
import nltk
from nltk.stem.lancaster import LancasterStemmer
import os
import json
import datetime
import re
import csv
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score
import nltk.classify.util
import pandas as pd
from pattern.en import sentiment


from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.cross_validation import train_test_split 

stemmer = LancasterStemmer()



In [2]:
#initialize stopWords
stopWords = []

def replaceTwoOrMore(s):
    #look for 2 or more repetitions of character and replace with the character itself
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)

def getStopWordList(stopWordListFileName):
    #read the stopwords file and build a list
    stopWords = []
    stopWords.append('USER')
    stopWords.append('URL')

    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords

def processTweet(tweet):
    # process the tweets

    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','AT_USER',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    #print ("pprocesstweet"," ", tweet)
    #trim
    tweet = tweet.strip('\'"')
    
    return tweet

def getFeatureVector(tweet):
    featureVector = []
    #split tweet into words
    words = tweet.split()
    for w in words:
        #replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if the word stats with an alphabet
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        #ignore if it is a stop word
        if(w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector

def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features

stopWords = getStopWordList('stopwords.txt')

In [3]:
xl = pd.ExcelFile('aws_tweets_data_cleaned.xls')
inpTweets = xl.parse("aws_tweets_data")

featureList = []
features = inpTweets.iloc[:,0:]
# Get tweet words
training_data = []
for row in (features.values):
    tweet = row[0]
    sentiment = row[1]
    training_data.append({"class":sentiment, "sentence":tweet})
print ("%s sentences in training data" % len(training_data))

1781 sentences in training data


In [4]:
words = []
classes = []
documents = []
ignore_words = ['?']
# loop through each sentence in our training data
for pattern in training_data:
    
    tweet = pattern['sentence']
    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','AT_USER',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #Remove numbers
    tweet = re.sub('[\d]+', ' ', tweet)
    #trim
    tweet = tweet.strip('\'"')
    
    
    # tokenize each word in the sentence
    tokens = nltk.word_tokenize(tweet)
    tok = []
    
    for w in tokens:
        #replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if the word stats with an alphabet
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        #ignore if it is a stop word
        if(w in stopWords or val is None):
            continue
        else:
            tok.append(w)
     
    #add to our words list
    words.extend(tok)
    # add to documents in our corpus
    documents.append((tok, pattern['class']))
    # add to our classes list
    if pattern['class'] not in classes:
        classes.append(pattern['class'])

# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = list(set(words))

# remove duplicates
classes = list(set(classes))

print (len(documents), "documents")
print (len(classes), "classes", classes)
print (len(words), "unique stemmed words", words[:10])


1781 documents
3 classes [u'positive', u'neutral', u'negative']
1438 unique stemmed words [u'demand', u'frontp', u'symantecvid', u'spel', u'swap', u'sorry', u'spec', u'digit', u'sapphirenow', u'every']


In [5]:
# append input data specific adjectives
positives = pd.read_table("positives.txt")
print (positives.shape)
negatives = pd.read_table("negatives.txt")
print (negatives.shape)

print (type(positives))
doc=[]
for i in range(1,len(positives)):
    doc.append((positives.iloc[i][0],"positive"))

for i in range(1,len(negatives)):
    doc.append((negatives.iloc[i][0],"negative"))

print (doc[:10])

(92, 1)
(16, 1)
<class 'pandas.core.frame.DataFrame'>
[('wow', 'positive'), ('new', 'positive'), ('announce', 'positive'), ('annoncing', 'positive'), ('hugs', 'positive'), ('excited', 'positive'), ('supercharge', 'positive'), ('strengthen', 'positive'), ('great', 'positive'), ('good', 'positive')]


In [6]:
# adding positive and negative keywords
documents.extend(doc)
words.extend(positives)

print (len(documents), "documents")
print (len(words), "words")
print (len(positives), "positives")
words[-5:]

1887 documents
1439 words
92 positives


[u'internetofth', u'emerg', u'cws', u'sysct', 'acquires']

In [7]:
# create our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)

# training set, bag of words for each sentence
for doc in documents:
    # initialize our bag of words
    bag = []
    # list of tokenized words for the pattern
    pattern_words = doc[0]
    # stem each word
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    # create our bag of words array
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)

    training.append(bag)
    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    output.append(output_row)



In [8]:

# sample training/output
i = 5
w = documents[i][0]
print ([stemmer.stem(word.lower()) for word in w])
print (training[i])
print (output[i])

[u'azurehelp', u'ie', u'fail', u'javascrib', u'mvc', u'az', u'deploy', u'url']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
# to strip u character
STRIPUNICODEChar=0
if STRIPUNICODEChar:
    my_list = documents
    upddocs=[]

    for tokens in my_list:
        asciitokens = []
        if (len(tokens)> 1):
            for tok in iter(tokens[0]):
                asciitoks = tok.encode('ascii')
                asciitokens.append(asciitoks)
        else:
            asciitoks = token.encode('ascii')
            asciitokens.append(asciitoks)

        upddocs.append((asciitokens, [tokens[1].encode('ascii')]))

    print (len(upddocs))
    documents = upddocs

In [10]:
#trial purpose only
tdata = (np.array([x[0] for x in documents]))
target = (np.array([x[1] for x in documents]))

X_train ,x_test, Y_train, y_test = train_test_split(tdata,target,test_size=0.2)
print (type(X_train))
print (X_train.shape)
print (documents[:10])
print (tdata)
print (tdata.shape)

<type 'numpy.ndarray'>
(1509L,)
[([u'microsoft', u'azure', u'kajaani', u'lecture', u'create', u'datacenters', u'url'], u'neutral'), ([u'sample', u'may', u'th', u'delete', u'row', u'windows', u'azure', u'table', u'storage', u'without', u'retrieving', u'first', u'url', u'microsoft'], u'positive'), ([u'anybody', u'know', u'much', u'microsoft', u'azure', u'cloud', u'computing', u'cloudserver'], u'positive'), ([u'rt', u'put', u'strategy', u'internet', u'things', u'place', u'microsoft', u'azure', u'intelligent', u'system', u'service', u'url'], u'positive'), ([u'looks', u'like', u'excellent', u'sessions', u'allot', u'azure', u'devops', u'goodness', u'coming'], u'positive'), ([u'azurehelp', u'ie', u'fails', u'javascripts', u'mvc', u'azure', u'deployment', u'url'], u'negative'), ([u'moving', u'hybrid', u'cloud', u'microsoft', u'azure', u'url'], u'neutral'), ([u'update', u'microsoft', u'powershell', u'ise', u'script', u'browser', u'script', u'analyzer', u'url', u'available', u'sysctr', u'azure',

In [11]:
df = pd.DataFrame(documents, columns=('text','class'))
print (df.shape)
tfidf_vect= TfidfVectorizer(  use_idf=True, smooth_idf=True, sublinear_tf=False)

type(df['text'])
#X = tfidf_vect.fit_transform(str(x) for x in df['text'].values)
X = tfidf_vect.fit_transform(raw_documents=(str(x) for x in df['text'].values))
y = df['class'].values
print (tfidf_vect)
print(df['text'].values)

#feature_names = tfidf_vect.get_featuree_names()
#print(feature_names)

#data_mat=pd.DataFrame(dense,columns=feature_names)
#print(data_mat)

(1887, 2)
TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
[ [u'microsoft', u'azure', u'kajaani', u'lecture', u'create', u'datacenters', u'url']
 [u'sample', u'may', u'th', u'delete', u'row', u'windows', u'azure', u'table', u'storage', u'without', u'retrieving', u'first', u'url', u'microsoft']
 [u'anybody', u'know', u'much', u'microsoft', u'azure', u'cloud', u'computing', u'cloudserver']
 ..., 'locks' 'losing' 'issues']


In [12]:
df = pd.DataFrame(documents, columns=('text','class'))
print (df.shape)
tfidf_vect= TfidfVectorizer(  use_idf=True, smooth_idf=True, sublinear_tf=False)

type(df['text'])

X = tfidf_vect.fit_transform(str(x) for x in df['text'].values)
y = df['class'].values


a_train, a_test, b_train, b_test = train_test_split(X, y, test_size=0.33, random_state=42)

print(a_train.shape)
print (b_train.shape)
print (a_test.shape)
#print (b_test.shape)
print (type(a_train))
rf_classifier = RandomForestClassifier(n_estimators=100) #, max_features=69)
rf_classifier = rf_classifier.fit(a_train.toarray(), b_train)

(1887, 2)
(1264, 1860)
(1264L,)
(623, 1860)
<class 'scipy.sparse.csr.csr_matrix'>


In [29]:
from sklearn.naive_bayes import MultinomialNB
clf_train = MultinomialNB().fit(a_train.toarray(), b_train)
pred_nb=clf_train.predict(a_test.toarray())
prediction_nb=confusion_matrix(b_test,pred_nb)

In [30]:
#import sklearn.metrics as metrics
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report
#print ('\nscore:', classifier.score(a_train, b_test))

print ('\naccuracy:', accuracy_score(b_test, pred_nb))
print ('\nprecision:', precision_score(b_test, pred_nb, average='weighted'))
print ('\nrecall:', recall_score(b_test, pred_nb, average = 'weighted'))
print ('\n confussion matrix:\n',confusion_matrix(b_test, pred_nb))
print ('\n clasification report:\n', classification_report(b_test, pred_nb))


accuracy: 0.84430176565

precision: 0.856003739962

recall: 0.84430176565

 confussion matrix:
 [[ 21  24  11]
 [  0 294  19]
 [  0  43 211]]

 clasification report:
              precision    recall  f1-score   support

   negative       1.00      0.38      0.55        56
    neutral       0.81      0.94      0.87       313
   positive       0.88      0.83      0.85       254

avg / total       0.86      0.84      0.83       623



In [28]:

prediction = rf_classifier.predict(a_test.toarray()) 

#import sklearn.metrics as metrics
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report
#print ('\nscore:', classifier.score(a_train, b_test))

print ('\naccuracy:', accuracy_score(b_test, prediction))
print ('\nprecision:', precision_score(b_test, prediction, average='weighted'))
print ('\nrecall:', recall_score(b_test, prediction, average = 'weighted'))
print ('\n confussion matrix:\n',confusion_matrix(b_test, prediction))
print ('\n clasification report:\n', classification_report(b_test, prediction))


accuracy: 0.898876404494

precision: 0.90046609342

recall: 0.898876404494

 confussion matrix:
 [[ 37   8  11]
 [  0 299  14]
 [  2  28 224]]

 clasification report:
              precision    recall  f1-score   support

   negative       0.95      0.66      0.78        56
    neutral       0.89      0.96      0.92       313
   positive       0.90      0.88      0.89       254

avg / total       0.90      0.90      0.90       623



In [36]:
# Test the classifier
testTweet = ["@Microsoft has case study on Azure with insofe","Insofe reported issues with Azure", 
             "Azure has several issues", "Azure is losing ground with AWS", 
             "Azure training session is being conducted in Hyderabad"]

testlabels = ['positive', 'negative', 'negative', 'negative', 'neutral']
predicted=[]
print (type(testTweet))
print (len(testTweet))
    
df = pd.DataFrame(testTweet, columns=['text']) #[x for x in iter(testTweet)]
#print (type (features))

# Get tweet words
tweets = []
for i in range(len(df)):
    tweet = df['text'][i]
    processedTweet = processTweet(tweet)
    #print("processedTweet",processedTweet)
    featureVector = getFeatureVector(processedTweet)
    #print ("featureVector", featureVector)
    tweets.append((featureVector));
#end loop

print ("tweets", tweets)


X = tfidf_vect.transform([str(x) for x in tweets])
predicted = rf_classifier.predict(X.toarray())


print ("predicted values ", predicted)
print ("testlables are ", testlabels)

<type 'list'>
5
tweets [['case', 'study', 'azure', 'insofe'], ['insofe', 'reported', 'issues', 'azure'], ['azure', 'several', 'issues'], ['azure', 'losing', 'ground', 'aws'], ['azure', 'training', 'session', 'conducted', 'hyderabad']]
predicted values  [u'neutral' u'neutral' u'neutral' u'neutral' u'positive']
testlables are  ['positive', 'negative', 'negative', 'negative', 'neutral']


In [37]:
#import sklearn.metrics as metrics
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report

print("on some random text")
print ('\naccuracy:', accuracy_score(testlabels, predicted))
print ('\nprecision:', precision_score(testlabels, predicted, average='weighted'))
print ('\nrecall:', recall_score(testlabels, predicted, average = 'weighted'))
print ('\n confussion matrix:\n',confusion_matrix(testlabels, predicted))
print ('\n clasification report:\n', classification_report(testlabels, predicted))

on some random text

accuracy: 0.0

precision: 0.0

recall: 0.0

 confussion matrix:
 [[0 3 0]
 [0 0 1]
 [0 1 0]]

 clasification report:
              precision    recall  f1-score   support

   negative       0.00      0.00      0.00         3
    neutral       0.00      0.00      0.00         1
   positive       0.00      0.00      0.00         1

avg / total       0.00      0.00      0.00         5

