## The following 4 cells include function definitions needed for tweet preprocessing

In [1]:
import re
#start process_tweet
def processTweet(tweet):
    # process the tweets
    
    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','AT_USER',tweet)    
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet

#end 

In [2]:
#start getStopWordList
def getStopWordList(stopWordListFileName):
    #read the stopwords
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('URL')

    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords
#end

In [3]:
#start replaceTwoOrMore
def replaceTwoOrMore(s):
    #look for 2 or more repetitions of character
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL) 
    return pattern.sub(r"\1\1", s)
#end

In [4]:
def getFeatureVector(tweet, stopWords):
    featureVector = []  
    words = tweet.split()
    for w in words:
        #replace two or more with two occurrences 
        w = replaceTwoOrMore(w) 
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if it consists of only words
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$", w)
        #ignore if it is a stopWord
        if(w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector    


#end

## We're importing a labeled dataset to train the classifier.

In [5]:
import pandas as pd
import numpy as np

# This piece of the code imports a labeled dataset to be used to train the classifier. 

filepath = '/Users/ausubo/karma/insight/project/twitter/sentiment_topics_twitter/data/debate08_sentiment_tweets.csv'
debate_tweets = pd.read_csv(filepath)
#Eliminating tweets classified as neutral or irrelevant.

debate_tweets = debate_tweets[debate_tweets['rating.1'] != 4]
debate_tweets = debate_tweets[debate_tweets['rating.1'] != 3]

training_tweets = debate_tweets

In [6]:
training_tweets.head()

Unnamed: 0,tweet.id,pub.date.GMT,content,author.name,author.nickname,rating.1,rating.2,rating.3,rating.4,rating.5,rating.6,rating.7,rating.8
0,936469851,9/27/08 1:01,Watching by myself #tweetdebate Not drinking ...,drgilpin,Dawn Gilpin,1,1,4,,,,,
1,936470432,9/27/08 1:02,"@ahg3 @MichDot Yeah, slime was actually my sec...",starweaver,,1,1,1,,,,,
2,936472030,9/27/08 1:03,Preparing to have a heart attack #tweetdebate,kyeung808,Ken Yeung,1,1,1,1.0,,,,
3,936472042,9/27/08 1:03,"no debate moderators under 50, sorry #tweetde...",rebot,,1,1,4,1.0,,,,
4,936472907,9/27/08 1:03,@current Now staring at black screen on http:/...,Karoli,Karoli,1,4,1,,,,,


In [7]:
len(training_tweets)

2208

In [8]:
training_tweets.head()

Unnamed: 0,tweet.id,pub.date.GMT,content,author.name,author.nickname,rating.1,rating.2,rating.3,rating.4,rating.5,rating.6,rating.7,rating.8
0,936469851,9/27/08 1:01,Watching by myself #tweetdebate Not drinking ...,drgilpin,Dawn Gilpin,1,1,4,,,,,
1,936470432,9/27/08 1:02,"@ahg3 @MichDot Yeah, slime was actually my sec...",starweaver,,1,1,1,,,,,
2,936472030,9/27/08 1:03,Preparing to have a heart attack #tweetdebate,kyeung808,Ken Yeung,1,1,1,1.0,,,,
3,936472042,9/27/08 1:03,"no debate moderators under 50, sorry #tweetde...",rebot,,1,1,4,1.0,,,,
4,936472907,9/27/08 1:03,@current Now staring at black screen on http:/...,Karoli,Karoli,1,4,1,,,,,


And we're splitting the dataset into 2 pieces to use the labeled data to train & test the classifier model.

In [9]:
msk = np.random.rand(len(training_tweets)) < 0.7
train = training_tweets[msk]
test = training_tweets[~msk]

In [10]:
from nltk.corpus import stopwords

featureList = []
tweets = [] 

stop_words = ['RT']
#stopWords = getStopWordList('data/feature_list/stopwords.txt') + stop_words
stopWords = list(str(stopwords.words('english'))) + stop_words
# check this ^

for row in train.itertuples():
    sentiment = row[6]
    tweet = row[3]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet, stopWords)
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment));
    
#tweets is only the train labeled tweets
print 'Length of labeled dataset used to train the model:', len(tweets)
print 
print 'Length of the complete labeled dataset:', len(training_tweets)

Length of labeled dataset used to train the model: 1558

Length of the complete labeled dataset: 2208


In [11]:
#start extract_features
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
#end

In [12]:
len(featureList)

19156

In [13]:
# Remove featureList duplicates
featureList = list(set(featureList))

len(featureList)

3220

Here is the documentation for nltk.classify.util.apply_features()

* http://www.nltk.org/_modules/nltk/classify/util.html#apply_features

In [None]:
import nltk
# Generate the training set
training_set = nltk.classify.util.apply_features(extract_features, tweets)

In [None]:
# Train the Naive Bayes classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

In [None]:
print NBClassifier.show_most_informative_features(10)

## Testing the classifier.

In [None]:
# Test the classifier
testTweet = 'Hate'
processedTestTweet = processTweet(testTweet)
sentiment = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet, stopWords)))
print "testTweet = %s, sentiment = %s\n" % (testTweet, sentiment)

Now that the classifier has been trained with the "train" portion of the labeled data, let's check how it does.

In [None]:
count = 0;

#featureList_test = []

tweets_v_test = [] 
tweets_test = []

for row in test.itertuples():
    tweet = row[3]
    processedTweet = processTweet(tweet)
    sentiment = NBClassifier.classify(extract_features(getFeatureVector(processedTweet, stopWords)))
    tweets_v_test.append((tweet, sentiment))

for row in test.itertuples():
    sentiment = row[6]
    tweet = row[3]
    tweets_test.append((tweet, sentiment))

In [None]:
df_v = pd.DataFrame(tweets_v_test)

df = pd.DataFrame(tweets_test)

In [None]:
def add_classes(df,colname):
    classdict = {'positive': 1, 'negative': 0}
    classes = [classdict[label] for label in df[colname]]
    df['class'] = classes
    return df

def add_classes2(df, colname):
    classdict = { 1: 0, 2: 1}
    classes = [classdict[label] for label in df[colname]]
    df['class'] = classes
    return df

In [None]:
df_v= add_classes2(df_v,1)
df = add_classes2(df,1)

In [None]:
import sklearn
print 'accuracy score is:', sklearn.metrics.accuracy_score(df['class'], df_v['class'], normalize=True, sample_weight=None)
print
print 'precision score is:', sklearn.metrics.precision_score(df['class'], df_v['class'])
print
print 'recall score is:', sklearn.metrics.recall_score(df['class'], df_v['class'])

In [None]:
precision = sklearn.metrics.precision_score(df['class'], df_v['class'])
recall = sklearn.metrics.recall_score(df['class'], df_v['class'])

F1 = 2 * (precision * recall) / (precision + recall)

print F1

In [None]:
import nltk
from sklearn import cross_validation
from sklearn import metrics 
#training_set = nltk.classify.apply_features(extract_features, documents)
training_set = nltk.classify.util.apply_features(extract_features, tweets)

cv = cross_validation.KFold(len(training_set), n_folds=5, shuffle=True, random_state=None)

In [None]:
accuracy = []
for trainidx, testidx in cv:
    traincurr = [training_set[idx] for idx in trainidx]
    testcurr = [training_set[idx] for idx in testidx]
    classifier = nltk.NaiveBayesClassifier.train(traincurr)
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testcurr)
    accuracy.append(nltk.classify.util.accuracy(classifier, testcurr))

In [None]:
np.mean(accuracy)

In [None]:
#sander_tweets = pd.read_csv('data/full-corpus.csv')
%matplotlib inline
#sander_tweets['Sentiment'].unique()
training_tweets['rating.1'].value_counts()
#sander_tweets['Sentiment'].hist()

In [None]:
#import csv 
#import re

#Read the tweets one by one and process it

#inpTweets = csv.reader(open('data/full-corpus.csv', 'rb'), delimiter=',', quotechar='|')
#inpTweets = csv.reader(open('data/AFINN-111.csv', 'rb'))
#inpTweets = csv.reader(open('data/full-corpus.csv', 'rb'))
#inpTweets = csv.reader(open('data/debate08_sentiment_tweets.csv','rb'))

#inpTweets = csv.reader(open('data/train_tweets.csv','rU'),delimiter = ',',dialect='excel')
#inpTweets = csv.reader(open('data/train_tweets.csv','rb'))

#stopWords = getStopWordList('data/feature_list/stopwords.txt')

#count = 0;
#featureList = []
#tweets = [] 

#for row in inpTweets:
    
#    try:
#        sentiment = row[6]
#        tweet = row[3]
#        processedTweet = processTweet(tweet)
#        featureVector = getFeatureVector(processedTweet, stopWords)
#        featureList.extend(featureVector)
#        tweets.append((featureVector, sentiment));
#    except:
#        count +=1
        
#end loop


In [None]:
#tweets_v_test2 = []
#tweets_test2 =[]

#for row in test2.itertuples():
#    tweet3 = row[3]
#    processedTweet = processTweet(tweet3)
#    sentiment = NBClassifier.classify(extract_features(getFeatureVector(processedTweet, stopWords)))
#    tweets_v_test2.append((tweet3, sentiment))

#for row in test2.itertuples():
#    sentiment3 = row[6]
#    tweet3 = row[3]
#    tweets_test2.append((tweet3, sentiment3))

In [None]:
#tweets_v_test2 = []
#tweets_test2 =[]

#for row in test2.itertuples():
#    tweet3 = row[3]
#    processedTweet = processTweet(tweet3)
#    sentiment = NBClassifier.classify(extract_features(getFeatureVector(processedTweet, stopWords)))
#    tweets_v_test2.append((tweet3, sentiment))

#for row in test2.itertuples():
#    sentiment3 = row[6]
#    tweet3 = row[3]
#    tweets_test2.append((tweet3, sentiment3))

In [None]:
#accuracy = []
#for traincv, testcv in cv:
#    classifier = nltk.NaiveBayesClassifier.train(training_set[traincv[0]:traincv[len(traincv)-1]])
#    print 'accuracy:', nltk.classify.util.accuracy(classifier, training_set[testcv[0]:testcv[len(testcv)-1]])
#    accuracy.append(nltk.classify.util.accuracy(classifier, training_set[testcv[0]:testcv[len(testcv)-1]]))