In [1]:
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import string
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import re

def load_data(file_path):
    """Read a UTF-8 encoded CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed file, or ``None`` when reading fails (the error is
        printed instead of being raised).
    """
    try:
        return pd.read_csv(file_path, encoding='utf-8')
    except Exception as e:
        # The exception must be bound to a name: the handler used to print an
        # undefined `e`, so every read failure surfaced as a NameError that
        # hid the real cause.
        print(e)
        return None
    
# 4000 most common english words: https://github.com/pkLazer/password_rank/blob/master/4000-most-common-english-words-csv.csv
file_path = "./4000-most-common-english-words.csv"
# Use the loaded frame directly: appending it to an empty DataFrame added
# nothing, and DataFrame.append no longer exists in pandas >= 2.0.
# Only the first 2000 entries of the 'Words' column are kept as the vocabulary
# consulted by isEnglish.
common_words = list(load_data(file_path)['Words'])[:2000]
    
def isEnglish(sent, vocabulary=None, threshold=0.15):
    """Heuristically decide whether a tokenized sentence is English.

    A sentence counts as English when the fraction of its tokens found in
    the vocabulary is at least ``threshold``.

    Parameters
    ----------
    sent : sequence of str
        Tokens of one sentence / tweet.
    vocabulary : container of str, optional
        Known English words. Defaults to the notebook-level ``common_words``
        list. Passing a ``set`` makes each membership test constant time.
    threshold : float, optional
        Minimum share of known tokens required (default 0.15, the value that
        used to be hard-coded).

    Returns
    -------
    bool
        ``False`` for an empty sentence, otherwise whether the share of known
        tokens reaches ``threshold``.
    """
    size = len(sent)
    if size == 0:
        return False
    if vocabulary is None:
        # Looked up at call time so the notebook can rebuild `common_words`
        # without redefining this function.
        vocabulary = common_words
    known = sum(1 for word in sent if word in vocabulary)
    return known / size >= threshold

# Compiled once instead of on every call. Written as a raw string so that
# \w, \d, \+ and \. are regex escapes rather than invalid Python string
# escapes (which trigger warnings on modern Python); the pattern text handed
# to `re` is unchanged.
_LINK_REGEX = re.compile(r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)', re.DOTALL)


def removeLinks(tweet):
    """Drop URL tokens from a tokenized tweet.

    Parameters
    ----------
    tweet : iterable of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        The tokens, in order, that do not start with an ``http:`` / ``https:``
        scheme followed by ``//`` or a backslash.
    """
    # match() anchors at the start of the token, so only tokens that begin
    # with a link are removed.
    return [w for w in tweet if not _LINK_REGEX.match(w)]
    
'''
Save and load objects to the disk
'''
import pickle
## Save to disk
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The file is opened in binary write mode (replacing any existing file) and
    the object is serialized with the highest pickle protocol available.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object pickled in the file ``name + '.pkl'``.

    The stream is decoded with ``encoding='latin1'``.

    NOTE: unpickling can execute arbitrary code; only load files that this
    notebook (or another trusted source) produced.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle, encoding='latin1')
    return restored



#### Filtered

In [9]:
# Airline-sentiment experiment WITH preprocessing: filter, tokenize and clean
# the tweets, then compare three classifiers on tf-idf features.
print("Reading tweets file...")
file_path = "./twitter-airline-sentiment/Tweets.csv"
# Use the loaded frame directly: appending it to an empty DataFrame added
# nothing, and DataFrame.append no longer exists in pandas >= 2.0.
tweets = load_data(file_path)
# Get only useful columns (copied so the column assignments below work on an
# independent frame instead of a view of the raw data)
tweets = tweets[['text','airline_sentiment', 'airline']].copy()
print(len(tweets), "tweets")

print("Removing Retweets and short tweets...")
# Remove retweets and short tweets
# NOTE(review): contains("RT") is a case-sensitive substring test, so it also
# drops tweets where "RT" merely occurs inside another word (e.g. "AIRPORT").
# Left unchanged so the tweet counts stay comparable with earlier runs.
tweets["text"] = tweets["text"].astype('str')
mask = ((tweets["text"].str.len() > 20) & ~(tweets["text"].str.contains("RT")))
tweets = tweets.loc[mask].copy()
print(len(tweets), "tweets")

print("Tokenizing...")
t_tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
tweets['tokenized'] = tweets['text'].apply(t_tokenizer.tokenize)

print("Removing Punctuations...")
punctuations = list(string.punctuation)
# Single set lookup per token instead of scanning two lists.
ignored_tokens = set(punctuations) | {'...', '..'}
tweets['tokenized'] = tweets['tokenized'].apply(lambda row: [word for word in row if word not in ignored_tokens])

print("Removing non English tweets...")
tweets['isEnglish'] = tweets['tokenized'].apply(isEnglish)
mask = (tweets['isEnglish'] == True)
tweets = tweets.loc[mask].copy()
print(len(tweets), "tweets")

print("Removing links...")
tweets['tokenized'] = tweets['tokenized'].apply(removeLinks)

print("Removing Stopwords...")
# Build the stopword set once; the list used to be re-created by
# stopwords.words('english') for every single token.
english_stopwords = set(stopwords.words('english'))
tweets['tokenized'] = tweets['tokenized'].apply(lambda row: [word for word in row if word not in english_stopwords])

print("Creating labels list...")
labels = list(tweets["airline_sentiment"])

# 80/20 split with a fixed seed so the scores are reproducible.
train_tokens, test_tokens, train_labels, test_labels = train_test_split(tweets['tokenized'], labels, test_size=0.2, random_state=0)

print("Creating train corpus...")
# TfidfVectorizer expects strings, so the token lists are re-joined.
corpus = [" ".join(sent) for sent in train_tokens]

print("tf-idf Vectorizing corpus...")
vv = TfidfVectorizer(norm = None)
train_features = vv.fit_transform(corpus)

print("Creating test corpus...")
test_corpus = [" ".join(sent) for sent in test_tokens]

# The vocabulary / idf weights come from the training corpus only.
test_features = vv.transform(test_corpus)

# Fit and score the three classifiers with one shared routine; the model
# variables stay available for later cells.
mnb = MultinomialNB()
knn = KNeighborsClassifier()
rf =  RandomForestClassifier(random_state=0)
for title, model in (("\nMultinomial Naive Bayes", mnb),
                     ("\nKNeighborsClassifier", knn),
                     ("\nRandomForestClassifier", rf)):
    print(title)
    model.fit(train_features, train_labels)
    pred = model.predict(test_features)
    f1_score = metrics.f1_score(y_pred=pred, y_true=test_labels, average='micro')
    print("F1 score: ", f1_score)

Reading tweets file...
14640 tweets
Removing Retweets and short tweets...
14328 tweets
Tokenizing...
Removing Punctuations...
Removing non English tweets...
14034 tweets
Removing links...
Removing Stopwords...
Creating labels list...
Creating train corpus...
tf-idf Vectorizing corpus...
Creating test corpus...

Multinomial Naive Bayes
F1 score:  0.742073387959

KNeighborsClassifier
F1 score:  0.601353758461

RandomForestClassifier
F1 score:  0.756679729248


#### Unfiltered

In [8]:
# Airline-sentiment baseline WITHOUT filtering or token cleaning: the tweets
# are only tokenized before the same three classifiers are compared.
print("Reading tweets file...")
file_path = "./twitter-airline-sentiment/Tweets.csv"
# Use the loaded frame directly: appending it to an empty DataFrame added
# nothing, and DataFrame.append no longer exists in pandas >= 2.0.
tweets = load_data(file_path)
# Get only useful columns (copied so the new column below is added to an
# independent frame)
tweets = tweets[['text','airline_sentiment', 'airline']].copy()
print(len(tweets), "tweets")

print("Tokenizing...")
t_tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
tweets['tokenized'] = tweets['text'].apply(t_tokenizer.tokenize)

print("Creating labels list...")
labels = list(tweets["airline_sentiment"])

# 80/20 split with a fixed seed so the scores are reproducible.
train_tokens, test_tokens, train_labels, test_labels = train_test_split(tweets['tokenized'], labels, test_size=0.2, random_state=0)

print("Creating train corpus...")
# TfidfVectorizer expects strings, so the token lists are re-joined.
corpus = [" ".join(sent) for sent in train_tokens]

print("tf-idf Vectorizing corpus...")
vv = TfidfVectorizer(norm = None)
train_features = vv.fit_transform(corpus)

print("Creating test corpus...")
test_corpus = [" ".join(sent) for sent in test_tokens]

# The vocabulary / idf weights come from the training corpus only.
test_features = vv.transform(test_corpus)

# Fit and score the three classifiers with one shared routine; the model
# variables stay available for later cells.
mnb = MultinomialNB()
knn = KNeighborsClassifier()
rf =  RandomForestClassifier(random_state=0)
for title, model in (("\nMultinomial Naive Bayes", mnb),
                     ("\nKNeighborsClassifier", knn),
                     ("\nRandomForestClassifier", rf)):
    print(title)
    model.fit(train_features, train_labels)
    pred = model.predict(test_features)
    f1_score = metrics.f1_score(y_pred=pred, y_true=test_labels, average='micro')
    print("F1 score: ", f1_score)

Reading tweets file...
14640 tweets
Tokenizing...
Creating labels list...
Creating train corpus...
tf-idf Vectorizing corpus...
Creating test corpus...

Multinomial Naive Bayes
F1 score:  0.769467213115

KNeighborsClassifier
F1 score:  0.45662568306

RandomForestClassifier
F1 score:  0.741803278689


### Using sentiment140 Dataset 

In [3]:
# Preprocess the sentiment140 training set and cache the result on disk so
# the expensive cleaning does not have to be repeated.
print("Reading tweets file...")
file_path = "./sentiment140/training.csv"
# Use the loaded frame directly: appending it to an empty DataFrame added
# nothing, and DataFrame.append no longer exists in pandas >= 2.0.
tweets = load_data(file_path)
# Get only useful columns (copied so the column assignments below work on an
# independent frame instead of a view of the raw data)
tweets = tweets[['text','polarity']].copy()
print(len(tweets), "tweets")

print("Removing Retweets and short tweets...")
# Remove retweets and short tweets
# NOTE(review): contains("RT") is a case-sensitive substring test, so it also
# drops tweets where "RT" merely occurs inside another word (e.g. "AIRPORT").
# Left unchanged so the tweet counts stay comparable with earlier runs.
tweets["text"] = tweets["text"].astype('str')
mask = ((tweets["text"].str.len() > 20) & ~(tweets["text"].str.contains("RT")))
tweets = tweets.loc[mask].copy()
print(len(tweets), "tweets")

print("Tokenizing...")
t_tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
tweets['tokenized'] = tweets['text'].apply(t_tokenizer.tokenize)

print("Removing Punctuations...")
punctuations = list(string.punctuation)
# Single set lookup per token instead of scanning two lists.
ignored_tokens = set(punctuations) | {'...', '..'}
tweets['tokenized'] = tweets['tokenized'].apply(lambda row: [word for word in row if word not in ignored_tokens])

print("Removing non English tweets...")
tweets['isEnglish'] = tweets['tokenized'].apply(isEnglish)
mask = (tweets['isEnglish'] == True)
tweets = tweets.loc[mask].copy()
print(len(tweets), "tweets")

print("Removing links...")
tweets['tokenized'] = tweets['tokenized'].apply(removeLinks)

print("Removing Stopwords")
# Build the stopword set once; the list used to be re-created by
# stopwords.words('english') for every single token, which dominates the
# runtime on a corpus of this size.
english_stopwords = set(stopwords.words('english'))
tweets['tokenized'] = tweets['tokenized'].apply(lambda row: [word for word in row if word not in english_stopwords])

# Cache the cleaned frame as ./tweets140.pkl
save_obj(tweets, './tweets140')

Reading tweets file...
1600000 tweets
Removing Retweets and short tweets...
1533936 tweets
Tokenizing...
Removing Punctuations...
Removing non English tweets...
1469498 tweets
Removing links...
Removing Stopwords
                                                text  polarity  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...         0   
1  is upset that he can't update his Facebook by ...         0   
2  @Kenichan I dived many times for the ball. Man...         0   
3    my whole body feels itchy and like its on fire          0   
4  @nationwideclass no, it's not behaving at all....         0   

                                           tokenized  isEnglish  
0  [awww, that's, bummer, shoulda, got, david, ca...       True  
1  [upset, can't, update, facebook, texting, migh...       True  
2  [dived, many, times, ball, managed, save, 50, ...       True  
3            [whole, body, feels, itchy, like, fire]       True  
4                   [behaving, i'm, mad, can't, see]       T

In [2]:
# Restore the preprocessed sentiment140 frame from ./tweets140.pkl
tweets = load_obj('./tweets140')
# Print the first five rows as a quick sanity check
print(tweets.head(n=5))

                                                text  polarity  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...         0   
1  is upset that he can't update his Facebook by ...         0   
2  @Kenichan I dived many times for the ball. Man...         0   
3    my whole body feels itchy and like its on fire          0   
4  @nationwideclass no, it's not behaving at all....         0   

                                           tokenized  isEnglish  
0  [awww, that's, bummer, shoulda, got, david, ca...       True  
1  [upset, can't, update, facebook, texting, migh...       True  
2  [dived, many, times, ball, managed, save, 50, ...       True  
3            [whole, body, feels, itchy, like, fire]       True  
4                   [behaving, i'm, mad, can't, see]       True  


In [3]:
# Build train/test tf-idf features for the sentiment140 tweets.
print("Creating labels list...")
labels = list(tweets["polarity"])

# 80/20 split with a fixed seed
train_tokens, test_tokens, train_labels, test_labels = train_test_split(
    tweets['tokenized'],
    labels,
    test_size=0.2,
    random_state=0,
)

print("Creating train corpus...")
# One space-joined string per tweet, as the vectorizer works on raw text
corpus = [" ".join(tokens) for tokens in train_tokens]

print("tf-idf Vectorizing corpus...")
vv = TfidfVectorizer(norm=None)
train_features = vv.fit_transform(corpus)

print("Creating test corpus...")
test_corpus = [" ".join(tokens) for tokens in test_tokens]

# Reuse the vectorizer fitted on the training corpus
test_features = vv.transform(test_corpus)

Creating labels list...
Creating train corpus...
tf-idf Vectorizing corpus...
Creating test corpus...


In [4]:
# Multinomial Naive Bayes on the sentiment140 tf-idf features
mnb = MultinomialNB()
mnb.fit(X=train_features, y=train_labels)
pred = mnb.predict(test_features)

# Micro-averaged F1 of the predictions against the held-out labels
f1_score = metrics.f1_score(test_labels, pred, average='micro')
print("F1 score: ", f1_score)

F1 score:  0.746430758761


In [None]:
# k-nearest-neighbours (default settings) on the sentiment140 tf-idf features
knn = KNeighborsClassifier()
knn.fit(X=train_features, y=train_labels)
pred = knn.predict(test_features)

# Micro-averaged F1 of the predictions against the held-out labels
f1_score = metrics.f1_score(test_labels, pred, average='micro')
print("F1 score: ", f1_score)

In [None]:
# Random forest (fixed seed) on the sentiment140 tf-idf features
rf = RandomForestClassifier(random_state=0)
rf.fit(X=train_features, y=train_labels)
pred = rf.predict(test_features)

# Micro-averaged F1 of the predictions against the held-out labels
f1_score = metrics.f1_score(test_labels, pred, average='micro')
print("F1 score: ", f1_score)