In [15]:
import nltk
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from  nltk.stem import SnowballStemmer
import pandas as pd
import re, string
from nltk import classify
from nltk import NaiveBayesClassifier
# nltk.download('twitter_samples')

In [None]:
# Normalize the words
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')

In [2]:
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"

In [3]:
df = pd.read_csv('training_tweets/training_tweets.csv', encoding =DATASET_ENCODING , names=DATASET_COLUMNS)

In [6]:
decode_map = {0: 'NEGATIVE', 2: 'NEUTRAL', 4: 'POSITIVE'}
def decode_sentiment(label):
    return decode_map[int(label)]
df.target = df.target.apply(lambda x: decode_sentiment(x))

In [7]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [8]:
def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', str(text).lower()).strip()
    token = re.sub("(@[A-Za-z0-9_]+)","", str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)
df.text = df.text.apply(lambda x: preprocess(x))

In [9]:
def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [10]:
pos_df = []
neg_df = []
pos_df = df[df['target'] == 'POSITIVE']
neg_df = df[df['target'] == 'NEGATIVE']

In [11]:
pos_tok = []
for i in range(0, len(pos_df)):
    pos_tok.append(remove_noise(word_tokenize(pos_df['text'].iloc[i]), stop_words))
neg_tok = []
for i in range(0, len(neg_df)):
    neg_tok.append(remove_noise(word_tokenize(neg_df['text'].iloc[i]), stop_words))

def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(pos_tok)
negative_tokens_for_model = get_tweets_for_model(neg_tok)

In [63]:
positive_tokens_for_model = get_tweets_for_model(pos_tok)
negative_tokens_for_model = get_tweets_for_model(neg_tok)

In [61]:
dataset

{'POSITIVE': '@coldkash',
 'NEGATIVE': '@joshgeeksix maybe sci fi scoop up? always great shows get canceled.'}

In [49]:
neg_tok = []
for i in range(0, len(neg_df)):
    neg_tok.append(remove_noise(word_tokenize(neg_df['text'].iloc[i]), stop_words))
negative_tokens_for_model = get_tweets_for_model(neg_tok)

In [64]:
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]
dataset = positive_dataset + negative_dataset

In [65]:
len(dataset)

1600000

In [66]:
import random
random.shuffle(dataset)
train_data = dataset[:1280000]
test_data = dataset[1280000:]

In [67]:
len(test_data)

320000

In [68]:
classifier = NaiveBayesClassifier.train(train_data)

In [71]:
import pickle
f = open('classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

In [69]:
classify.accuracy(classifier, test_data)

0.7610125

In [70]:
print("Accuracy is:", classify.accuracy(classifier, test_data))
print(classifier.show_most_informative_features(10))

Accuracy is: 0.7610125
Most Informative Features
                 sadface = True           Negati : Positi =     56.4 : 1.0
                 me..its = True           Positi : Negati =     53.7 : 1.0
              banksyart2 = True           Positi : Negati =     50.9 : 1.0
                     447 = True           Negati : Positi =     47.1 : 1.0
                  farrah = True           Negati : Positi =     44.7 : 1.0
                dividend = True           Positi : Negati =     44.3 : 1.0
                  triste = True           Negati : Positi =     37.0 : 1.0
                 *cries* = True           Negati : Positi =     35.0 : 1.0
                     os3 = True           Negati : Positi =     33.7 : 1.0
                 saddens = True           Negati : Positi =     33.0 : 1.0
None


In [75]:
custom_tweet = "he is a racist"

custom_tokens = remove_noise(word_tokenize(custom_tweet))

print(classifier.classify(dict([token, True] for token in custom_tokens)))

Negative
