In [0]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from nltk.tokenize import sent_tokenize, word_tokenize
from keras.models import Sequential
from keras.utils.np_utils import to_categorical
from keras.layers import Dense
from keras.optimizers import Adam

In [33]:
# Fetch the NLTK resources used by later cells: tokenizer models for
# word_tokenize, the perceptron model behind pos_tag, the stop-word lists,
# and the WordNet data used by WordNetLemmatizer.
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
# Load the labelled tweets. header=None makes pandas treat the first line as
# data and label the columns with integers. A later cell unpacks every row as
# a (text, sentiment) pair, so the file is expected to have exactly two columns.
tweets = pd.read_csv("mydata.csv",header = None)

In [0]:
# Convert the DataFrame to a NumPy array so each row can be iterated and
# unpacked as a pair in the next cell.
documents = np.array(tweets)

In [0]:
# Tokenize every tweet and keep its label alongside: (tokens, sentiment).
# The rows are read from `tweets` rather than from `documents` itself so the
# cell is idempotent: re-running it no longer passes already-tokenized lists
# back into word_tokenize.
documents = [(word_tokenize(text), sentiment) for text, sentiment in np.array(tweets)]

In [37]:
# Sanity check: lower-case a sample sentence and display the tokens that
# word_tokenize produces for it.
sample_text = "Does This thing really work? Lets see."
words = word_tokenize(sample_text.lower())
words

['does', 'this', 'thing', 'really', 'work', '?', 'lets', 'see', '.']

In [0]:
# Single shared lemmatizer instance; clean_review calls it for every kept token.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [0]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the leading letter of ``tag`` is examined: ``J`` maps to adjective,
    ``V`` to verb, ``N`` to noun and ``R`` to adverb.  Any other tag falls
    back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [40]:
# Demo: tag a single word on its own and display the tag pos_tag assigns to it.
from nltk import pos_tag
w = 'better'
pos_tag([w])

[('better', 'RBR')]

In [41]:
from nltk.corpus import stopwords
import string
# Tokens dropped during cleaning: the English stop-word list, every character
# of string.punctuation, and the literal token 'http'.
punctuations = list(string.punctuation)
stops = set(stopwords.words('english')).union(punctuations, ['http'])
stops

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'http',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself

In [0]:
def clean_review(words):
    """Drop stop words / punctuation from a token list and lemmatize the rest.

    Args:
        words: iterable of string tokens belonging to one document.

    Returns:
        A list with the lemmatized form of every token whose lower-cased
        form is not in ``stops``, in the original order.  Tokens keep their
        original casing when passed to the lemmatizer.
    """
    tokens = list(words)
    output_words = []
    # Tag the whole token sequence in one call. The tagger uses neighbouring
    # tokens as context, so tagging one-word lists throws that information
    # away (and invokes the tagger once per token instead of once per document).
    for w, tag in pos_tag(tokens):
        if w.lower() in stops:
            continue
        clean_word = lemmatizer.lemmatize(w, pos=get_simple_pos(tag))
        output_words.append(clean_word)
    return output_words

In [0]:
# Replace each document's raw tokens with their cleaned, lemmatized form; the
# label is carried through unchanged.
# NOTE(review): this rebinds `documents` from itself, so re-running the cell
# runs clean_review over tokens that were already cleaned.
documents = [(clean_review(document),category) for document,category in documents]

In [0]:
import random
# Shuffle in place with a dedicated, seeded generator so that a fresh
# Restart & Run All always yields the same train/test split (the global
# `random` state is left untouched).
random.Random(42).shuffle(documents)

In [0]:
# The first 27000 shuffled documents are used for training; everything after
# that index is held out for testing.
TRAIN_SIZE = 27000
training_doc = documents[:TRAIN_SIZE]
testing_doc = documents[TRAIN_SIZE:]

In [0]:
# Flatten the token lists of all training documents into one list so word
# frequencies can be counted over the training set only.
all_words = [token for doc in training_doc for token in doc[0]]

In [0]:
# Vocabulary = the 500 most frequent training tokens, most frequent first.
freq = nltk.FreqDist(all_words)
common = freq.most_common(500)
features = [token for token, _count in common]

In [63]:
# Peek at the five most frequent vocabulary entries.
# NOTE(review): the recorded output lists 'http', yet the stop set built above
# contains 'http' and clean_review filters on it -- the saved output was
# presumably produced from an earlier kernel state; confirm by re-running.
features[0:5]

['Modi', 'http', 'RT', "'s", 'PM']

In [0]:
def get_feature_dict(words, feature_words=None):
    """Build a bag-of-words presence mapping for one document.

    Args:
        words: iterable of hashable tokens belonging to one document.
        feature_words: iterable of vocabulary words to test for.  When left
            as ``None`` the module-level ``features`` list is used (looked up
            at call time), which is the original behaviour.

    Returns:
        dict mapping each vocabulary word to ``True`` if it occurs in
        ``words`` and ``False`` otherwise.  Keys follow vocabulary order.
    """
    if feature_words is None:
        feature_words = features
    # A set gives O(1) membership checks for each vocabulary word.
    word_set = set(words)
    return {w: w in word_set for w in feature_words}

In [0]:
# Pair every training document's word-presence dict with its label.
training_data = [(get_feature_dict(doc),category) for doc,category in training_doc]

In [0]:
def prepare_dataset(d,cat):
  """Turn one feature dict and its label into model-ready lists.

  Returns a pair ``(x, y)``.  ``x`` holds a 1 for every truthy value of
  ``d`` and a 0 for every falsy one, in the dict's iteration order.  ``y``
  is a one-element list containing ``bool(cat)``.
  """
  indicators = [1 if present else 0 for present in d.values()]
  label = [bool(cat)]
  return indicators, label

In [0]:
# Vectorize every training example: one 0/1 row and one single-element label
# list per document, collected into parallel lists.
vectorized_train = [prepare_dataset(doc, category) for doc, category in training_data]
x_train = [row for row, _target in vectorized_train]
y_train = [target for _row, target in vectorized_train]

In [0]:
# Convert the Python lists to NumPy arrays for Keras. Each entry of y_train is
# the one-element list returned by prepare_dataset, so the label array is 2-D
# with a single column.
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

In [0]:
def model(input_dim=500, hidden_units=64):
  """Build and compile the binary classifier used for the tweets.

  Args:
    input_dim: length of each input row.  The default of 500 matches the
      number of vocabulary words selected with ``most_common(500)``.
    hidden_units: width of the single hidden ReLU layer.

  Returns:
    A compiled ``Sequential`` network: one hidden Dense layer followed by a
    single sigmoid unit, trained with binary cross-entropy and the Adam
    optimizer, reporting accuracy.
  """
  # Use a local name distinct from the function so the builder is not
  # shadowed inside its own body.
  net = Sequential()
  net.add(Dense(hidden_units, activation='relu', input_dim=input_dim))
  net.add(Dense(1, activation='sigmoid'))
  net.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
  return net

In [70]:
# One-hot encode the labels and display the result.
# NOTE(review): y_train1 is not referenced by any later cell shown here (the
# model is fitted on y_train directly), so this looks like a leftover
# experiment -- confirm before relying on it.
from keras.utils import to_categorical
y_train1 = to_categorical(y_train)
y_train1

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [71]:
# Build the network and train it for 20 epochs on the vectorized training set.
# NOTE(review): the assignment rebinds the name `model` from the builder
# function to the network it returned, so the builder is no longer reachable
# under that name once this cell has run.
model = model()
model.fit(x_train,y_train,epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fc96121aba8>

In [0]:
# Pair every held-out document's word-presence dict with its label, using the
# same vocabulary that was derived from the training documents.
testing_data = [(get_feature_dict(doc),category) for doc,category in testing_doc]

In [0]:
# Vectorize the held-out examples the same way as the training set, then
# convert both lists to NumPy arrays for evaluation.
vectorized_test = [prepare_dataset(doc, category) for doc, category in testing_data]
x_test = np.asarray([row for row, _target in vectorized_test])
y_test = np.asarray([target for _row, target in vectorized_test])

In [86]:
# Evaluate on the held-out set. The model was compiled with
# metrics=['accuracy'], so evaluate returns the loss followed by the accuracy.
score, acc = model.evaluate(x_test, y_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.006453535990817273
Test accuracy: 0.9965555555555555
