In [1]:
import pandas
pandas.set_option('display.max_colwidth', 200)

In [2]:
tweets = pandas.read_csv('data/train.csv')

In [638]:
tweets

Unnamed: 0,Id,Category,Tweet
0,635769805279248384,negative,Not Available
1,635930169241374720,neutral,IOS 9 App Transport Security. Mm need to check if my 3rd party network pod supports it http://t.co/fmtcfUAdgj
2,635950258682523648,neutral,"Mar if you have an iOS device, you should download our app too: http://t.co/gl3tn2uDnD"
3,636030803433009153,negative,@jimmie_vanagon my phone does not run on latest IOS which may account for problem the other day .. time it was replaced
4,636100906224848896,positive,Not sure how to start your publication on iOS? We'll be live helping with ask me anything sessions today and Friday http://t.co/KPqqGjjh3x
...,...,...,...
5965,639016598477651968,neutral,@YouAreMyArsenal Wouldn't surprise me if we enquired.He can't be 100% happy playing 2nd fiddle to Zlatan but he's not worth PSG asking price
5966,640276909633486849,neutral,Rib injury for Zlatan against Russia is a big blow if he misses Austria game Tuesday. A chance for new Sunderland striker Toivonen #SAFC
5967,640296841725235200,neutral,"Noooooo! I was hoping to see Zlatan being Zlatan in Tuesday! Oh well, still looking forward to the match. https://t.co/swGyd9cQAJ"
5968,641017384908779520,neutral,Not Available


In [3]:
tweets.Category.value_counts(normalize=True)

positive    0.483752
neutral     0.355946
negative    0.160134
Tweet       0.000168
Name: Category, dtype: float64

In [4]:
def cleanup(tweets):
    tweets = tweets.dropna()
    tweets = tweets.drop(columns=['Id'])
    tweets = tweets[tweets.Category != 'Tweet']
    tweets = tweets[tweets.Tweet != 'Not Available']
    tweets = tweets[tweets.Tweet != '']
    return tweets

In [5]:
tweets = cleanup(tweets)
tweets

Unnamed: 0,Category,Tweet
1,neutral,IOS 9 App Transport Security. Mm need to check if my 3rd party network pod supports it http://t.co/fmtcfUAdgj
2,neutral,"Mar if you have an iOS device, you should download our app too: http://t.co/gl3tn2uDnD"
3,negative,@jimmie_vanagon my phone does not run on latest IOS which may account for problem the other day .. time it was replaced
4,positive,Not sure how to start your publication on iOS? We'll be live helping with ask me anything sessions today and Friday http://t.co/KPqqGjjh3x
5,neutral,"Two Dollar Tuesday is here with Forklift 2, QuickKey for iOS and Suite for Pages for just $1.99 today: http://t.co/BNMFOEACw5"
...,...,...
5963,positive,"Ok ed let's do this, Zlatan, greizmann and Laporte tomorrow make it happen"
5964,neutral,Goal level: Zlatan 90k by Friday? = Posting every other day #DSGS (Vine by @ElexAuerbach) https://t.co/BPUM3A8tSD
5965,neutral,@YouAreMyArsenal Wouldn't surprise me if we enquired.He can't be 100% happy playing 2nd fiddle to Zlatan but he's not worth PSG asking price
5966,neutral,Rib injury for Zlatan against Russia is a big blow if he misses Austria game Tuesday. A chance for new Sunderland striker Toivonen #SAFC


In [6]:
categories = tweets.Category.unique()
tweets.Category.value_counts(normalize=True)

positive    0.479432
neutral     0.360266
negative    0.160303
Name: Category, dtype: float64

In [7]:
import re
from nltk.tokenize.casual import casual_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
import string

stops = {'in', 'of', 'at', 'a', 'the', 'to', 'on', 'and', 'it'}
stops.update(string.punctuation)
stops.difference_update('?!')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def tag_for_lemmatizer(tag):
    if tag.startswith('NN'):
        return 'n'
    if tag.startswith('VB'):
        return 'v'
    return 'a'

def preprocess(text, lemmatize=True):
    if not text or type(text) != str:
        return ''

    text = text.lower()
    text = re.sub(r"https?://[^\s]+", '', text) # hyperlinks
    text = re.sub(r"\@\w+", '', text) # mentions
    text = re.sub(r"#", '', text) # hashtags
    text = re.sub(r"\d+\w*", '', text) # numbers
    text = re.sub(r"'s", '', text) # possesive
    text = re.sub(r"n't", ' not', text) # contractions
    
    words = [word for word in casual_tokenize(text) if word not in stops]
    
    if lemmatize:
        words = [
            lemmatizer.lemmatize(word, tag_for_lemmatizer(tag))
            for word, tag in pos_tag(words)
        ]
    else:
        words = [
            stemmer.stem(word)
            for word in words
        ]
    text = ' '.join(words)
    return text

In [8]:
tweets.Tweet = tweets.Tweet.apply(preprocess)
tweets

Unnamed: 0,Category,Tweet
1,neutral,io app transport security mm need check if my party network pod support
2,neutral,mar if you have an ios device you should download our app too
3,negative,my phone do not run late io which may account for problem other day .. time be replace
4,positive,not sure how start your publication io ? we'll be live help with ask me anything session today friday
5,neutral,two dollar tuesday be here with forklift quickkey for io suite for page for just today
...,...,...
5963,positive,ok ed let do this zlatan greizmann laporte tomorrow make happen
5964,neutral,goal level zlatan by friday ? post every other day dsgs vine by
5965,neutral,would not surprise me if we enquired.he ca not be happy play fiddle zlatan but he not worth psg ask price
5966,neutral,rib injury for zlatan against russia be big blow if he miss austria game tuesday chance for new sunderland striker toivonen safc


In [9]:
import collections

def most_common_words(texts):
    counter = collections.Counter()
    for text in texts:
        words = {word for word in casual_tokenize(text) if word not in stops}
        counter.update(words)
    return counter.most_common(50)

for category in categories:
    subset = tweets[tweets.Category == category].Tweet
    print(category)
    print(most_common_words(subset))
    print()

neutral
[('be', 890), ('i', 408), ('may', 391), ('not', 371), ('have', 353), ('for', 352), ('...', 350), ('with', 314), ('?', 294), ('you', 279), ('that', 272), ('do', 266), ('tomorrow', 238), ('but', 192), ('go', 178), ('just', 175), ('will', 164), ('he', 163), ('my', 158), ('!', 157), ('if', 150), ('get', 149), ('day', 149), ('this', 148), ('about', 132), ('say', 131), ('like', 129), ('so', 125), ('time', 115), ('me', 111), ('from', 110), ('out', 110), ('as', 107), ('make', 106), ('what', 103), ('or', 99), ('sunday', 97), ('new', 96), ('up', 92), ('one', 91), ('see', 90), ('all', 89), ('watch', 87), ('an', 85), ('we', 85), ('by', 85), ('want', 83), ('when', 83), ('think', 83), ("i'm", 83)]

negative
[('be', 460), ('not', 229), ('may', 213), ('i', 204), ('have', 180), ('do', 150), ('for', 138), ('that', 137), ('with', 118), ('you', 117), ('...', 115), ('?', 111), ('my', 104), ('!', 95), ('just', 95), ('but', 90), ('tomorrow', 90), ('this', 83), ('get', 81), ('he', 79), ('like', 73), (

In [648]:
tweets.sample(5)

Unnamed: 0,Category,Tweet
5549,neutral,baylee i've have boyfriend for every valentine day since grade hallie mine opposite ..
1299,neutral,day after labor day be just monday that lie about be tuesday
1749,neutral,i'm not sure which be funny coloured-vinyl reissue mariah carey christmas album or fact that come out october
5437,positive,credit phenomenal retired english teacher jan knispel from valentine for great analogy #mohreng
5528,neutral,i know for lot people valentine day but for some just saturday


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# def resample(train):
#     n = 500
#     return pandas.concat([
#         train[train.Category == category].sample(n, replace=False)
#         for category in categories
#     ])

def train_model(train, **kwargs):
    vectorizer = TfidfVectorizer(binary=False, stop_words=stops, ngram_range=(1, 3))
    classifier = LinearSVC(class_weight='balanced', **kwargs)
    
    train_bow = vectorizer.fit_transform(train.Tweet)
    classifier.fit(train_bow, train.Category)
    
    return vectorizer, classifier

def accuracy(train, test, **kwargs):
    vectorizer, classifier = train_model(train, **kwargs)
    
    test_bow = vectorizer.transform(test.Tweet)
    predictions = classifier.predict(test_bow)
    hits = (predictions == test.Category).sum()
    total = test.shape[0]
    return hits / total

In [12]:
import numpy as np
cv_folds = 5
batches = np.random.randint(0, cv_folds, len(tweets))

best_c = None
best_accuracy = 0
for c in [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10]:
    accuracies = []
    for i in range(cv_folds):
        train_idx = batches != i
        test_idx = batches == i
        acc = accuracy(tweets[train_idx], tweets[test_idx], C=c) # solver='newton-cg', multi_class='multinomial'
        accuracies.append(acc)

    mean = np.mean(accuracies)
    print(f"{c}, acc={mean}")
    if mean > best_accuracy:
        best_c = c
        best_accuracy = mean

print(f"c*={best_c}")

0.01, acc=0.5206080255617336
0.02, acc=0.554269271751949
0.05, acc=0.5717186684204584
0.1, acc=0.5767035935808987
0.2, acc=0.5800167447964578
0.5, acc=0.5838875012572424
1, acc=0.5777356340824322
2, acc=0.5791851231844538
5, acc=0.576581498797921
10, acc=0.5760463644296849
c*=0.5


In [389]:
# 0.5, acc=0.5838875012572424

In [24]:
vectorizer, classifier = train_model(tweets, C=best_c)

In [25]:
test_tweets = pandas.read_csv('data/test.csv', dtype={'Id': str, 'Tweet': str})
test_tweets.Tweet = test_tweets.Tweet.apply(preprocess)
test_bow = vectorizer.transform(test_tweets.Tweet)
test_tweets['Category'] = classifier.predict(test_bow)

In [31]:
print(test_tweets.Category.value_counts(normalize=True))

test_tweets[['Tweet', 'Category']].tail(20)

positive    0.50975
neutral     0.39175
negative    0.09850
Name: Category, dtype: float64


Unnamed: 0,Tweet,Category
3980,trynna go ihop but all my friend be asleep so i may just go by myself whenindoubtpancakeitout,positive
3981,ihop wit bubba jay tomorrow he good eat too,positive
3982,emt krakatoa ]: so when high point your saturday be chorizo omelette ihop for late dinn ...,positive
3983,who down go ihop with me tomorrow morning ?,neutral
3984,me either ... but we get off early friday so some coworkers go ihop ... rest be history,positive
3985,definitely go ihop tomorrow,positive
3986,get pay next friday so i get buy koko i some food from ihop after game,positive
3987,dude i swear friday i be go go take you ihop but danny have go home cu he be pain,positive
3988,if i didnt have work tomorrow i really would man bring your family ihop tomorrow morning illserve you,positive
3989,i'm go ihop tomorrow,positive


In [16]:
test_tweets.Category.value_counts(normalize=True)

positive    0.50975
neutral     0.39175
negative    0.09850
Name: Category, dtype: float64

In [23]:
def classify(text):
    text = preprocess(text)
    test_bow = vectorizer.transform([text])
    predictions = classifier.predict(test_bow)
    return predictions[0]

phrases = ["I loved it!", "I don't know what to say", "What a fucking piece of shit"]
{ phrase: classify(phrase) for phrase in phrases }

ValueError: X has 101392 features per sample; expecting 123915

In [18]:
feature_to_coef = {
    word: coef for word, coef in zip(
        vectorizer.get_feature_names(), classifier.coef_[0]
    )
}

for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)

for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)

('fuck', 2.2756533856175727)
('parenthood', 1.8303820030786722)
('plan parenthood', 1.6755022527605148)
('not', 1.670608196986015)
('monsanto', 1.6181405229143497)
('best', -1.136194646180196)
('seinfeld', -1.1082058924890865)
('good', -0.968385358699419)
('friday', -0.9614132949795304)
('new', -0.9606476501187343)


In [19]:
with open('predictions.csv', 'wt') as f:
    f.write("Id,Category\n")
    for i, row in test_tweets.iterrows():
        f.write(f"{row.Id},{row.Category}\n")

# Vectorizer + neural network

In [20]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout, Bidirectional, Flatten
from sklearn.preprocessing import OneHotEncoder

def train_neural_net(df, train_idx):
    train = df[train_idx]

    vectorizer = TfidfVectorizer(binary=False, ngram_range=(1, 3))
    train_bow = vectorizer.fit_transform(train.Tweet)
    
    model = Sequential([
        Dense(8, input_shape=(len(vectorizer.vocabulary_),), activation='relu'),
        Dropout(0.3),
        Dense(8, activation='relu'),
        Dense(3, activation='softmax')
    ])
    model.summary()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    batch_size = 64
    num_epochs = 10
    
    y = np.column_stack([
        1 * (train.Category == category)
        for category in categories
    ])
    print(y.shape, train_bow.shape)
    model.fit(train_bow, y, batch_size=batch_size, epochs=num_epochs)
    return vectorizer, model

train_idx = batches != 0
test_idx = batches == 0

vectorizer, nn = train_neural_net(tweets, batches != 0)

Using TensorFlow backend.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8)                 811144    
_________________________________________________________________
dropout_1 (Dropout)          (None, 8)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 27        
Total params: 811,243
Trainable params: 811,243
Non-trainable params: 0
_________________________________________________________________
(4312, 3) (4312, 101392)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix

test_bow = vectorizer.transform(tweets[test_idx].Tweet)
predictions = categories[nn.predict_classes(test_bow)]

print(accuracy_score(tweets[test_idx].Category, predictions))

confusion_matrix(tweets[test_idx].Category, predictions)

# hits = (predictions == ).sum()
# total = tweets[test_idx].shape[0]
# hits / total

0.5256988277727682


array([[ 14, 143,  27],
       [ 10, 257, 115],
       [  5, 226, 312]])

In [719]:
vectorizer, nn = train_neural_net(tweets)
test_bow = vectorizer.transform(test_tweets.Tweet)
test_tweets['Category'] = categories[nn.predict_classes(test_bow)]

with open('predictions.csv', 'wt') as f:
    f.write("Id,Category\n")
    for i, row in test_tweets.iterrows():
        f.write(f"{row.Id},{row.Category}\n")

Model: "sequential_44"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_79 (Dense)             (None, 8)                 992104    
_________________________________________________________________
dropout_8 (Dropout)          (None, 8)                 0         
_________________________________________________________________
dense_80 (Dense)             (None, 8)                 72        
_________________________________________________________________
dense_81 (Dense)             (None, 3)                 27        
Total params: 992,203
Trainable params: 992,203
Non-trainable params: 0
_________________________________________________________________
(5421, 3) (5421, 124012)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Recurrent neural network

In [738]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

In [811]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets.Tweet)

dir(tokenizer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_keras_api_names',
 '_keras_api_names_v1',
 'char_level',
 'document_count',
 'filters',
 'fit_on_sequences',
 'fit_on_texts',
 'get_config',
 'index_docs',
 'index_word',
 'lower',
 'num_words',
 'oov_token',
 'sequences_to_matrix',
 'sequences_to_texts',
 'sequences_to_texts_generator',
 'split',
 'texts_to_matrix',
 'texts_to_sequences',
 'texts_to_sequences_generator',
 'to_json',
 'word_counts',
 'word_docs',
 'word_index']

In [740]:
sequences = tokenizer.texts_to_sequences(tweets.Tweet)

max_words = max(len(sequence) for sequence in sequences)
sequences = sequence.pad_sequences(sequences, maxlen=max_words)

In [800]:
from keras import Sequential
from keras.layers import Input, Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional

vocab_size = len(tokenizer.word_index) + 1
embedding_size = 128

# model = Sequential([
#     Embedding(vocab_size, 128, input_shape=(max_words,)),
#     LSTM(64),
#     Dense(16, activation='relu'),
#     Dropout(0.5),
#     Dense(16, activation='relu'),
#     Dense(3, activation='softmax')
# ])

model = Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=max_words))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))

model.summary()

Model: "sequential_71"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_36 (Embedding)     (None, 32, 128)           1182976   
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 32, 128)           0         
_________________________________________________________________
lstm_20 (LSTM)               (None, 100)               91600     
_________________________________________________________________
dense_153 (Dense)            (None, 3)                 303       
Total params: 1,274,879
Trainable params: 1,274,879
Non-trainable params: 0
_________________________________________________________________


In [759]:
categories_1h = np.column_stack([
    1 * (tweets.Category == category)
    for category in categories
])

In [814]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(sequences, categories_1h)
# x_train, x_valid, y_train, y_valid = train_test_split(x_train0, y_train0)

In [824]:
[tokenizer.index_word[idx] for idx in x_train[0,:] if idx > 0]
categories[y_test.argmax(1)[0]]

'positive'

In [801]:
batch_size = 64
num_epochs = 10

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=num_epochs)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x16c723c18>

In [802]:
# categories[]
predictions = model.predict_classes(x_test)
true_labels = y_test.argmax(1)

accuracy_score(true_labels, predictions)

0.551622418879056