In [1]:
import pandas
import os

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC



In [54]:
# Build a confusion matrix and
# calculate evaluation metrics using it
def summarize_metrics(tp, tn, fp, fn):
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    f1_score = (2 * recall * precision) / (recall + precision)

    print("Precison:", precision)
    print("Recall:", recall)
    print("Accuracy:", accuracy)
    print("F1 score:", f1_score)

In [3]:
if __name__ == "__main__":
    status_data = pandas.read_csv("mypersonality_final.csv")

    NEG_INDEX = 2
    POS_INDEX = 3
    NEU_INDEX = 4
    COMP_INDEX = 5

    # Annotate the status with sentiment scores
    # From nltk.sentiment.vader corpus
    if not os.path.isfile("mypersonality_cleaned.csv"):
        status_data.insert(NEG_INDEX, "sentiNEG", 0)
        status_data.insert(POS_INDEX, "sentiPOS", 0)
        status_data.insert(NEU_INDEX, "sentiNEU", 0)
        status_data.insert(COMP_INDEX, "sentiCOMPOUND", 0)

        sid = SentimentIntensityAnalyzer()
        count = 0
        for row in status_data.itertuples():
            """
            pos: positive
            neg: negative
            neu: neutral
            compound: aggregated score for the sentence
            """
            ss = sid.polarity_scores(row.STATUS)
            status_data.iloc[count, NEG_INDEX] = ss["neg"]
            status_data.iloc[count, POS_INDEX] = ss["pos"]
            status_data.iloc[count, NEU_INDEX] = ss["neu"]
            status_data.iloc[count, COMP_INDEX] = ss["compound"]
            count += 1

        status_data.to_csv("mypersonality_cleaned.csv")
    else:
        status_data = pandas.read_csv("mypersonality_cleaned.csv")

In [5]:
# Drop NAs
status_data = status_data.dropna()

In [7]:
# We drop columns which give us a score for personality type
status_data = status_data.drop(['STATUS', '#AUTHID', 'sEXT', 'sNEU', 'sAGR',
                                    'sCON', 'sOPN', 'DATE'], axis=1)

In [8]:
# Drop non-normalized scores of Brokerage and Betweenness
status_data = status_data.drop(['BROKERAGE', 'BETWEENNESS', 'NBROKERAGE',
                                    'NBETWEENNESS', 'DENSITY', 'TRANSITIVITY', 'NETWORKSIZE'], axis=1)

In [9]:
status_data

Unnamed: 0.1,Unnamed: 0,sentiNEG,sentiPOS,sentiNEU,sentiCOMPOUND,cEXT,cNEU,cAGR,cCON,cOPN
0,0,0.000,0.412,0.588,0.4215,n,y,n,n,y
1,1,0.167,0.000,0.833,-0.3412,n,y,n,n,y
2,2,0.195,0.278,0.527,0.6280,n,y,n,n,y
3,3,0.000,0.259,0.741,0.4215,n,y,n,n,y
4,4,0.000,0.592,0.408,0.4404,n,y,n,n,y
5,5,0.000,0.000,1.000,0.0000,n,y,n,n,y
6,6,0.000,0.515,0.485,0.8916,n,y,n,n,y
7,7,0.000,0.000,1.000,0.0000,n,y,n,n,y
8,8,0.188,0.053,0.759,-0.6249,n,y,n,n,y
9,9,0.000,0.323,0.677,0.7351,n,y,n,n,y


In [10]:
# Change the name of first row from "Unknown" to "rowID"
new_columns = status_data.columns.values
new_columns[0] = "rowID"
status_data.columns = new_columns

In [11]:
# Put the columns to be predicted, at the end
cols = status_data.columns.tolist()
cols = cols[:5] + cols[5:10]
status_data = status_data[cols]

In [12]:
status_data

Unnamed: 0,rowID,sentiNEG,sentiPOS,sentiNEU,sentiCOMPOUND,cEXT,cNEU,cAGR,cCON,cOPN
0,0,0.000,0.412,0.588,0.4215,n,y,n,n,y
1,1,0.167,0.000,0.833,-0.3412,n,y,n,n,y
2,2,0.195,0.278,0.527,0.6280,n,y,n,n,y
3,3,0.000,0.259,0.741,0.4215,n,y,n,n,y
4,4,0.000,0.592,0.408,0.4404,n,y,n,n,y
5,5,0.000,0.000,1.000,0.0000,n,y,n,n,y
6,6,0.000,0.515,0.485,0.8916,n,y,n,n,y
7,7,0.000,0.000,1.000,0.0000,n,y,n,n,y
8,8,0.188,0.053,0.759,-0.6249,n,y,n,n,y
9,9,0.000,0.323,0.677,0.7351,n,y,n,n,y


In [13]:
# 'y' for 1 and 'n' for 0
features = ['cEXT', 'cNEU', 'cOPN', 'cAGR', 'cCON']
for feature in features:
    status_data[feature] = status_data[feature].map({'y': 1.0, 'n': 0.0}).astype(int)

In [14]:
# Split into training and test data: 66% and 33%
train_data, test_data = train_test_split(status_data, test_size=0.50)

train = train_data.values
test = test_data.values

In [15]:
# Build a classifier
# k is chosen to be square root of number of training example
model = KNeighborsClassifier(n_neighbors=250)
model = model.fit(train[0:, 1:5], train[0:, 7])

In [31]:
train[0]

array([  1.46200000e+03,   3.89000000e-01,   1.90000000e-01,
         4.21000000e-01,  -2.24400000e-01,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00])

In [16]:
# Predict
output = model.predict(test[:, 1:5])
rowID = [TEST.rowID for TEST in test_data.itertuples()]
result_df = pandas.DataFrame({"rowID": rowID,"cOPN": list(output)})

In [17]:
# Build the confusion matrix to assess the model
tp_count = 0
tn_count = 0
fp_count = 0
fn_count = 0
for row in rowID:
    test_cEXT_val = int(test_data.loc[test_data['rowID'] == row].cOPN)
    result_cEXT_val = int(result_df.loc[result_df['rowID'] == row].cOPN)
    if test_cEXT_val == 1:
        if result_cEXT_val == 1:
            tp_count += 1
        else:
            fn_count += 1
    else:
        if result_cEXT_val == 1:
            fp_count += 1
        else:
            tn_count += 1

print(tp_count, tn_count, fp_count, fn_count)
summarize_metrics(tp_count, tn_count, fp_count, fn_count)

3265 121 1148 424
Precison: 0.7398595060049853
Recall: 0.8850637029005151
Accuracy: 0.6829366680112948
F1 score: 0.8059738336213281


In [18]:
model_SVM = SVC()

In [19]:
model_SVM = model_SVM.fit(train[0:, 1:5], train[0:, 7])

In [20]:
# Predict_SVM
output_SVM = model_SVM.predict(test[:, 1:5])
rowID_SVM = [TEST.rowID for TEST in test_data.itertuples()]
result_df_SVM = pandas.DataFrame({"rowID": rowID,"cOPN": list(output_SVM)})

In [23]:
# Build the confusion matrix to assess the model
tp_count = 0
tn_count = 0
fp_count = 0
fn_count = 0
for row in rowID:
    test_cEXT_val = int(test_data.loc[test_data['rowID'] == row].cOPN)
    result_cEXT_val_SVM = int(result_df_SVM.loc[result_df['rowID'] == row].cOPN)
    if test_cEXT_val == 1:
        if result_cEXT_val_SVM == 1:
            tp_count += 1
        else:
            fn_count += 1
    else:
        if result_cEXT_val_SVM == 1:
            fp_count += 1
        else:
            tn_count += 1

print(tp_count, tn_count, fp_count, fn_count)
summarize_metrics(tp_count, tn_count, fp_count, fn_count)

3545 33 1236 144
Precison: 0.7414766785191382
Recall: 0.9609650311737599
Accuracy: 0.7216619604679306
F1 score: 0.8370720188902008


In [24]:
from sklearn.naive_bayes import GaussianNB
model_gnb = GaussianNB()

In [25]:
model_gnb = model_gnb.fit(train[0:, 1:5], train[0:, 7])

In [27]:
train

array([[  1.46200000e+03,   3.89000000e-01,   1.90000000e-01, ...,
          0.00000000e+00,   0.00000000e+00,   1.00000000e+00],
       [  4.54600000e+03,   2.71000000e-01,   0.00000000e+00, ...,
          0.00000000e+00,   1.00000000e+00,   0.00000000e+00],
       [  5.78300000e+03,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   1.00000000e+00,   0.00000000e+00],
       ..., 
       [  9.20000000e+03,   3.58000000e-01,   0.00000000e+00, ...,
          0.00000000e+00,   1.00000000e+00,   1.00000000e+00],
       [  9.84000000e+03,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   1.00000000e+00,   1.00000000e+00],
       [  8.50900000e+03,   0.00000000e+00,   0.00000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00]])

In [32]:
# Predict_gnb
output_gnb = model_gnb.predict(test[:, 1:5])
rowID_gnb = [TEST.rowID for TEST in test_data.itertuples()]
result_df_gnb = pandas.DataFrame({"rowID": rowID,"cOPN": list(output_gnb)})

In [33]:
# Build the confusion matrix to assess the model
tp_count = 0
tn_count = 0
fp_count = 0
fn_count = 0
for row in rowID:
    test_cEXT_val = int(test_data.loc[test_data['rowID'] == row].cOPN)
    result_cEXT_val_gnb = int(result_df_gnb.loc[result_df['rowID'] == row].cOPN)
    if test_cEXT_val == 1:
        if result_cEXT_val_gnb == 1:
            tp_count += 1
        else:
            fn_count += 1
    else:
        if result_cEXT_val_gnb == 1:
            fp_count += 1
        else:
            tn_count += 1

print(tp_count, tn_count, fp_count, fn_count)
summarize_metrics(tp_count, tn_count, fp_count, fn_count)

2935 240 1029 754
Precison: 0.7404137235116044
Recall: 0.795608566007048
Accuracy: 0.6403791851553046
F1 score: 0.7670194694890893


In [5]:
# Second Approach, using Doc2Vec and LSTM for predicting
import gensim
LabeledSentence = gensim.models.doc2vec.LabeledSentence



In [1]:
from os import listdir
from os.path import isfile, join
docLabels_y = []
docLabels_y = [f for f in listdir("./Dataset Processed/txt output_final/y") if f.endswith('.txt')]
docLabels_n = []
docLabels_n = [f for f in listdir("./Dataset Processed/txt output_final/n") if f.endswith('.txt')]

In [8]:
data = []
data_y = []
for doc in docLabels_y:
    f = open("./Dataset Processed/txt output_final/y/" + doc, 'r')
    data_y.append(f)
    data.append(f)
    f.close()
data_n = []
for doc in docLabels_n:
    f1 = open("./Dataset Processed/txt output_final/n/" + doc, 'r')
    data_n.append(f1)
    data.append(f1)
    f1.close()

In [19]:
# LabeledSentence = gensim.models.doc2vec.LabeledSentence  
  
# class LabeledLineSentence(object):  
#     def __init__(self, sentences):  
#         self.sentences = sentences  
#     def __iter__(self):  
#         for id, line in enumerate(sentences):  
#             yield LabeledSentence(words=line, tags=['SENT_%s' % id])  
#     def sentences_perm(self):
#         shuffle(self.sentences)
#         return self.sentences
              
# it = LabeledLineSentence(data)  

In [27]:
model = gensim.models.Doc2Vec(size=100, window=10, min_count=1, workers=11, alpha=0.025, min_alpha=0.025)  
model.build_vocab(it) 

2018-04-25 00:00:04,754 : INFO : collecting all words and their counts


NameError: name 'sentences' is not defined

In [22]:
import logging
import os
import sys
from random import shuffle
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

for epoch in range(100):
    logger.info('Epoch %d' % epoch)
    model.train(it.sentences_perm(),
                #total_examples=model.corpus_count,
                epochs=model.iter,
    )

model.save('./Doc2Vec_model.d2v')

2018-04-24 23:58:19,747 : INFO : running C:\Anaconda2\envs\tensorflow\lib\site-packages\ipykernel\__main__.py -f C:\Users\tianx\AppData\Roaming\jupyter\runtime\kernel-1525645a-5d42-4290-ad5f-0978609f7a3f.json
2018-04-24 23:58:19,752 : INFO : Epoch 0
2018-04-24 23:58:19,770 : INFO : training model with 11 workers on 0 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=10


RuntimeError: you must first build vocabulary before training the model

In [26]:
class LabeledLineSentence(object):

    def __init__(self, sources):
        self.sources = sources

        flipped = {}

        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')

    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(
                        utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences

    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

In [102]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# shuffle
from random import shuffle

# logging
import logging
import os.path
import sys
import _pickle as pickle
#import cPickle as pickle   #Note: in python3, _pickle was used instead of cpickle

program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

class LabeledLineSentence(object):

    def __init__(self, sources):
        self.sources = sources

        flipped = {}

        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')

    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(
                        utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences

    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

sources = {'./Dataset Processed/txt output_final/n/sum/n_all_test.txt':'TEST_NEG', './Dataset Processed/txt output_final/y/sum/y_all_test.txt':'TEST_POS', './Dataset Processed/txt output_final/n/sum/n_all_train.txt':'TRAIN_NEG', './Dataset Processed/txt output_final/y/sum/y_all_train.txt':'TRAIN_POS'}

sentences = LabeledLineSentence(sources)

model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences.to_array())

for epoch in range(100):
    logger.info('Epoch %d' % epoch)
    model.train(sentences.sentences_perm(),
                total_examples=model.corpus_count,
                epochs=model.iter,
    )

model.save('./imdb.d2v')

2018-04-25 14:42:12,740 : INFO : running C:\Anaconda2\envs\tensorflow\lib\site-packages\ipykernel\__main__.py -f C:\Users\tianx\AppData\Roaming\jupyter\runtime\kernel-2864b082-d451-44be-9948-6e712fdda201.json
2018-04-25 14:42:12,893 : INFO : collecting all words and their counts
2018-04-25 14:42:12,897 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-04-25 14:42:12,973 : INFO : collected 15770 word types and 5364 unique tags from a corpus of 5364 examples and 72630 words
2018-04-25 14:42:12,981 : INFO : Loading a fresh vocabulary
2018-04-25 14:42:13,096 : INFO : min_count=1 retains 15770 unique words (100% of original 15770, drops 0)
2018-04-25 14:42:13,103 : INFO : min_count=1 leaves 72630 word corpus (100% of original 72630, drops 0)
2018-04-25 14:42:13,282 : INFO : deleting the raw counts dictionary of 15770 items
2018-04-25 14:42:13,287 : INFO : sample=0.0001 downsamples 406 most-common words
2018-04-25 14:42:13,291 : INFO : downsampling leaves e

In [103]:
model.wv.most_similar('wow')

2018-04-25 14:48:12,542 : INFO : precomputing L2-norms of word weight vectors


[('orleans.', 0.6345005035400391),
 ('glow', 0.6184808015823364),
 ('sticks', 0.5736362934112549),
 ('funk...i', 0.5610934495925903),
 ('cod4mw2', 0.5446567535400391),
 ('frog.', 0.5442479848861694),
 ('soo', 0.5269519090652466),
 ('convos', 0.51676344871521),
 ('buddies...', 0.4994044899940491),
 ('simplest', 0.4900153875350952)]

In [3]:
model

<gensim.models.doc2vec.Doc2Vec at 0x29138053710>

In [4]:
model.most_similar('lol')

[('loser!!!!', 0.6319977045059204),
 ('booga', 0.6215949058532715),
 ('courage!', 0.6092036962509155),
 ('booga!', 0.6054763793945312),
 ('me!!!', 0.6025193929672241),
 ('burg...someone', 0.5988327264785767),
 ('stranded', 0.5970384478569031),
 ('Mulan...', 0.5725835561752319),
 ('???', 0.5678666234016418),
 ('boom', 0.5648619532585144)]

In [5]:
model.most_similar('yo')

[('numbers!', 0.783098578453064),
 ('needz', 0.7755323648452759),
 ('phone,', 0.7491334676742554),
 ('number?', 0.7489640712738037),
 ('fo', 0.7310982942581177),
 ('what?', 0.7217720746994019),
 ('birfday,', 0.7177667021751404),
 ('gotz', 0.6787748336791992),
 ('diamonds', 0.675055742263794),
 ('divide//', 0.6602415442466736)]

In [6]:
model.most_similar('ass')

[('haha..', 0.61693274974823),
 ('compell', 0.6105270385742188),
 ('scrabble', 0.5972521901130676),
 ('pack...we', 0.594215989112854),
 ('Yay...', 0.5926961898803711),
 ('Jesus:', 0.5818872451782227),
 ('Day.', 0.578163206577301),
 ('Columbus', 0.5687612295150757),
 ('gresy', 0.5616876482963562),
 ('boyfriend', 0.5551649332046509)]

In [7]:
model.most_similar('cool')

[('Independence.', 0.6609710454940796),
 ('Humboldt', 0.6597853899002075),
 ('declaration', 0.6468594670295715),
 ('wow...', 0.6215463876724243),
 ('stories...', 0.6150673031806946),
 ('Massachusetts.', 0.6051443219184875),
 ('involving', 0.6025294661521912),
 ('TJ', 0.5930542945861816),
 ('aching,', 0.5917971134185791),
 ('ease', 0.5917605757713318)]

In [8]:
model.most_similar('haha')

[('TRAGEDY!!!!!!!!!!!', 0.7178655862808228),
 ('car....yo!', 0.7166972160339355),
 ('claimed', 0.7024919986724854),
 ('frnt', 0.6944730281829834),
 ('evry1!!!!', 0.6852693557739258),
 ('rockin', 0.6767828464508057),
 ('statistics.', 0.6531293392181396),
 ('style', 0.6427193284034729),
 ('perform', 0.6240640878677368),
 ('door..mwawaahh', 0.6224106550216675)]

In [108]:
train_arrays = numpy.zeros((3578,100))
train_labels = numpy.zeros((3578))
for i in range(1627):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    train_arrays[i] = model.docvecs[prefix_train_pos]
    train_labels[i] = 1
for i in range(1949):
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i+1627] = model.docvecs[prefix_train_neg]
    train_labels[i+1627] = 0

In [10]:
model.docvecs['TRAIN_NEG_1949']

array([-5.3739661e-01, -2.1580108e-01, -4.8335126e-01,  1.1482675e+00,
       -4.7941580e-02, -4.4821692e-01,  3.9039350e-01, -1.2100271e+00,
       -2.7058744e-01,  4.6899474e-01, -7.1310796e-02, -7.3290646e-01,
        3.4350872e-01, -9.1766286e-01, -8.0835754e-01, -6.1130118e-02,
        6.5094030e-01,  1.5644893e-01,  9.8188169e-02,  2.9968953e-01,
       -9.5501977e-01, -4.3682599e-01, -5.8097118e-01, -6.4757466e-01,
       -4.0122551e-01,  3.5669097e-01,  3.6208892e-01,  2.3384538e-01,
       -3.6113268e-01,  1.4055296e-03,  2.6255829e-02,  2.3620425e-02,
       -3.5107261e-01, -1.4189466e+00,  3.8090906e-01, -5.7261753e-01,
        7.8484339e-01, -6.0194635e-01,  7.6612008e-01, -4.1130096e-01,
       -5.9421396e-01, -4.5939643e-02,  6.4430970e-01,  6.5054297e-01,
       -3.9242661e-01, -1.8527058e-01,  9.4514400e-01,  4.1067389e-01,
       -2.3936620e-01,  1.1376140e+00,  6.2980860e-01,  6.9525754e-01,
       -4.5010954e-01,  1.2126705e-01, -6.7568219e-01, -3.6877517e-02,
      

In [11]:
print(train_labels[1628])

0.0


In [12]:
print(train_arrays[0])

[ 0.80370075  0.38103873 -0.41746134  0.28001085  0.12257605 -0.27522427
 -0.0759928   0.44818515  0.1857688  -0.58987349  0.98279178 -0.37254193
  0.69479209 -0.15024538 -0.49857754  0.13502854  0.16639599 -0.14383537
  0.21795294 -0.09338123 -0.62064594 -0.35703933  0.41458052 -0.79077935
 -0.57588673 -0.33772352 -1.13289046  0.28825644 -1.06246459 -0.04594202
  0.85050446  0.65740472  0.04554173 -0.65110093  0.01392455 -0.00168079
  0.01303594 -0.55539489  0.28322324 -0.03220263 -0.17849761 -0.55766553
  0.67388767 -1.28418255 -0.62294739  0.09353312 -0.70558155 -0.82815075
 -0.74368787  0.45761847  0.19435069  0.36589673 -1.32936645 -0.04594028
 -0.15314017 -0.48406509 -0.2448221   0.0884188   0.11982979 -0.07147586
 -0.48553157  1.28627694  0.18354133  0.16572593  0.71938622  0.07535198
 -0.03131915 -1.17586052 -0.45244339 -0.53905445 -0.6917423  -0.23952721
  0.70727831 -0.68113184 -0.76739806 -0.61074823  0.9355523  -0.76929212
  0.29574177 -0.11356615 -0.2456612  -0.02541027 -0

In [109]:
test_arrays = numpy.zeros((1790,100))
test_labels = numpy.zeros((1790))
for i in range(814):
    prefix_test_pos = 'TEST_POS_' + str(i)
    test_arrays[i] = model.docvecs[prefix_test_pos]
    test_labels[i] = 1
for i in range(974):
    prefix_test_neg = 'TEST_NEG_' + str(i)
    test_arrays[i+814] = model.docvecs[prefix_test_neg]
    test_labels[i+814] = 0

In [14]:
print(test_labels[815])

0.0


In [128]:
# LSTM setup
max_features = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
maxlen = 120
batch_size = 64
nb_classes = 2   # remaining question?

In [111]:
print('train shape:', train_arrays.shape)
print('test shape:', test_arrays.shape)

train shape: (3578, 100)
test shape: (1790, 100)


In [112]:
print('train label shape:', train_labels.shape)
print('test label shape:', test_labels.shape)

train label shape: (3578,)
test label shape: (1790,)


In [28]:
test_labels[48]

1.0

In [36]:
from keras.utils import np_utils
Y_train = np_utils.to_categorical(train_labels, nb_classes)
Y_test = np_utils.to_categorical(test_labels, nb_classes)

In [37]:
Y_train

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [38]:
train_labels

array([1., 1., 1., ..., 0., 0., 0.])

In [42]:
test_arrays

array([[-0.4495672 , -0.25423512, -0.56111175, ...,  0.29188001,
        -0.06568915,  0.37901598],
       [-0.56589675,  0.6087445 , -0.00931993, ...,  0.32722384,
         0.59142751,  0.05790269],
       [-0.55978441,  0.29878759, -0.20974684, ...,  0.07978857,
         0.49747965,  0.24056254],
       ...,
       [-0.29563537,  0.79836869, -0.34482062, ...,  0.28694791,
         0.2336712 ,  0.08923047],
       [-0.61417621,  0.45285723, -0.38387924, ...,  0.17011005,
         0.41078898, -0.38659084],
       [ 0.20200892,  0.3076812 ,  0.72610861, ..., -0.64545828,
        -0.08853067, -0.0299416 ]])

In [142]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Lambda
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU

print('Build LSTM model...')
model = Sequential()
model.add(Embedding(100, 128,dropout=0.2))
# model.add(Embedding(
# #                     output_dim=EMBEDDING_DIM,
# #                     input_dim=100,
#                     input_length = maxlen,
#                     dropout=0.2))

model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2)) 
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(train_arrays, Y_train, batch_size=batch_size, nb_epoch=100,
          validation_data=(test_arrays, Y_test))
score, acc = model.evaluate(test_arrays,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


print("Generating test predictions...")
preds = model.predict_classes(test_arrays, verbose=0)

Build LSTM model...




ValueError: The first layer in a Sequential model must get an `input_shape` or `batch_input_shape` argument.

In [117]:
# Build a classifier
# k is chosen to be square root of number of training example
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
model = KNeighborsClassifier(n_neighbors=250)
model = model.fit(train_arrays, train_labels)
model.score(test_arrays, test_labels)

0.5631284916201117

In [118]:
model_svm_vector = SVC()
model_svm_vector = model_svm_vector.fit(train_arrays, train_labels)
model_svm_vector.score(test_arrays, test_labels)

0.5363128491620112

In [99]:
# test_labels[0]
# result_df_svm_vector[0][2]
# result_df_svm_vector.loc[0]
# output_svm_vector[1]
# len(test_labels)
len(output_svm_vector)

1790

In [119]:
# Build the confusion matrix to assess the model
import pandas
output_svm_vector = model_svm_vector.predict(test_arrays)
result_df_svm_vector = pandas.DataFrame(output_svm_vector)
# print(test_labels)
# print(result_df_svm_vector)

tp_count = 0
tn_count = 0
fp_count = 0
fn_count = 0
for i in range(0,1789):
    test_cEXT_val_vector = int(test_labels[i])
    result_cEXT_val_vector = int(output_svm_vector[i])
    if test_cEXT_val_vector == 1:
        if result_cEXT_val_vector == 1:
            tp_count += 1
        else:
            fn_count += 1
    else:
        if result_cEXT_val_vector == 1:
            fp_count += 1
        else:
            tn_count += 1

print(tp_count, tn_count, fp_count, fn_count)
summarize_metrics(tp_count, tn_count, fp_count, fn_count)




294 665 310 520
Precison: 0.4867549668874172
Recall: 0.36117936117936117
Accuracy: 0.5360536612632756
F1 score: 0.41466854724964736


In [121]:
# Build a KNN classifier
# k is chosen to be square root of number of training example
model_knn_vector = KNeighborsClassifier(n_neighbors=250)
model_knn_vector = model_knn_vector.fit(train_arrays, train_labels)
model_knn_vector.score(test_arrays, test_labels)

0.5631284916201117

In [122]:
# Build the confusion matrix to assess the model
import pandas
output_knn_vector = model_knn_vector.predict(test_arrays)
result_df_knn_vector = pandas.DataFrame(output_knn_vector)
# print(test_labels)
# print(result_df_svm_vector)

tp_count = 0
tn_count = 0
fp_count = 0
fn_count = 0
for i in range(0,1789):
    test_cEXT_val_vector = int(test_labels[i])
    result_cEXT_val_vector = int(output_knn_vector[i])
    if test_cEXT_val_vector == 1:
        if result_cEXT_val_vector == 1:
            tp_count += 1
        else:
            fn_count += 1
    else:
        if result_cEXT_val_vector == 1:
            fp_count += 1
        else:
            tn_count += 1

print(tp_count, tn_count, fp_count, fn_count)
summarize_metrics(tp_count, tn_count, fp_count, fn_count)

107 900 75 707
Precison: 0.5879120879120879
Recall: 0.13144963144963145
Accuracy: 0.562884292901062
F1 score: 0.21485943775100402


In [123]:
from sklearn.naive_bayes import GaussianNB
model_gnb_vector = GaussianNB()
model_gnb_vector = model_gnb_vector.fit(train_arrays, train_labels)
model_gnb_vector.score(test_arrays, test_labels)

0.4212290502793296

In [124]:
# Build the confusion matrix to assess the model
import pandas
output_gnb_vector = model_gnb_vector.predict(test_arrays)
result_df_gnb_vector = pandas.DataFrame(output_gnb_vector)
# print(test_labels)
# print(result_df_svm_vector)

tp_count = 0
tn_count = 0
fp_count = 0
fn_count = 0
for i in range(0,1789):
    test_cEXT_val_vector = int(test_labels[i])
    result_cEXT_val_vector = int(output_gnb_vector[i])
    if test_cEXT_val_vector == 1:
        if result_cEXT_val_vector == 1:
            tp_count += 1
        else:
            fn_count += 1
    else:
        if result_cEXT_val_vector == 1:
            fp_count += 1
        else:
            tn_count += 1

print(tp_count, tn_count, fp_count, fn_count)
summarize_metrics(tp_count, tn_count, fp_count, fn_count)

478 276 699 336
Precison: 0.40611724723874254
Recall: 0.5872235872235873
Accuracy: 0.42146450531022916
F1 score: 0.48016072325464587


In [126]:
# Try CNN
nb_filter = 250
filter_length = 3
hidden_dims = 250
nb_epoch = 2

In [127]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
# we add a Convolution1D, which will learn nb_filter
# word group filters of size filter_length:
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1))

def max_1d(X):
    return K.max(X, axis=1)

model.add(Lambda(max_1d, output_shape=(nb_filter,)))
model.add(Dense(hidden_dims)) 
model.add(Dropout(0.2)) 
model.add(Activation('relu'))
model.add(Dense(nb_classes))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Build model...


  app.launch_new_instance()


NameError: name 'Convolution1D' is not defined

In [143]:
# Using another approach for LSTM
max_features = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
maxlen = 80
batch_size = 32
nb_classes = 2

In [184]:
status_data = pandas.read_csv("./Dataset Processed/mypersonality_final_classifiedByClass_onlyColumn.csv",encoding='cp1252')
# Sort according to EXT, y first, no latter

In [185]:
# Drop NAs
status_data = status_data.dropna()

In [186]:
status_data

Unnamed: 0,STATUS
0,"is stuck on Band-Aid brand, cuz Band-Aid's stu..."
1,Just going to grab some raw fish................
2,"saw HP6... funny, lots of awesome awkward sile..."
3,Need to re-learn my patterns again... awesome...
4,FREE SLURPEE DAY!!!
5,has GOT to stop waking up at 1pm...
6,is not feeling exactly top-notch...
7,will have too many notifications at the next l...
8,First day of school done! finances are going t...
9,is tired and for some reason is looking forwar...


In [187]:
type(status_data)

pandas.core.frame.DataFrame

In [188]:
status_data.ix[0:4675,0]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


0       is stuck on Band-Aid brand, cuz Band-Aid's stu...
1        Just going to grab some raw fish................
2       saw HP6... funny, lots of awesome awkward sile...
3        Need to re-learn my patterns again... awesome...
4                                     FREE SLURPEE DAY!!!
5                     has GOT to stop waking up at 1pm...
6                     is not feeling exactly top-notch...
7       will have too many notifications at the next l...
8       First day of school done! finances are going t...
9       is tired and for some reason is looking forwar...
10                      2 months of hell yet again... FML
11      got to play with fire in front of a crowd... Y...
12                            is in love with Ren Fest...
13                                    3 DAYS!!!!!!!!!!!!!
14                                    What a great day!!!
15        loves the Pirate English version of Facebook!!!
16                                        has a choice...
17            

In [223]:
# Split into training and test data: 66% and 33%
from sklearn.model_selection import train_test_split
# train_data, test_data = train_test_split(status_data, test_size=0.50)

train_data = status_data.ix[0:3500-1,0]
test_data = status_data.ix[3500-1:4998,0]

train = train_data.values
test = test_data.values

In [206]:
train_labels_new = numpy.zeros((3500))
test_labels_new = numpy.zeros((1500))
for i in range(3500):
    train_labels_new[i] = 1
for i in range(1500):
    test_labels_new[i] = 0

In [207]:
len(train_labels_new)

3500

In [230]:
len(train_data)

3500

In [231]:
len(test_data)

1500

In [229]:
len(test_labels_new)

1500

In [232]:
train_data

0       is stuck on Band-Aid brand, cuz Band-Aid's stu...
1        Just going to grab some raw fish................
2       saw HP6... funny, lots of awesome awkward sile...
3        Need to re-learn my patterns again... awesome...
4                                     FREE SLURPEE DAY!!!
5                     has GOT to stop waking up at 1pm...
6                     is not feeling exactly top-notch...
7       will have too many notifications at the next l...
8       First day of school done! finances are going t...
9       is tired and for some reason is looking forwar...
10                      2 months of hell yet again... FML
11      got to play with fire in front of a crowd... Y...
12                            is in love with Ren Fest...
13                                    3 DAYS!!!!!!!!!!!!!
14                                    What a great day!!!
15        loves the Pirate English version of Facebook!!!
16                                        has a choice...
17            

In [233]:
test_data

3499    is about to go for a thankgiving hike with her...
3500    is thankful for a warm fire, good wine and her...
3501    downtown or international party? hmmm decision...
3502    can't believe she's leaving the lodge in the m...
3503    how can I be expected to work when there is so...
3504    A simple smile can change a day, an understand...
3505    had a really stressful day at work but got a 4...
3506       just finished her last quarter at UCSB. Crazy.
3507    is very happy with her grades and even happier...
3508    come with me, come with me, we'll travel to in...
3509    folk and tribal arts festival at the museum of...
3510    is exhausted but so ready to see her ladies to...
3511    just got back from the reel rock film show and...
3512    is applying for the Peace Corps//Master's Inte...
3513    you don't need no reason nor a three peice sui...
3514    is overwhelmed by data... and this is supposed...
3515    finally got her computer back in shape! Open o...
3516    roast 

In [234]:
# vectorize the text samples into a 2D integer tensor
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.models import Sequential

tokenizer = Tokenizer(nb_words=max_features)
tokenizer.fit_on_texts(train_data)
sequences_train = tokenizer.texts_to_sequences(train_data)
sequences_test = tokenizer.texts_to_sequences(test_data)



In [235]:
len(train_labels)

3500

In [236]:
print('Pad sequences (samples x time)')
from keras.preprocessing import sequence
X_train_new = sequence.pad_sequences(sequences_train, maxlen=maxlen)
X_test_new = sequence.pad_sequences(sequences_test, maxlen=maxlen)

Y_train = np_utils.to_categorical(train_labels_new, nb_classes)
Y_test = np_utils.to_categorical(test_labels_new, nb_classes)


print('X_train_new shape:', X_train_new.shape)
print('X_test_new shape:', X_test_new.shape)

Pad sequences (samples x time)
X_train_new shape: (3500, 80)
X_test_new shape: (1500, 80)


In [238]:
# Epoch = 1
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2)) 
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(X_train_new, Y_train, batch_size=batch_size, nb_epoch=1,
          validation_data=(X_test_new, Y_test))
score, acc = model.evaluate(X_test_new, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


print("Generating test predictions...")
preds = model.predict_classes(X_test, verbose=0)

Build model...


  app.launch_new_instance()


Train...




Train on 3500 samples, validate on 1500 samples
Epoch 1/1
Test score: 4.715363881429036
Test accuracy: 0.5999999642372131
Generating test predictions...


In [239]:
# Epoch = 100
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2)) 
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(X_train_new, Y_train, batch_size=batch_size, nb_epoch=100,
          validation_data=(X_test_new, Y_test))
score, acc = model.evaluate(X_test_new, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


print("Generating test predictions...")
preds = model.predict_classes(X_test, verbose=0)

Build model...


  app.launch_new_instance()


Train...




Train on 3500 samples, validate on 1500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

In [240]:
# Epoch = 1
# Expanding batch size
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2)) 
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(X_train_new, Y_train, batch_size=64, nb_epoch=1,
          validation_data=(X_test_new, Y_test))
score, acc = model.evaluate(X_test_new, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


print("Generating test predictions...")
preds = model.predict_classes(X_test, verbose=0)

Build model...




Train...




Train on 3500 samples, validate on 1500 samples
Epoch 1/1
Test score: 4.318496852874756
Test accuracy: 0.5999999642372131
Generating test predictions...


In [241]:
# Epoch = 1
# Without dropout
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128)) 
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(X_train_new, Y_train, batch_size=64, nb_epoch=1,
          validation_data=(X_test_new, Y_test))
score, acc = model.evaluate(X_test_new, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


print("Generating test predictions...")
preds = model.predict_classes(X_test, verbose=0)

Build model...
Train...




Train on 3500 samples, validate on 1500 samples
Epoch 1/1
Test score: 4.780679239908854
Test accuracy: 0.5999999642372131
Generating test predictions...


In [244]:
# Epoch = 1
# Without dropout
print('Build model...')
model = Sequential()
model.add(Embedding(20000, 64))
model.add(LSTM(64)) 
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(X_train_new, Y_train, batch_size=64, nb_epoch=1,
          validation_data=(X_test_new, Y_test))
score, acc = model.evaluate(X_test_new, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


print("Generating test predictions...")
preds = model.predict_classes(X_test, verbose=0)

Build model...
Train...




Train on 3500 samples, validate on 1500 samples
Epoch 1/1
Test accuracy: 0.5999999642372131
Generating test predictions...
