In [1]:
#import needed packages

#import needed packages
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

import pandas as pd
import spacy
import en_core_web_sm
import re
import string
import os
import sys
import codecs
import argparse
import json
import gzip
from collections import Counter
import numpy as np

# Machine learning imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# NLP imports
from spacy.lang.en import English
import nltk
from nltk.corpus import stopwords
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import Conv1D
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.backend import clear_session

In [2]:
#format the MyPersonality data

df = pd.read_csv ('MyPersonalityData.csv')

df = df.groupby('#AUTHID').agg({'cNEU':'first', 
                             'STATUS': ', '.join }).reset_index()

data = df.values.tolist()

In [3]:
#format the Essay data

df = pd.read_csv ('Essays.csv')
df = df.groupby('#AUTHID').agg({'cNEU':'first', 
                             str('TEXT'): ', '.join }).reset_index()

essay = df.values.tolist()

In [4]:
# Read from the data, returning parallel lists of documents and their labels

def read_and_clean_lines(data):   
    NeuScores = []
    Statuses = []

   
    for line in data:
        
        NeuScore = line[1]
        if NeuScore == 'y':
            NeuScore = 0
        else:
            NeuScore = 1
        
        Status = line[2]
        
        clean_text = re.sub(r"\s+"," ",Status)
        Statuses.append(clean_text)
        NeuScores.append(NeuScore)
        
    print("Read {} documents".format(len(Statuses)))
    print("Read {} labels".format(len(NeuScores)))
    return Statuses,NeuScores

In [5]:
# Read a set of stoplist words from data, assuming it contains one word per line
# Return a python Set data structure (https://www.w3schools.com/python/python_sets.asp)

def load_stopwords(filename):
    stopwords = []
    with codecs.open(filename, 'r', encoding='ascii', errors='ignore') as fp:
        stopwords = fp.read().split('\n')
    return set(stopwords)

In [6]:
# Call sklearn's train_test_split function to split the dataset into training items/labels and test items/labels.  

def split_training_set(lines, labels, test_size, random_seed=42):
    X_train, X_test, y_train, y_test = train_test_split(lines, labels, test_size=test_size, random_state=random_seed, stratify=labels)
    print("Training set label counts: {}".format(Counter(y_train)))
    print("Test set     label counts: {}".format(Counter(y_test)))
    return X_train, X_test, y_train, y_test

In [7]:
###################################################################################################################
# Adaptation of the main logistic regression experiment from Assignment 2, to use cross-validation.
###################################################################################################################
def run_experiment(Feature_Extraction_Method, input, stopwords_file, test_size, num_folds, stratify, random_seed):

    # Load stopwords
    stop_words = load_stopwords(stopwords_file)

    #Clean training/test data
    X, y = read_and_clean_lines(input)
    Len_X = len(X)   
    
    #split MyPeronality data
    X_train, X_test, y_train, y_test = split_training_set(X, y, test_size=test_size)
    
    return X_train, X_test, y_train, y_test
   
   

In [34]:
X_train, X_test, y_train, y_test = run_experiment("LIWC", data, "mallet_en_stoplist.txt", 0.5, 5, False, 1) 

Read 250 documents
Read 250 labels
Training set label counts: Counter({1: 76, 0: 49})
Test set     label counts: Counter({1: 75, 0: 50})


In [35]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [36]:
# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

maxlen = 400

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [37]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [38]:
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [56]:
#Convolutional Neural Network

model = Sequential()

embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)

model.add(Conv1D(150, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 400, 100)          991500    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 396, 150)          75150     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 150)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 151       
Total params: 1,066,801
Trainable params: 75,301
Non-trainable params: 991,500
_________________________________________________________________
None


In [57]:
#Convolutional Neural Network

X_train = np.asarray(X_train).astype('float32')
y_train = np.asarray(y_train).astype('float32')
X_test = np.asarray(X_test).astype('float32')
y_test = np.asarray(y_test).astype('float32')

clear_session()
history = model.fit(X_train, y_train, batch_size=150, epochs=25, verbose=1, validation_split=0.1)
score = model.evaluate(X_test, y_test, verbose=1)

print("Training Accuracy:", score[0])
print("Test Accuracy:", score[1])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Training Accuracy: 0.7269874215126038
Test Accuracy: 0.6000000238418579


In [49]:
# Recurrent Neural Network (LSTM)

model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)
model.add(LSTM(128))

model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 100)          991500    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               117248    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 1,108,877
Trainable params: 117,377
Non-trainable params: 991,500
_________________________________________________________________
None


In [50]:
# Recurrent Neural Network (LSTM)

clear_session()
history = model.fit(X_train, y_train, batch_size=150, epochs=25, verbose=1, validation_split=0.1)
score = model.evaluate(X_test, y_test, verbose=1)

print("Training Accuracy:", score[0])
print("Test Accuracy:", score[1])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Training Accuracy: 0.926833987236023
Test Accuracy: 0.5120000243186951


In [None]:
#https://stackabuse.com/python-for-nlp-movie-sentiment-analysis-using-deep-learning-in-keras/
#https://realpython.com/python-keras-text-classification/

In [60]:
for i in range(5000):
    start = random.randrange(1, 1000)
    end = random.randrange(start, 1000)
    print(start,end)
    

935 999
814 845
581 870
61 93
166 665
794 801
847 922
769 928
962 985
804 811
613 777
283 450
236 990
124 752
913 928
141 954
836 905
24 353
694 772
790 790
944 996
742 877
724 932
349 740
955 969
372 686
951 952
261 531
1 608
254 324
920 999
343 802
569 632
854 862
526 975
880 931
702 975
265 483
572 704
98 419
320 979
593 642
53 81
636 784
60 239
290 817
826 874
994 995
630 699
907 907
212 848
849 888
6 809
979 995
526 868
590 939
551 828
149 841
461 779
79 659
277 287
477 789
971 978
618 891
756 760
3 420
770 962
389 781
636 993
541 931
971 974
483 901
825 941
328 575
429 804
970 975
415 416
552 884
600 783
789 877
874 976
783 819
148 950
453 627
920 993
72 157
927 937
250 922
221 269
406 738
222 644
14 290
904 999
970 971
740 966
771 882
885 885
389 532
719 959
16 991
685 728
141 685
775 916
809 843
572 995
650 888
859 875
736 803
331 475
459 644
687 854
678 729
230 354
145 508
117 342
875 893
704 916
820 993
202 735
705 985
59 491
573 982
91 146
398 933
324 715
849 979
497 978
584