## Sources and Credits
This MLP model is based on Andrew Trask's Sentiment Analysis course on Udacity and the course materials he provided. 

## Load Data

In [1]:
g = open('mlptext.txt', 'r') #inputs
texts = list(map(lambda x:x[:-1].lower(),g.readlines()))
g.close()
g = open('mlptype.txt','r') #labels
types = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

## Tokenizing

In [2]:
from nltk.tokenize import TweetTokenizer
from string import punctuation

tknzr = TweetTokenizer()
quotes = []
for text in texts:
    tokens = tknzr.tokenize(text)
    quotes.append(' '.join([token for token in tokens if token not in punctuation])) 
texts = quotes

In [4]:
print("TYPE \t: TEXT")
for i in range(5):
    print(types[i] + "\t: " + texts[i])

TYPE 	: TEXT
MUNDANE	: the extensive literature addressed to the definition or characterization of science is filled with inconsistent points of view and demonstrates that an adequate definition is not easy to attain part of the difficulty arises from the fact that the meaning of science is not fixed but is dynamic as science has evolved so has its meaning it takes on a new meaning and significance with successive ages
VACUOUS	: even if you don't have all the things you want be grateful for the things you don't have that you don't want
VACUOUS	: you have to find the courage to live as you need to there will always be those who want you to be ordinary those who expect you to settle down your body can settle but you have to let your mind soar you have to hold onto the courage of your artistic convictions
VACUOUS	: a mother gives you a life a mother-in-law gives you her life
MUNDANE	: james shaw jr the man who wrested a rifle away from the waffle house gunman got emotional on tuesday when

## Stemming (optional)

In [5]:
from nltk.stem import SnowballStemmer

snowball = SnowballStemmer(language='english')
quotes = []
for text in texts:
    quote = []
    for token in text.split(' '):
        stem = snowball.stem(token)
        quote.append(stem)
    quotes.append(' '.join(quote))
texts= quotes

In [6]:
print("TYPE \t: TEXT")
for i in range(5):
    print(types[i] + "\t: " + texts[i])

TYPE 	: TEXT
MUNDANE	: the extens literatur address to the definit or character of scienc is fill with inconsist point of view and demonstr that an adequ definit is not easi to attain part of the difficulti aris from the fact that the mean of scienc is not fix but is dynam as scienc has evolv so has it mean it take on a new mean and signific with success age
VACUOUS	: even if you don't have all the thing you want be grate for the thing you don't have that you don't want
VACUOUS	: you have to find the courag to live as you need to there will alway be those who want you to be ordinari those who expect you to settl down your bodi can settl but you have to let your mind soar you have to hold onto the courag of your artist convict
VACUOUS	: a mother give you a life a mother-in-law give you her life
MUNDANE	: jame shaw jr the man who wrest a rifl away from the waffl hous gunman got emot on tuesday when he receiv a stand ovat from the tennesse general assembl which offici recogn his heroism


## Lemmatizing (optional)

In [7]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
wnl = WordNetLemmatizer()

quotes = []
for text in texts:
    quote = []
    for word, tag in pos_tag(word_tokenize(text)):
        wntag = tag[0].lower()
        if wntag == 'j':
            wntag = 'a'
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        if not wntag:
            lemma = word
        else:
            lemma = wnl.lemmatize(word, wntag)
        quote.append(lemma)
    quotes.append(' '.join(quote))    
texts = quotes

In [8]:
print("TYPE \t: TEXT")
for i in range(5):
    print(types[i] + "\t: " + texts[i])

TYPE 	: TEXT
MUNDANE	: the extens literatur address to the definit or character of scienc be fill with inconsist point of view and demonstr that an adequ definit be not easi to attain part of the difficulti aris from the fact that the mean of scienc be not fix but be dynam as scienc have evolv so have it mean it take on a new mean and signific with success age
VACUOUS	: even if you do n't have all the thing you want be grate for the thing you do n't have that you do n't want
VACUOUS	: you have to find the courag to live as you need to there will alway be those who want you to be ordinari those who expect you to settl down your bodi can settl but you have to let your mind soar you have to hold onto the courag of your artist convict
VACUOUS	: a mother give you a life a mother-in-law give you her life
MUNDANE	: jame shaw jr the man who wrest a rifl away from the waffl hous gunman get emot on tuesday when he receiv a stand ovat from the tennesse general assembl which offici recogn his hero

## Inspect the data
can be skipped except ploting t-SNE in the end

In [30]:
from collections import Counter
import numpy as np

mundane_counts = Counter()
vacuous_counts = Counter()
total_counts = Counter()

for i in range(len(texts)):
    if(types[i] == 'MUNDANE'):
        for word in texts[i].split(" "):
            mundane_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in texts[i].split(" "):
            vacuous_counts[word] += 1
            total_counts[word] += 1

In [20]:
mundane_counts.most_common() #non-pseudo-profound

[('the', 27220),
 ('of', 15302),
 ('to', 11660),
 ('and', 11343),
 ('a', 11322),
 ('in', 10615),
 ('is', 6906),
 ('that', 5267),
 ('it', 4133),
 ('on', 3742),
 ('for', 3549),
 ('as', 3095),
 ('are', 2837),
 ('with', 2534),
 ('be', 2530),
 ('was', 2448),
 ('at', 2432),
 ('we', 2314),
 ('i', 2250),
 ('not', 2213),
 ('this', 2182),
 ('by', 2116),
 ('have', 2057),
 ('you', 2057),
 ('from', 2045),
 ('but', 1905),
 ('an', 1739),
 ('has', 1635),
 ('they', 1596),
 ('or', 1582),
 ('science', 1499),
 ('he', 1468),
 ('all', 1462),
 ('one', 1446),
 ('his', 1429),
 ('which', 1388),
 ('their', 1322),
 ('people', 1307),
 ('will', 1278),
 ('who', 1244),
 ('more', 1220),
 ('our', 1181),
 ('what', 1128),
 ('there', 1099),
 ('can', 1096),
 ('if', 1092),
 ('no', 1036),
 ('were', 1016),
 ('when', 1013),
 ('than', 1012),
 ('world', 973),
 ('its', 952),
 ('us', 946),
 ('new', 908),
 ('been', 908),
 ('so', 886),
 ('about', 883),
 ('after', 838),
 ('”', 837),
 ('time', 798),
 ('first', 775),
 ('into', 766),
 (

In [21]:
vacuous_counts.most_common() #pseudo-profound

[('the', 11901),
 ('to', 8982),
 ('you', 8666),
 ('and', 7176),
 ('of', 6057),
 ('is', 5657),
 ('a', 5507),
 ('in', 4129),
 ('that', 3758),
 ('it', 3708),
 ('your', 3607),
 ('i', 3507),
 ('be', 2779),
 ('are', 2576),
 ('we', 2359),
 ('not', 2339),
 ('for', 2271),
 ('have', 1895),
 ('life', 1862),
 ('what', 1727),
 ('with', 1714),
 ('but', 1694),
 ('will', 1551),
 ('can', 1523),
 ('if', 1495),
 ('all', 1444),
 ('as', 1314),
 ('when', 1288),
 ('do', 1279),
 ('on', 1261),
 ('or', 1238),
 ('my', 1191),
 ('love', 1145),
 ('they', 1126),
 ('who', 1057),
 ('our', 1041),
 ('one', 1021),
 ('no', 985),
 ("don't", 969),
 ('there', 945),
 ('me', 937),
 ('this', 934),
 ('people', 897),
 ('never', 868),
 ('from', 805),
 ('world', 801),
 ('so', 791),
 ("it's", 786),
 ('by', 781),
 ('only', 779),
 ('at', 774),
 ('yourself', 771),
 ('like', 749),
 ('things', 732),
 ('make', 721),
 ('us', 719),
 ('them', 682),
 ('about', 668),
 ('know', 667),
 ('more', 667),
 ('how', 659),
 ('just', 655),
 ('because', 6

## Printing the most 2k common words in pseudo-profound (vacuous) inputs

In [32]:
vacuous  = vacuous_counts.most_common(2000)
vacuous_pos = pos_tag([v[0] for v in vacuous])
g = open('vacuous_wordlist.txt','w')

for i in range (0,2000):
    g.write(vacuous[i][0] + '\t' +  vacuous_pos[i][1] + '\t' + str(vacuous[i][1]) + '\n')
g.close()

## Printing the most 2k common words in non-pseudo-profound (mundane) inputs

In [33]:
mundane  = mundane_counts.most_common(2000)
mundane_pos= pos_tag([m[0] for m in mundane])
f = open('mundane_wordlist.txt','w')

for i in range (0,2000):
    f.write(mundane[i][0] + '\t' +  mundane_pos[i][1] + '\t' + str(mundane[i][1]) + '\n')
f.close()

## Inspect the ratios of mundane to vacuous of a certain word
(Needs to be run for later t-SNE plotting)

In [31]:
mundane_vacuous_ratios = Counter()
mundane_vacuous_raw_ratios = Counter()
for word, cnt in list(total_counts.most_common()):
    if(cnt > 10):
        mundane_vacuous_ratio = mundane_counts[word] / float(vacuous_counts[word]+1)
        mundane_vacuous_ratios[word] = mundane_vacuous_ratio
        
mundane_vacuous_raw_ratios = mundane_vacuous_ratios #saving the raw ratios before taking logarithms

In [23]:
print("MUN / QUO raw ratio for 'possible' = {}".format(mundane_vacuous_ratios["possible"]))
print("MUN / QUO raw ratio for 'believe' = {}".format(mundane_vacuous_ratios["believe"]))

MUN / QUO raw ratio for 'possible' = 2.2739726027397262
MUN / QUO raw ratio for 'believe' = 0.8390804597701149


In [36]:
for word, ratio in mundane_vacuous_ratios.most_common():
    mundane_vacuous_ratios[word] = np.log(ratio + 0.01)

In [25]:
print("MUN / QUO log ratio for 'possible' = {}".format(mundane_vacuous_ratios["possible"]))
print("MUN / QUO log ratio for 'believe' = {}".format(mundane_vacuous_ratios["believe"]))

MUN / QUO log ratio for 'possible' = 0.8259162964239763
MUN / QUO log ratio for 'believe' = -0.16360132711393735


In [26]:
mundane_vacuous_ratios.most_common()

[('president', 6.385211262261948),
 ('trump', 5.945446786274408),
 ('scientific', 5.921605228935918),
 ('police', 5.488979047616993),
 ('thursday', 5.39367299986479),
 ('science', 5.366690045755508),
 ('donald', 5.164843115148066),
 ('florida', 5.043489630967203),
 ('political', 5.004013417787369),
 ('obama', 4.997279839049098),
 ('california', 4.962914557884844),
 ('festival', 4.962914557884844),
 ('park', 4.87527365616558),
 ('members', 4.8041030085872425),
 ('protesters', 4.78757507264335),
 ('military', 4.770769366637658),
 ('residents', 4.753676394287315),
 ('national', 4.72304210658298),
 ('china', 4.709620287344555),
 ('politics', 4.654055583717902),
 ('minister', 4.654055583717902),
 ('korea', 4.634826070895602),
 ('ceremony', 4.615219521841093),
 ('york', 4.610257225036649),
 ('korean', 4.605270180988425),
 ('england', 4.574814065973192),
 ('wednesday', 4.564452352709533),
 ('british', 4.564452352709533),
 ('india', 4.553982149218658),
 ('former', 4.532707014254376),
 ('offici

In [27]:
list(reversed(mundane_vacuous_ratios.most_common()))

[('stumble', -4.605170185988091),
 ('wishing', -4.605170185988091),
 ('darkest', -4.605170185988091),
 ('adversity', -2.962942450731),
 ('wanna', -2.962942450731),
 ('muse', -2.7704856720430024),
 ('flaws', -2.7704856720430024),
 ('blank', -2.724857319418591),
 ('rainbows', -2.676209595246551),
 ('yours', -2.676209595246551),
 ('regrets', -2.624168717121508),
 ('dont', -2.624168717121508),
 ('sorrow', -2.538744012420326),
 ('shine', -2.5204608533733968),
 ('mindful', -2.5080290672088545),
 ('authentic', -2.5080290672088545),
 ('dig', -2.4427317247372877),
 ('adventures', -2.4427317247372877),
 ('tranquility', -2.4427317247372877),
 ('empower', -2.371577964480997),
 ('roar', -2.371577964480997),
 ('crawl', -2.371577964480997),
 ('unconditional', -2.371577964480997),
 ('hurts', -2.371577964480997),
 ('forgive', -2.371577964480997),
 ("life's", -2.3412542341531997),
 ('yourself', -2.321942223683654),
 ('brilliance', -2.2935352574741277),
 ('aspire', -2.2935352574741277),
 ('worthwhile', -

## Printing the most 2000 polarized mundane and vacuous words

In [35]:
m_v_ratio  = mundane_vacuous_ratios.most_common(2000)
r_m_v_ratio = list(reversed(mundane_vacuous_ratios.most_common()))
m_v_ratio_pos= pos_tag([m[0] for m in m_v_ratio])
r_m_v_ratio_pos= pos_tag([m[0] for m in r_m_v_ratio])
f = open('mundane_vacuous_ratios.txt','w')
for i in range (0,2000):
    f.write(m_v_ratio[i][0] + '\t' +  m_v_ratio_pos[i][1] + '\t' + str(m_v_ratio[i][1]) + '\n')
f.close()
g = open('revserse_mundane_vacuous_ratios.txt','w')
for i in range (0,2000):
    g.write(r_m_v_ratio[i][0] + '\t' +  r_m_v_ratio_pos[i][1] + '\t' + str(r_m_v_ratio[i][1]) + '\n')
g.close()

## The MLP Initialization
The eval method returns chance of being pseudo-profound according to the trained model with the dataset loaded earlier on

In [5]:
import time
import sys
import numpy as np
from collections import Counter

class TextClassificationNetwork:
    
    def __init__(self, texts,types,min_count = 10, polarity_cutoff = 0.1, hidden_nodes = 10, learning_rate = 0.1):

        # Assign a seed to our random number generator to ensure we get
        # reproducable results during development 
        np.random.seed(1)

        # process the texts and their associated types 
        self.pre_process_data(texts, types, polarity_cutoff, min_count)
        
        # Build the network with the number of hidden nodes and the learning rate
        # Make the same number of input nodes as the size of vocabulary
        self.init_network(len(self.text_vocab),hidden_nodes, 1, learning_rate)

    def pre_process_data(self, texts, types, polarity_cutoff, min_count):
        
        mundane_counts = Counter()
        vacuous_counts = Counter()
        total_counts = Counter()

        for i in range(len(texts)):
            if(types[i] == 'MUNDANE'):
                for word in texts[i].split(" "):
                    mundane_counts[word] += 1
                    total_counts[word] += 1
            else:
                for word in texts[i].split(" "):
                    vacuous_counts[word] += 1
                    total_counts[word] += 1

        mundane_vacuous_ratios = Counter()

        for word, cnt in list(total_counts.most_common()):
            if(cnt >= 5): #threshold of frequency in the data
                mundane_vacuous_ratio = mundane_counts[word] / float(vacuous_counts[word] + 1)
                mundane_vacuous_ratios[word] = mundane_vacuous_ratio

        for word, ratio in mundane_vacuous_ratios.most_common():
            if(ratio > 1):
                mundane_vacuous_ratios[word] = np.log(ratio)
            else:
                mundane_vacuous_ratios[word] = -np.log((1 / (ratio + 0.01)))
                
        # populate text_vocab
        text_vocab = set()
        for text in texts:
            for word in text.split(" "):
                if(total_counts[word] > min_count):
                    if(word in mundane_vacuous_ratios.keys()):
                        if((mundane_vacuous_ratios[word] >= polarity_cutoff) or (mundane_vacuous_ratios[word] <= -polarity_cutoff)):
                            text_vocab.add(word)
                    else:
                        text_vocab.add(word)

        # Convert the vocabulary set to a list so we can access words via indices
        self.text_vocab = list(text_vocab)
        
        # populate type_vocab
        type_vocab = set()
        for type in types:
            type_vocab.add(type)
        
        # Convert the type vocabulary set to a list so we can access types via indices
        self.type_vocab = list(type_vocab)
        
        # Store the sizes of the texts and types vocabularies
        self.text_vocab_size = len(self.text_vocab)
        self.type_vocab_size = len(self.type_vocab)
        
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        for i, word in enumerate(self.text_vocab):
            self.word2index[word] = i
        
        # Create a dictionary of types mapped to index positions
        self.type2index = {}
        for i, type in enumerate(self.type_vocab):
            self.type2index[type] = i

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights between the input layer and the hidden layer.

        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))

        # These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        self.layer_1 = np.zeros((1,hidden_nodes))
    
    def get_target_for_type(self,type):
        if(type == 'MUNDANE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    def train(self, training_texts_raw, training_types):

        training_texts = list()
        for text in training_texts_raw:
            indices = set()
            for word in text.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_texts.append(list(indices))

        assert(len(training_texts) == len(training_types))
        
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()
        
        # loop through all the given texts and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_texts)):
            
            # Get the next text and its correct type
            text = training_texts[i]
            type = training_types[i]
            
            self.layer_1 *= 0
            for index in text:
                self.layer_1 += self.weights_0_1[index]

            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))            
            
            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_type(type) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            
            # Only update the weights that were used in the forward pass
            for index in text:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and type == 'MUNDANE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and type == 'VACUOUS'):
                correct_so_far += 1
            
            # Print out our prediction accuracy and speed throughout the training process. 
            elapsed_time = float(time.time() - start)
            texts_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_texts)))[:4] \
                             + "% Speed(texts/sec):" + str(texts_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_texts, testing_types):

        correct = 0
        true_m = 0
        false_m = 0
        true_v = 0
        false_v = 0
        # Time how many predictions per second we make
        start = time.time()

        for i in range(len(testing_texts)):
            pred = self.run(testing_texts[i])
            if(pred == testing_types[i]):
                correct += 1
                if pred == 'MUNDANE':
                    true_m += 1
                else:
                    true_v += 1
            else:
                if pred == 'MUNDANE':
                    false_m += 1
                else:
                    false_v += 1
                
            # Print out our prediction accuracy and speed throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            texts_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_texts)))[:4] \
                             + "% Speed(texts/sec):" + str(texts_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%" \
                             + "\tTM:" + str(true_m) + "\tPM:" + str(false_m) + "\tTV:" + str(true_v) + "\tPV:" + str(false_v))     
            
    def run(self, text):

        self.layer_1 *= 0
        unique_indices = set()
        for word in text.split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        
        # Output layer
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
 
        if(layer_2[0] >= 0.5):
            return "MUNDANE"
        else:
            return "VACUOUS"
        
                
    def eval(self, eval_texts):
        pseudo_profundity = 0
        # Time how many predictions per second we make
        start = time.time()

        for i in range(len(eval_texts)):
            pred = self.run(eval_texts[i])
            if(pred != 'MUNDANE'):
                pseudo_profundity += 1
                
            # Print out our prediction accuracy and speeed throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            texts_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(eval_texts)))[:4] \
                             + "% Speed(texts/sec):" + str(texts_per_second)[0:5] \
                             + " Pseudo-profundity:" + str(pseudo_profundity * 100 / float(i+1))[:4] + "%")

## To-do: cross-validate each fold.

In [40]:
mlp = TextClassificationNetwork(texts[1850:], types[1850:], min_count=5, polarity_cutoff=0.1, learning_rate=0.01)
mlp.train(texts[1850:],types[1850:])

Progress:0.0% Speed(texts/sec):0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:15.0% Speed(texts/sec):2909. #Correct:2009 #Trained:2501 Training Accuracy:80.3%
Progress:30.0% Speed(texts/sec):2807. #Correct:4107 #Trained:5001 Training Accuracy:82.1%
Progress:45.0% Speed(texts/sec):2857. #Correct:6252 #Trained:7501 Training Accuracy:83.3%
Progress:60.0% Speed(texts/sec):2807. #Correct:8436 #Trained:10001 Training Accuracy:84.3%
Progress:75.0% Speed(texts/sec):2484. #Correct:10603 #Trained:12501 Training Accuracy:84.8%
Progress:90.0% Speed(texts/sec):2608. #Correct:12780 #Trained:15001 Training Accuracy:85.1%
Progress:99.9% Speed(texts/sec):2501. #Correct:14214 #Trained:16650 Training Accuracy:85.3%

In [41]:
mlp.test(texts[:1850],types[:1850])

Progress:0.0% Speed(texts/sec):0 #Correct:1 #Tested:1 Testing Accuracy:100.%	TM:1	PM:0	TV:0	PV:0Progress:0.05% Speed(texts/sec):0 #Correct:2 #Tested:2 Testing Accuracy:100.%	TM:1	PM:0	TV:1	PV:0Progress:0.10% Speed(texts/sec):0 #Correct:3 #Tested:3 Testing Accuracy:100.%	TM:1	PM:0	TV:2	PV:0Progress:0.16% Speed(texts/sec):0 #Correct:4 #Tested:4 Testing Accuracy:100.%	TM:1	PM:0	TV:3	PV:0Progress:0.21% Speed(texts/sec):0 #Correct:5 #Tested:5 Testing Accuracy:100.%	TM:2	PM:0	TV:3	PV:0Progress:0.27% Speed(texts/sec):0 #Correct:6 #Tested:6 Testing Accuracy:100.%	TM:2	PM:0	TV:4	PV:0Progress:0.32% Speed(texts/sec):0 #Correct:7 #Tested:7 Testing Accuracy:100.%	TM:3	PM:0	TV:4	PV:0Progress:0.37% Speed(texts/sec):0 #Correct:8 #Tested:8 Testing Accuracy:100.%	TM:4	PM:0	TV:4	PV:0Progress:0.43% Speed(texts/sec):0 #Correct:9 #Tested:9 Testing Accuracy:100.%	TM:5	PM:0	TV:4	PV:0Progress:0.48% Speed(texts/sec):0 #Correct:10 #Tested:10 Testing Accuracy:100.%	TM:5	PM:0	TV:5	PV:0Progress:0.54% Spe

Progress:67.4% Speed(texts/sec):6144. #Correct:1114 #Tested:1249 Testing Accuracy:89.1%	TM:596	PM:83	TV:518	PV:52Progress:67.5% Speed(texts/sec):6149. #Correct:1115 #Tested:1250 Testing Accuracy:89.2%	TM:597	PM:83	TV:518	PV:52Progress:67.5% Speed(texts/sec):6154. #Correct:1116 #Tested:1251 Testing Accuracy:89.2%	TM:598	PM:83	TV:518	PV:52Progress:67.6% Speed(texts/sec):6159. #Correct:1117 #Tested:1252 Testing Accuracy:89.2%	TM:598	PM:83	TV:519	PV:52Progress:67.6% Speed(texts/sec):6163. #Correct:1118 #Tested:1253 Testing Accuracy:89.2%	TM:599	PM:83	TV:519	PV:52Progress:67.7% Speed(texts/sec):6168. #Correct:1119 #Tested:1254 Testing Accuracy:89.2%	TM:600	PM:83	TV:519	PV:52Progress:67.7% Speed(texts/sec):6173. #Correct:1120 #Tested:1255 Testing Accuracy:89.2%	TM:600	PM:83	TV:520	PV:52Progress:67.8% Speed(texts/sec):6178. #Correct:1121 #Tested:1256 Testing Accuracy:89.2%	TM:601	PM:83	TV:520	PV:52Progress:67.8% Speed(texts/sec):6183. #Correct:1122 #Tested:1257 Testing Accuracy:89.2%

## mlpfull trains the complete dataset for eval method and word similarity plotting.

In [6]:
mlpfull = TextClassificationNetwork(texts, types, min_count=5, polarity_cutoff=0.1, learning_rate=0.01)
mlpfull.train(texts,types)

Progress:0.0% Speed(texts/sec):0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:13.5% Speed(texts/sec):3265. #Correct:2017 #Trained:2501 Training Accuracy:80.6%
Progress:27.0% Speed(texts/sec):3047. #Correct:4125 #Trained:5001 Training Accuracy:82.4%
Progress:40.5% Speed(texts/sec):3137. #Correct:6260 #Trained:7501 Training Accuracy:83.4%
Progress:54.0% Speed(texts/sec):3062. #Correct:8411 #Trained:10001 Training Accuracy:84.1%
Progress:67.5% Speed(texts/sec):2847. #Correct:10601 #Trained:12501 Training Accuracy:84.8%
Progress:81.0% Speed(texts/sec):2630. #Correct:12792 #Trained:15001 Training Accuracy:85.2%
Progress:94.5% Speed(texts/sec):2641. #Correct:14969 #Trained:17501 Training Accuracy:85.5%
Progress:99.9% Speed(texts/sec):2672. #Correct:15843 #Trained:18500 Training Accuracy:85.6%

In [35]:
g = open('to_eval.txt','r')
detects = list(map(lambda x:x[:-1].lower(),g.readlines()))
g.close()
#Stemming (optional)
'''
quotes = []
for detect in detects:
    tokens = tknzr.tokenize(detect)
    stems = []
    for token in tokens:
        stems.append(snowball.stem(token))
    tokens = stems
    quotes.append(' '.join([token for token in tokens if token not in punctuation]))
detects = quotes
'''

## Dectect chance of being pseudo-profound from to_eval.txt

In [23]:
mlpfull.eval(detects)

Progress:0.0% Speed(texts/sec):0 Pseudo-profundity:0.0%Progress:14.2% Speed(texts/sec):0 Pseudo-profundity:50.0%Progress:28.5% Speed(texts/sec):0 Pseudo-profundity:33.3%Progress:42.8% Speed(texts/sec):0 Pseudo-profundity:50.0%Progress:57.1% Speed(texts/sec):0 Pseudo-profundity:60.0%Progress:71.4% Speed(texts/sec):0 Pseudo-profundity:66.6%Progress:85.7% Speed(texts/sec):0 Pseudo-profundity:71.4%

## Get the most similar words to a given word

In [6]:
def get_most_similar_words(focus):
    most_similar = Counter()

    for word in mlpfull.word2index.keys():
        most_similar[word] = np.dot(mlpfull.weights_0_1[mlpfull.word2index[word]],mlpfull.weights_0_1[mlpfull.word2index[focus]])
    
    return most_similar.most_common()

In [7]:
get_most_similar_words("society")

[('science', 0.08063560221902141),
 ('scientific', 0.033385064638252145),
 ('technology', 0.024457570442375925),
 ('history', 0.023313607601492946),
 ('president', 0.023032889531040553),
 ('”', 0.02277717053487963),
 ('scientists', 0.022227519466824875),
 ('universe', 0.021947247292855066),
 ('scientist', 0.021028346073836888),
 ('during', 0.01908708106180083),
 ('said', 0.018647157425357413),
 ('years', 0.018064521224059438),
 ('was', 0.01746151015318839),
 ('human', 0.016801524322413816),
 ('says', 0.016787687481069114),
 ('very', 0.01610091066971842),
 ('—', 0.015866576291924527),
 ('over', 0.015696268152969237),
 ('its', 0.015401185570799164),
 ('politics', 0.015314018384244017),
 ('which', 0.015133755195603739),
 ('knowledge', 0.015115885935917998),
 ('system', 0.014969168975297513),
 ('question', 0.014737821457209888),
 ('of', 0.014700724309389193),
 ('these', 0.0146807564208591),
 ('evolution', 0.014677869382017863),
 ('from', 0.014608837775710337),
 ('after', 0.0143473746759528

In [8]:
get_most_similar_words("lost")

[('love', 0.01262683587608561),
 ('soul', 0.010198816931999324),
 ('yourself', 0.009505424248856234),
 ('your', 0.009311224765382424),
 ('life', 0.00923251682972654),
 ('heart', 0.00922867703426784),
 ('you', 0.008554493011186846),
 ('beautiful', 0.007548746673329846),
 ('dreams', 0.007162872360166512),
 ('inspiration', 0.006738796785294829),
 ('happiness', 0.006617455659450741),
 ('beauty', 0.006433797130850147),
 ('my', 0.006395525761033005),
 ('god', 0.0062381245078870235),
 ('never', 0.005889087014281816),
 ('success', 0.005664328486856486),
 ('dream', 0.005567174730338476),
 ('me', 0.005555345270125952),
 ('joy', 0.005342789718950904),
 ('others', 0.005023460785727064),
 ('smile', 0.004943884019311361),
 ('world', 0.004937075730775347),
 ('let', 0.004755243654331209),
 ('each', 0.004595928555852253),
 ('light', 0.004525819430861672),
 ('strength', 0.0043841922758023075),
 ('own', 0.004379860368972596),
 ('lives', 0.004297625544225322),
 ('courage', 0.004261106177540675),
 ('take',

## t-SNE Visualization for the 300 most polarized words

In [37]:
import matplotlib.colors as colors

words_to_visualize = list()
for word, ratio in mundane_vacuous_ratios.most_common(300):
    if(word in mlpfull.word2index.keys()):
        words_to_visualize.append(word)
    
for word, ratio in list(reversed(mundane_vacuous_ratios.most_common()))[0:300]:
    if(word in mlpfull.word2index.keys()):
        words_to_visualize.append(word)

In [38]:
mundane = 0
vacuous = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in mundane_vacuous_ratios.keys():
        vectors_list.append(mlpfull.weights_0_1[mlpfull.word2index[word]])
        if(mundane_vacuous_ratios[word] > 0):
            mundane+=1
            colors_list.append("#a8b821")
        else:
            vacuous+=1
            colors_list.append("#e16666") 

In [40]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

In [41]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    color=colors_list))

p.scatter(x="x1", y="x2", size=5, source=source, fill_color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="10pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)
show(p)