# Running SOM on Text: Topic Detection, Sentiment Analysis

## TF-IDF model

To work with text, it first has to be represented numerically, as described by (Sharma and Dey, 2013), (Simoes, 2014) and the authors of the Java SOMToolbox at TU Wien [http://www.ifs.tuwien.ac.at/dm/somtoolbox/] (see ifs' [Section on Text Representation](http://www.ifs.tuwien.ac.at/~andi/somlib/textrepresentation.html)). A common choice is a Vector Space Model (VSM), built either with a simple bag-of-words approach or with a more sophisticated term frequency - inverse document frequency (TF-IDF) model.

### Method 1

In [6]:
from nltk import WordPunctTokenizer
from nltk import PorterStemmer
from nltk.corpus import stopwords
import gensim

In [7]:
def cleanDoc(doc):
    """Tokenize *doc*, filter out noise tokens and return the stemmed rest.

    The text is split with NLTK's ``WordPunctTokenizer``. A token is dropped
    when it is at most two characters long or when its lower-cased form is an
    English stop word; every surviving token is lower-cased and reduced to its
    Porter stem.

    Args:
        doc: the raw document text.

    Returns:
        A list of stemmed, lower-cased tokens in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()
    stems = []
    for raw_token in WordPunctTokenizer().tokenize(doc):
        lowered = raw_token.lower()
        # Guard clause: skip short tokens and stop words.
        if len(raw_token) <= 2 or lowered in english_stopwords:
            continue
        stems.append(porter.stem(lowered))
    return stems

In [9]:
# Build a gensim Dictionary (token -> integer id) from the raw tweet file,
# one lower-cased, whitespace-split document per line.
# The with-block closes the file handle once the dictionary has consumed it;
# the name `tweets_all` stays bound (to the closed file) as before.
with open('Twitter_MINING/SOM-text/tweets_11_2017_all.txt') as tweets_all:
    dictionary = gensim.corpora.Dictionary(line.lower().split() for line in tweets_all)
print(dictionary)

Dictionary(31046 unique tokens: ['#bim', '#bimscotland', '#cad', '#math', '/']...)


In [11]:
# Rebuild the Dictionary from the pre-cleaned tweet file.
# The with-block closes the file handle once the dictionary has consumed it;
# the name `tweets_clean` stays bound (to the closed file) as before.
with open('Twitter_MINING/SOM-text/tweets_11_2017_termsonly.txt') as tweets_clean: # removed links, @mentions and stopwords
    dictionary = gensim.corpora.Dictionary(line.lower().split() for line in tweets_clean)
print(dictionary)

Dictionary(14351 unique tokens: ['##cpl17', '#10', '#11bienaldearquitetura', '#121seaport', '#123ddesign']...)


In [12]:
class MyCorpus(object):
    """Streaming corpus that converts the tweet file one line at a time.

    Iterating re-opens the file each time, so the corpus can be traversed
    repeatedly without keeping all documents in memory. Each line is
    lower-cased, split on whitespace and converted with the module-level
    ``dictionary`` via ``doc2bow``.
    """

    def __iter__(self):
        """Yield ``dictionary.doc2bow(...)`` for every line of the tweet file."""
        # NOTE(review): this reads the "_all" file, while `dictionary` was last
        # built from the "_termsonly" file -- confirm that this is intended.
        # The with-block guarantees the handle is closed when iteration ends
        # or the generator is discarded.
        with open('Twitter_MINING/SOM-text/tweets_11_2017_all.txt') as tweets:
            for line in tweets:
                yield dictionary.doc2bow(line.lower().split())

In [15]:
# Stream the tweets through MyCorpus, write the result to 'corpus.mm', then
# rebind `corpus` to the copy loaded back from that file.
corpus = MyCorpus()
gensim.corpora.MmCorpus.serialize('corpus.mm', corpus) # Save corpus to disk
corpus = gensim.corpora.MmCorpus('corpus.mm') # Load corpus
print(corpus)

MmCorpus(1018 documents, 14350 features, 74433 non-zero entries)


In [17]:
# Fit a gensim TF-IDF model on the corpus loaded from 'corpus.mm'.
# NOTE: the name `tfidf` is rebound to a scikit-learn TfidfTransformer in
# Method 2 below, so this model is only available until that cell runs.
tfidf = gensim.models.TfidfModel(corpus)
print(tfidf)

TfidfModel(num_docs=1018, num_nnz=74433)


### Method 2

In [18]:
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from gensim.models import doc2vec
from collections import namedtuple

#### TF-IDF LOGISTIC REGRESSION

In [37]:
def get_tag_and_training_data(filename):
    '''Read a tagged text file and return its tags and documents as two lists.

    The first line of the file is treated as a header and skipped. For every
    following line the first character is the tag and everything from the
    third character onwards is the document text (the character at index 1 is
    dropped, i.e. treated as a one-character separator). The document text is
    returned raw: it is not tokenized and keeps its trailing newline.

    Args:
        filename: path of the input file; it is decoded as UTF-8.

    Returns:
        A ``(tags, documents)`` tuple of two equally long lists of strings.
        Both lists are empty when the file has no lines after the header.
    '''
    tags = []
    documents = []
    # Decode explicitly as UTF-8: the tweets contain emoji and other non-ASCII
    # characters, and the platform default encoding is not UTF-8 everywhere.
    with open(filename, encoding='utf-8') as f:
        # Skip the header line; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            tags.append(line[:1])
            documents.append(line[2:])
    return tags, documents

In [38]:
# Y: the first character of every non-header line (used as the class tag);
# X: the remainder of each line from the third character on (the raw text).
Y,X=get_tag_and_training_data('Twitter_MINING/SOM-text/tweets_11_2017_all.txt')

In [39]:
#75:25 training test split
Y_train,Y_test=Y[:4120],Y[4120:]
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(X)
freq_term_matrix = count_vectorizer.transform(X)
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)
tf_idf_matrix = tfidf.transform(freq_term_matrix)

In [40]:
#train logistic regression model
X_train,X_test=tf_idf_matrix[:4120],tf_idf_matrix[4120:]
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X_train,Y_train)
pred=logreg.predict(X_test)
accuracy_score(Y_test, pred)

0.6886982311195848

In [44]:
# Classify one sample tweet with the trained logistic regression model.
# The raw text goes through the already-fitted count_vectorizer and tfidf
# transformer so that it is represented with the same features as the training data.
logreg.predict(tfidf.transform((count_vectorizer.transform(["RT @Moneypenny: Massive congratulations to @AEWarchitects for their Corporate Workplace win at the #bcoawards. We absolutely ❤️ our… "]))))

array(['R'], dtype='<U1')

#### TF-IDF NAIVE BAYES

In [45]:
#initialize the Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train,Y_train)
nb_pred=clf.predict(X_test)
accuracy_score(Y_test, nb_pred)

0.5727147547929244

#### Doc2Vec Logistic Regression

In [49]:
# X (raw document strings) and Y_train / Y_test are expected to exist already.

# Wrap every document in a (words, tags) record; the single tag of a document
# is its position in X, which is also the key used to look up its vector below.
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
docs = []
for i, text in enumerate(X):
    words, tags = text.lower().split(), [i]
    docs.append(analyzedDocument(words=words, tags=tags))

# Train model (set min_count = 1, if you want the model to work with the provided example data set)
model = doc2vec.Doc2Vec(docs, vector_size=160, window=10, min_count=7, workers=4)

# Training and test sets: one learned document vector per entry of X, cut at
# the same hard-coded index (4120) as the tag lists.
wb_X = [model.docvecs[doc_id] for doc_id in range(len(X))]
wb_X_train, wb_X_test = wb_X[:4120], wb_X[4120:]
wb_Y_train, wb_Y_test = Y_train, Y_test

#### Word Embeddings Logistic Regression

In [50]:
# Logistic regression on the Doc2Vec document vectors (wb_X_train / wb_X_test)
# instead of the TF-IDF rows.
wb_logreg = linear_model.LogisticRegression(C=1e4)
wb_logreg.fit(wb_X_train,wb_Y_train)
wb_pred=wb_logreg.predict(wb_X_test)
# Last expression of the cell: the accuracy of the predictions on the test vectors.
accuracy_score(wb_Y_test, wb_pred)

0.5582035801292236

#### Word Embeddings Naive Bayes

In [47]:
# Gaussian Naive Bayes on the Doc2Vec document vectors.
# Requires wb_X_train / wb_Y_train from the Doc2Vec cell: run that cell first,
# otherwise this one fails with a NameError (as in the recorded output below).
wb_clf = GaussianNB()
wb_clf.fit(wb_X_train,wb_Y_train)
wb_nb_pred=wb_clf.predict(wb_X_test)
# Last expression of the cell: the accuracy of the predictions on the test vectors.
accuracy_score(wb_Y_test, wb_nb_pred)

NameError: name 'wb_X_train' is not defined

# Hopkins test for cluster tendency
Determines whether or not a data set contains clusters. The closer the output value is to 1, the higher the cluster tendency.
Source: https://matevzkunaver.wordpress.com/2017/06/20/hopkins-test-for-cluster-tendency/

In [None]:
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X):
    """Return the Hopkins statistic of *X* as a measure of cluster tendency.

    A random 10% of the rows is sampled. For each sampled row the distance to
    its nearest *other* row is collected (``wjd``); for an equal number of
    points drawn uniformly from the per-column [min, max] box of the data, the
    distance to the nearest row is collected (``ujd``). The result is
    ``sum(ujd) / (sum(ujd) + sum(wjd))``; values close to 1 indicate a high
    cluster tendency.

    Args:
        X: 2-D numeric data, one observation per row (a pandas DataFrame or
            anything ``np.asarray`` turns into a 2-D array).

    Returns:
        The Hopkins statistic, or 0 when it cannot be computed (no rows were
        sampled because X has fewer than 10 rows, or all distances are zero).
        The sampling is random, so repeated calls give different values.
    """
    data = np.asarray(X)
    d = data.shape[1]  # columns
    n = len(data)  # rows
    m = int(0.1 * n)  # heuristic from article [1]
    nbrs = NearestNeighbors(n_neighbors=1).fit(data)

    # Bounding box of the data; loop-invariant, so computed once.
    lower = np.amin(data, axis=0)
    upper = np.amax(data, axis=0)

    rand_X = sample(range(n), m)

    ujd = []
    wjd = []
    for j in range(m):
        # A uniform point is not part of the data set, so its nearest
        # neighbour is the first hit.
        u_point = uniform(lower, upper, d).reshape(1, -1)
        u_dist, _ = nbrs.kneighbors(u_point, 1, return_distance=True)
        ujd.append(u_dist[0][0])
        # A sampled row finds itself at distance 0 first, so the nearest
        # *other* row is the second hit.
        w_point = data[rand_X[j]].reshape(1, -1)
        w_dist, _ = nbrs.kneighbors(w_point, 2, return_distance=True)
        wjd.append(w_dist[0][1])

    denominator = sum(ujd) + sum(wjd)
    # Covers both the empty sample (m == 0) and degenerate all-zero distances,
    # which would otherwise raise ZeroDivisionError or yield NaN.
    if denominator == 0 or isnan(denominator):
        print(ujd, wjd)
        return 0

    return sum(ujd) / denominator