In [1]:
# general
# __future__ imports come first, ahead of every other import.
from __future__ import division
from __future__ import print_function
import os
import pandas as pd
import numpy as np

# nlp
import gensim
import string
import re    #for regex
import nltk
from nltk.corpus import stopwords
import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer   

#feature engineering & clustering
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import AffinityPropagation

In [2]:
# Load the corpus: `sents` keeps the raw lines of the file, `text` holds the
# same lines with surrounding whitespace (including the newline) stripped.
with open('StackOverflow.txt') as f:
    sents = list(f)
text = [line.strip() for line in sents]

In [3]:
# Wrap the stripped lines in a single-column frame named 'text'.
data = pd.DataFrame(dict(text=text))

In [4]:
# Working copy of the raw titles; `data` itself stays untouched so later
# cells can still read the original text from it.
df = data[['text']].copy()

In [5]:
# Preview the first five rows of the working frame.
df.head()

Unnamed: 0,text
0,How do I fill a DataSet or a DataTable from a ...
1,How do you page a collection with LINQ?
2,Best Subversion clients for Windows Vista (64bit)
3,"Best Practice: Collaborative Environment, Bin ..."
4,Visual Studio Setup Project - Per User Registr...


In [6]:
# Show the first stripped line of the corpus in full.
text[0]

'How do I fill a DataSet or a DataTable from a LINQ query resultset ?'

In [7]:
# Same title read back from the working frame (row label 0, column 'text').
df.loc[0, 'text']

'How do I fill a DataSet or a DataTable from a LINQ query resultset ?'

In [8]:
# Preprocessing
#
# Overall plan for this notebook:
#   1. lower-case
#   2. tokenize: bag-of-words, n-grams
#   3. stemming / lemmatization
#   4. remove stop words and punctuation
#   5. generate features in a vector space model, e.g. tf-idf,
#      co-occurrence vectors, word2vec, sent2vec, doc2vec
#   6. clustering
#   7. auto-tagging (topic extraction) based on cluster centroids
#   8. evaluation

wnl = WordNetLemmatizer()
tokenizer = TweetTokenizer()
# Build the stop-word set once. Calling stopwords.words('english') inside the
# filter would re-evaluate it for every token, and a set makes each
# membership test a hash lookup instead of a list scan.
english_stopwords = frozenset(stopwords.words('english'))


def preprocess_title(title):
    """Turn one raw title into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. replace every run of non-letter characters with a single space
         (this removes punctuation and digits),
      2. lower-case,
      3. tokenize,
      4. drop English stop words,
      5. lemmatize each remaining token.

    The lemmatizer is applied per token: it looks up a single word, so
    passing the whole sentence string to it would lemmatize nothing useful.
    Stop words are removed before lemmatizing so that the stop-word
    comparison sees the original word forms.

    Parameters
    ----------
    title : str
        One raw line of text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    letters_only = re.sub('[^a-zA-Z]+', ' ', title).lower()
    tokens = tokenizer.tokenize(letters_only)
    return [wnl.lemmatize(tok) for tok in tokens if tok not in english_stopwords]


# Always start from the untouched `data['text']` so re-running this cell
# gives the same result.
df['text'] = data['text'].apply(preprocess_title)

In [9]:
# --- tf-idf features ---
# Vectorize the raw titles in `data['text']`; the vectorizer is configured
# to drop English stop words itself.
tfidf = TfidfVectorizer(stop_words='english')
X_text = tfidf.fit_transform(data['text'])

# --- dimensionality reduction ---
# Project the tf-idf matrix onto 20 SVD components (so, despite its name,
# X_2d has 20 columns).
svd = TruncatedSVD(random_state=0, n_components=20)
X_2d = svd.fit_transform(X_text)

# --- k-means clustering ---
kmeans = KMeans(random_state=0, n_clusters=20)

# Fit on the reduced features and keep the cluster label of each title.
X_clustered = kmeans.fit(X_2d).labels_

In [20]:
# Inspect the SVD-reduced feature matrix produced by the clustering cell.
X_2d

array([[ 0.11672334, -0.10728258,  0.24545832, ..., -0.04339518,
         0.01154854,  0.00940378],
       [ 0.19697021, -0.15010464,  0.22516146, ..., -0.0751901 ,
        -0.04147058,  0.07976761],
       [ 0.04918095,  0.01190245, -0.01406529, ..., -0.03167848,
        -0.0547745 , -0.01532782],
       ...,
       [ 0.11837154, -0.02279527, -0.13462921, ...,  0.01994234,
        -0.01191106, -0.01859824],
       [ 0.160087  , -0.06246806, -0.14404917, ...,  0.19670384,
         0.01752477, -0.02796627],
       [ 0.10745115, -0.04671286, -0.07798497, ..., -0.00615473,
        -0.0086188 , -0.02439176]])

In [10]:
# display by groups
# One row per title: the integer cluster label in 'Cluster', followed by the
# reduced feature columns. The labels start out as the frame's index and are
# moved into a regular column.
df_plot = (
    pd.DataFrame(list(X_2d), index=list(X_clustered))
    .reset_index()
    .rename(columns={'index': 'Cluster'})
    .assign(Cluster=lambda frame: frame['Cluster'].astype(int))
)

print(df_plot.head())

# Number of titles assigned to each cluster.
print(df_plot.groupby('Cluster').agg({'Cluster': 'count'}))


   Cluster         0         1         2         3         4         5  \
0        5  0.116723 -0.107283  0.245458  0.126023 -0.041596 -0.006512   
1        5  0.196970 -0.150105  0.225161  0.262281 -0.041789 -0.020312   
2        6  0.049181  0.011902 -0.014065 -0.022295 -0.013936 -0.000844   
3       15  0.069343  0.012630 -0.032027 -0.038808 -0.030718 -0.008885   
4        3  0.159891  0.351006  0.058518  0.036197  0.037413 -0.002895   

          6         7         8    ...           10        11        12  \
0 -0.024742 -0.021847 -0.005373    ...     0.018872  0.018562 -0.000747   
1 -0.015387 -0.036667 -0.010797    ...     0.036786  0.036473 -0.028929   
2 -0.022409  0.020671 -0.016214    ...    -0.018265  0.054042  0.031884   
3 -0.071513  0.018801 -0.026893    ...    -0.036176  0.138263  0.144324   
4  0.010041 -0.008355 -0.001647    ...     0.023871 -0.016851 -0.000872   

         13        14        15        16        17        18        19  
0 -0.008361  0.017209 -0.05113

In [11]:
# TODO:
#   - Represent sentences in vector space using a word2vec model.
#   - Try different distance measures.

# 1. load word embeddings
# The location of the embedding file can be overridden through the
# WORD2VEC_PATH environment variable, so the notebook also runs outside the
# machine it was written on; the default is the original path.
WORD2VEC_PATH = os.environ.get(
    'WORD2VEC_PATH',
    '/work/courses/unix/T/ELEC/E5550/data/embeddings/GoogleNews-vectors-negative300.bin.gz',
)
model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [22]:
# 2. sent2vec/doc2vec
from collections import Counter

from tqdm import tqdm_notebook


def sent2vector(sent):
    """Embed one tokenized sentence as a weighted average of word vectors.

    Every token ``w`` that is present in ``model`` contributes its vector
    with weight ``1 / cnt[w]``; tokens missing from ``model`` are skipped.

    Parameters
    ----------
    sent : iterable of str
        The tokens of one sentence (one entry of ``df['text']``).

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``model.vector_size``. It is all zeros when
        none of the tokens is in the model's vocabulary.
    """
    # Test vocabulary membership once per token and reuse the result for
    # both the vectors and the weights.
    known = [w for w in sent if w in model]

    if not known:
        # The fallback must have the same length as a real embedding,
        # otherwise stacking all sentence vectors yields a ragged array.
        return np.zeros(model.vector_size, dtype=np.float32)

    emb = [model[w] for w in known]
    weights = [1. / cnt[w] for w in known]
    return np.dot(weights, emb) / np.sum(weights)


# Per-word count taken from the model's vocabulary entries.
# NOTE(review): when embeddings are loaded without a separate vocabulary
# file these counts may be synthetic rather than true corpus frequencies --
# confirm before relying on the weighting.
cnt = Counter({k: v.count for k, v in model.vocab.items()})
X = np.array(list(map(sent2vector, tqdm_notebook(df['text']))))
# Store one vector per row: a 2-D array cannot be assigned to a single
# column directly, so hand pandas a list of 1-D row vectors instead.
df['sent_vectors'] = list(X)




In [None]:
'''
TODO:
Evaluate the result using different methods: NMI, accuracy.
'''
