# Import required libraries

In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from collections import Counter
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics
from sklearn import preprocessing

In [2]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim_models

# Import data

In [5]:
# Load final_embedded.csv; the first CSV column becomes the DataFrame index.
df = pd.read_csv("final_embedded.csv", index_col=0)
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,score,num_comments,target,sin_hour,cos_hour,cos_month,sin_month,cos_weekday,sin_weekday,emb_text_0,...,emb_title_290,emb_title_291,emb_title_292,emb_title_293,emb_title_294,emb_title_295,emb_title_296,emb_title_297,emb_title_298,emb_title_299
0,1,12,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,0.010188,...,0.065674,-0.005625,-0.078449,0.006156,-0.005342,0.023279,-0.016699,0.035185,-0.006651,-0.001455
1,2,5,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,-0.006287,...,-0.001185,0.026326,-0.153998,0.032865,0.015729,0.03636,0.023284,0.079431,-0.063423,0.024095
2,1,1,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,-0.008691,...,0.039523,-0.00256,-0.047848,0.014236,-0.011898,0.00909,-0.014194,0.092682,-0.009166,0.007
3,4,8,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,0.007517,...,0.040959,0.024521,-0.070278,0.005593,-0.000135,0.031898,0.027061,0.065442,-0.022263,-0.006768
4,0,9,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,0.003354,...,0.056048,-0.066131,-0.126724,0.002335,0.009897,0.033748,0.040027,0.051594,-0.029816,-0.057611


Split into training and test sets

In [3]:
# Separate the label column from the feature columns.
y = df['target']
X = df.drop(columns=['target'])

In [4]:
# Stratified split on the target so both parts keep the class proportions;
# random_state=0 fixes the shuffle. No test_size is given, so scikit-learn's
# default split fraction applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

# Classification

### SVC

In [10]:
# Two-stage pipeline: standardize the features, then a default-parameter SVC.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', SVC()),
])

In [11]:
%%time
# Fit the scaler and the SVC on the training split only; %%time reports the cost.
pipe.fit(X_train, y_train)

Wall time: 23min 39s


Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])

In [12]:
# Score the fitted pipeline on the held-out test split (mean accuracy for classifiers).
pipe.score(X_test, y_test)

0.7194537613799713

### LogisticRegression

In [8]:
%%time
pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('svc', LogisticRegression())
    ]
)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

Wall time: 21.6 s


0.7266810413671937

### LDA (Linear Discriminant Analysis)

In [12]:
%%time
pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('svc', LDA())
    ]
)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

Wall time: 14.3 s


0.6905446414310813

# Topic modeling

### LDA (Latent Dirichlet Allocation, scikit-learn)

Downloading WordNet

In [11]:
# Fetch the WordNet corpus that WordNetLemmatizer (used below) relies on;
# nltk skips the download when the package is already up to date.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kiril\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Importing data

In [12]:
# Load the raw training data. This rebinds `df`, replacing the embedded frame
# loaded earlier (X and y were already extracted from it).
df = pd.read_csv("train.csv", index_col=0)
# Take an explicit copy of the two columns used for topic modeling, so later
# column assignments modify an independent frame rather than a slice of `df`
# (which would raise pandas' SettingWithCopyWarning).
df_oh = df[['cleared_text', 'target']].copy()

Lemmatization

In [13]:
lemmatizer = WordNetLemmatizer()
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
_DIGITS_RE = re.compile(r'\d+')


def _normalize_text(text):
    """Collapse every run of digits to '0', then lemmatize each space-separated word."""
    text = _DIGITS_RE.sub('0', text)
    return " ".join(lemmatizer.lemmatize(word) for word in text.split(" "))


# assign() returns a new frame, so this never writes into a slice of `df`
# and the cell can be re-run safely.
df_oh = df_oh.assign(cleared_text=df_oh['cleared_text'].map(_normalize_text))

Word frequencies using TfidfVectorizer

In [16]:
# Fit a default TfidfVectorizer on the lemmatized texts and keep the resulting
# document-term matrix for topic modeling and clustering.
vectorizer = TfidfVectorizer()
tfidf_corpus = df_oh['cleared_text'].tolist()
X_tf_idf = vectorizer.fit_transform(tfidf_corpus)

LDA

In [22]:
%%time
# Fit a 10-topic scikit-learn LDA model on the TF-IDF matrix; random_state=1
# makes the fit repeatable.
# NOTE(review): the model is fit on TF-IDF weights rather than raw term counts;
# consider comparing against a CountVectorizer input — TODO confirm.
lda = LatentDirichletAllocation(n_components=10, random_state=1)
lda.fit(X_tf_idf)

Wall time: 5min 14s


LatentDirichletAllocation(random_state=1)

Top 10 words of each topic

In [24]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()

n_top_words = 10

# topic index -> its n_top_words highest-weighted vocabulary terms
topic_words = {}

for topic, comp in enumerate(lda.components_):
    # argsort is ascending, so reverse it to take the largest weights first.
    word_idx = np.argsort(comp)[::-1][:n_top_words]
    # str() keeps the printed list free of numpy scalar wrappers.
    topic_words[topic] = [str(vocab[x]) for x in word_idx]
    print(topic_words[topic], "\n")

['unvaccinated', 'fdh', 'tri', 'bronchitis', 'trey', 'neice', 'miralax', 'croup', 'zyprexa', 'hickies'] 

['im', 'feel', 'life', 'like', 'want', 'it', 'get', 'time', 'know', 'year'] 

['stacy', 'bridget', 'mj', 'therere', 'dysphoric', 'rita', 'bagger', 'jenn', 'kardashian', 'exfoliating'] 

['like', 'im', 'time', 'friend', 'know', 'want', 'love', 'feel', 'me', 'it'] 

['seth', 'puree', 'shouldt', 'dysplasia', 'hsv', 'friending', 'locus', 'synergy', 'hyperthyroidism', 'chapstick'] 

['christ', 'jesus', 'adrian', 'lofty', 'idealizing', 'audrey', 'josie', 'vashikaran', 'saya', 'discouragement'] 

['food', 'clean', 'eat', 'dish', 'meal', 'bra', 'clothes', 'laundry', 'kitchen', 'cheese'] 

['yeast', 'de', 'monistat', 'bv', 'que', 'diflucan', 'infection', 'para', 'nina', 'yi'] 

['que', 'de', 'la', 'en', 'por', 'nd', 'mi', 'samantha', 'ella', 'glucose'] 

['sleep', 'potty', 'baby', 'bed', 'nap', 'son', 'hair', 'daycare', 'toddler', 'old'] 



### Gensim LDA

Tokenizing and cleaning text

In [133]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's simple_preprocess.

    Every item is converted with str() first and tokenized with deacc=True;
    one token list is yielded per input item, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(raw), deacc=True) for raw in sentences)

In [135]:
# Materialize the tokenizer generator: one token list per cleaned text.
data_words = list(sent_to_words(df_oh['cleared_text']))

Building a bigram model

In [137]:
# Learn which adjacent token pairs should be treated as phrases; min_count=5
# and threshold=100 are passed to gensim's Phrases to control how readily
# pairs are merged.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
# Wrap the trained model in a Phraser, which is what make_bigrams() applies.
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [149]:
def make_bigrams(texts):
    """Apply the trained bigram phraser to every tokenized document.

    Returns a list with one transformed document per entry of `texts`,
    in the same order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

Lemmatization

In [147]:
# Load spaCy's small English pipeline with the parser and NER components
# disabled; lemmatization() only reads token.pos_ and token.lemma_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [148]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; each one is re-joined with single spaces before
        being passed to the module-level spaCy pipeline `nlp`.
    allowed_postags : collection of str, optional
        Coarse POS tags (token.pos_) to keep. The default is an immutable
        tuple so it cannot be mutated between calls.

    Returns
    -------
    list of list of str
        For every input document, the lemmas of the tokens whose POS tag is
        in `allowed_postags`, in document order.
    """
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a stream (in order) instead of
    # paying the per-call overhead of nlp() for every single document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined)
    ]

In [182]:
%%time
# Merge detected bigrams first, then lemmatize the resulting token lists.
data_words_bigrams = make_bigrams(data_words)
data_lemmatized = lemmatization(data_words_bigrams)

Creating a dictionary and corpus

In [None]:
# Vocabulary: maps every lemma seen in the corpus to an integer id.
id2word = corpora.Dictionary(data_lemmatized)
# Keep the lemmatized documents available under the name `texts`.
texts = data_lemmatized
# Bag-of-words encoding: one doc2bow result per document.
corpus = list(map(id2word.doc2bow, texts))

LDA

In [189]:
%%time
# Train a 10-topic gensim LDA model on the bag-of-words corpus.
# random_state=100 fixes the seed; the corpus is processed in chunks of 1000
# documents over 10 passes, and alpha='auto' asks gensim to learn the
# document-topic prior from the data.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=1000,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

Wall time: 11min 38s


Keywords for each topic and weight

In [190]:
# Show each topic as a weighted list of its most significant keywords.
lda_model.print_topics()

[(0,
  '0.113*"school" + 0.042*"college" + 0.034*"class" + 0.032*"high" + 0.030*"year" + 0.022*"study" + 0.017*"student" + 0.017*"parent" + 0.016*"university" + 0.016*"grade"'),
 (1,
  '0.054*"mom" + 0.039*"family" + 0.033*"mother" + 0.032*"kid" + 0.031*"dad" + 0.031*"child" + 0.030*"parent" + 0.026*"sister" + 0.024*"old" + 0.020*"brother"'),
 (2,
  '0.037*"friend" + 0.035*"woman" + 0.032*"guy" + 0.023*"sex" + 0.021*"girl" + 0.020*"relationship" + 0.018*"man" + 0.015*"date" + 0.013*"meet" + 0.011*"really"'),
 (3,
  '0.041*"go" + 0.037*"day" + 0.027*"get" + 0.022*"home" + 0.021*"night" + 0.020*"time" + 0.018*"work" + 0.016*"hour" + 0.015*"week" + 0.014*"sleep"'),
 (4,
  '0.025*"life" + 0.023*"people" + 0.013*"make" + 0.013*"feel" + 0.013*"love" + 0.010*"way" + 0.010*"thing" + 0.008*"think" + 0.008*"be" + 0.008*"good"'),
 (5,
  '0.025*"look" + 0.013*"walk" + 0.013*"body" + 0.011*"wear" + 0.010*"man" + 0.009*"hand" + 0.009*"face" + 0.008*"hair" + 0.008*"eye" + 0.007*"pain"'),
 (6,
  '0.30

Visualization of topics and their keywords

In [193]:
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()
# Build the interactive topic visualization from the model, corpus and dictionary.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
# Display the visualization as the cell output.
vis

# Clustering

In [97]:
# Fit a label encoder on y (the target column of the embedded dataset); the
# clustering cells use it to turn targets into integer codes for scoring.
le = preprocessing.LabelEncoder().fit(y)

### Kmeans-fasttext

In [67]:
%%time
kmeans = KMeans(n_clusters=10).fit(X)
metrics.adjusted_rand_score(kmeans.predict(X), le.transform(y))

Wall time: 23.3 s


-0.0012596864771117007

### KMeans-tfidf

In [68]:
%%time
km_tfidf=KMeans(n_clusters=10).fit(X_tf_idf)
metrics.adjusted_rand_score(km_tfidf.predict(X_tf_idf), le.transform(df_oh['target']))

0.6672182413997408

### MiniBatchKMeans

In [118]:
%%time
model = MiniBatchKMeans(n_clusters=10)
model.fit(X_tf_idf)
metrics.adjusted_rand_score(model.predict(X_tf_idf), le.transform(y))

Wall time: 9 s


0.5033175641491212