# Import required libraries

In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from collections import Counter
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics
from sklearn import preprocessing

In [2]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim_models

In [27]:
import warnings
# Suppresses every warning category for the rest of the session, so
# deprecation and pandas warnings raised by later cells are not shown.
warnings.filterwarnings("ignore")

In [38]:
from nltk.corpus import stopwords
# English stop-word list from the NLTK corpus; clear_text() filters against it.
stop_words = stopwords.words("english")

# Topic modeling

### LDA (Latent Dirichlet Allocation)

Downloading WordNet

In [28]:
# Fetch the WordNet corpus used by WordNetLemmatizer below
# (the recorded output shows it is skipped when already up to date).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kiril\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Importing data

In [34]:
# Load the training posts; the first CSV column becomes the index.
df = pd.read_csv("train.csv", index_col=0)
# Take an explicit copy of the two columns we need: later cells assign into
# df_oh['cleared_text'], and assigning into a bare column slice of `df` is the
# chained-assignment pattern pandas warns about (SettingWithCopyWarning).
df_oh = df[['cleared_text', 'subreddit']].copy()

Lemmatization

In [35]:
%%time
lemmatizer = WordNetLemmatizer()
# Collapse every run of digits to a single '0'.
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
df_oh['cleared_text'] = df_oh['cleared_text'].map(lambda x: re.sub(r'\d+', '0', x))
# Lemmatize word by word (WordNet default POS), keeping single-space separation.
df_oh['cleared_text'] = df_oh['cleared_text'].apply(
    lambda x: " ".join(lemmatizer.lemmatize(word) for word in x.split(" "))
)

Wall time: 1min 13s


In [36]:
def clear_text(x, excluded=None):
    """Remove stop words from a space-separated string.

    Parameters
    ----------
    x : str
        Text whose tokens are separated by single spaces.
    excluded : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words``
        collection is used, which matches the original behaviour.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Empty tokens produced
        by consecutive spaces are kept unless '' is itself excluded.
    """
    # Resolved at call time so the global may be defined after this cell runs.
    # A frozenset makes each membership test O(1) instead of a list scan.
    blocked = frozenset(stop_words if excluded is None else excluded)
    return ' '.join(word for word in x.split(' ') if word not in blocked)

In [39]:
%%time
df_oh['cleared_text'] = df_oh['cleared_text'].apply(clear_text)

Wall time: 49.4 s


### Gensim LDA

Cleaning and tokenizing text

In [47]:
def sent_to_words(sentences):
    """Lazily yield one token list per input sentence.

    Each item is converted with ``str()`` and tokenized by
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)

In [158]:
# Subreddits whose posts are fed to the topic model. The trailing comment holds
# other subreddit names that are commented out, i.e. not selected in this run.
need_subreddit = ['SuicideWatch'] #'relationships', 'family', 'depression',

In [159]:
%%time
# Vectorized membership test instead of a Python-level lambda per row, and a
# single .loc selection instead of chained boolean-then-column indexing.
subreddit_mask = df_oh['subreddit'].isin(need_subreddit)
data_words = list(sent_to_words(df_oh.loc[subreddit_mask, 'cleared_text']))

Wall time: 5.84 s


Building a bigram model

In [160]:
# Fit a phrase (bigram) detector on the tokenized posts, then wrap it in a
# Phraser object; make_bigrams() applies `bigram_mod` to each document.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [161]:
def make_bigrams(texts):
    """Return a list with the notebook-level ``bigram_mod`` applied to each document."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

Lemmatization

In [162]:
# Small English spaCy pipeline with the parser and NER components disabled;
# lemmatization() only reads token.lemma_ and token.pos_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [163]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; each is joined with spaces before being passed
        to the notebook-level spaCy pipeline ``nlp``.
    allowed_postags : iterable of str, optional
        Coarse POS tags (``token.pos_``) to keep. The default is an immutable
        tuple rather than a list, so the default can never be mutated between
        calls.

    Returns
    -------
    list of list of str
        For every input document, the lemmas of the tokens whose POS tag is
        in ``allowed_postags``, in document order.
    """
    keep_pos = frozenset(allowed_postags)  # O(1) membership test per token
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the documents through the pipeline in batches, which is
    # the recommended way to process many texts and avoids per-call overhead.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep_pos]
        for doc in nlp.pipe(joined_docs)
    ]

In [164]:
%%time
# Merge detected bigrams first, then lemmatize with the default POS filter.
data_words_bigrams = make_bigrams(data_words)
data_lemmatized = lemmatization(data_words_bigrams)

Wall time: 2min 5s


Creating a dictionary and corpus

In [165]:
# Vocabulary built from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)
# `texts` is kept as an alias of the lemmatized documents
texts = data_lemmatized
# Bag-of-words encoding of every document against the vocabulary
corpus = list(map(id2word.doc2bow, texts))

LDA

In [166]:
%%time
# Three-topic LDA over the bag-of-words corpus; random_state is fixed so the
# fit is repeatable for the same corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

Wall time: 52.5 s


Keywords and their weights for each topic

In [167]:
lda_model.print_topics()

[(0,
  '0.024*"get" + 0.017*"go" + 0.016*"year" + 0.014*"work" + 0.012*"job" + 0.010*"school" + 0.009*"take" + 0.008*"start" + 0.008*"time" + 0.008*"home"'),
 (1,
  '0.019*"amp" + 0.011*"woman" + 0.010*"people" + 0.008*"world" + 0.008*"look" + 0.008*"man" + 0.007*"human" + 0.006*"god" + 0.006*"death" + 0.006*"sex"'),
 (2,
  '0.021*"feel" + 0.020*"want" + 0.018*"do" + 0.017*"know" + 0.017*"m" + 0.016*"life" + 0.014*"get" + 0.014*"go" + 0.012*"think" + 0.011*"make"')]

Visualization of the topics and their keywords

In [168]:
# Enable inline rendering, prepare the pyLDAvis view from the fitted model,
# corpus and dictionary, and show it as the cell result.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

# Classification

In [144]:
# Rebinds `df`: from here on it is the table read from final_embedded.csv,
# not the train.csv frame loaded in the topic-modeling section.
df = pd.read_csv("final_embedded.csv", index_col=0)
df.head()

Unnamed: 0,score,num_comments,target,sin_hour,cos_hour,cos_month,sin_month,cos_weekday,sin_weekday,emb_text_0,...,emb_title_290,emb_title_291,emb_title_292,emb_title_293,emb_title_294,emb_title_295,emb_title_296,emb_title_297,emb_title_298,emb_title_299
0,1,12,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,0.010188,...,0.065674,-0.005625,-0.078449,0.006156,-0.005342,0.023279,-0.016699,0.035185,-0.006651,-0.001455
1,2,5,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,-0.006287,...,-0.001185,0.026326,-0.153998,0.032865,0.015729,0.03636,0.023284,0.079431,-0.063423,0.024095
2,1,1,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,-0.008691,...,0.039523,-0.00256,-0.047848,0.014236,-0.011898,0.00909,-0.014194,0.092682,-0.009166,0.007
3,4,8,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,0.007517,...,0.040959,0.024521,-0.070278,0.005593,-0.000135,0.031898,0.027061,0.065442,-0.022263,-0.006768
4,0,9,8,-0.965926,0.258819,-0.866025,0.5,-0.222521,0.974928,0.003354,...,0.056048,-0.066131,-0.126724,0.002335,0.009897,0.033748,0.040027,0.051594,-0.029816,-0.057611


In [146]:
# Keep only the rows whose target is one of the four selected classes.
# Series.isin is the vectorized form of the per-row `x in (0, 2, 4, 8)` check.
df = df[df['target'].isin([0, 2, 4, 8])]

In [148]:
# Label column vs. everything else as features.
# NOTE(review): the preview of this frame is headed by an 'Unnamed: 0' label;
# if that is a real column rather than the index, it ends up in X as a row
# counter — confirm, and drop it from the features if so.
y = df['target']
X = df.drop(columns=['target'])
# Stratified split with a fixed seed so class proportions match in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, stratify=y
)

### SVC

In [151]:
%%time
# Standardize the features, then fit an SVC with default hyperparameters.
# The scaler lives inside the pipeline, so it is fitted on X_train only.
pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('svc', SVC())
    ]
)
pipe.fit(X_train, y_train)
# Last expression: the pipeline's score on the held-out split.
pipe.score(X_test, y_test)

Wall time: 5min 33s


0.8275276973749875

### LogisticRegression

In [149]:
%%time
# Standardize the features, then fit a default LogisticRegression.
# The estimator step is named 'logreg'; it was previously labelled 'svc',
# a leftover from the SVC cell that made named_steps / parameter names misleading.
pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('logreg', LogisticRegression())
    ]
)
pipe.fit(X_train, y_train)
# Last expression: the pipeline's score on the held-out split.
pipe.score(X_test, y_test)

Wall time: 7.93 s


0.8443956482682903

### LDA (Linear Discriminant Analysis)

In [150]:
%%time
# Standardize the features, then fit Linear Discriminant Analysis (imported as LDA).
# The estimator step is named 'lda'; it was previously labelled 'svc',
# a leftover from the SVC cell that made named_steps / parameter names misleading.
pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('lda', LDA())
    ]
)
pipe.fit(X_train, y_train)
# Last expression: the pipeline's score on the held-out split.
pipe.score(X_test, y_test)

Wall time: 6.89 s


0.827128455933726

In [153]:
%%time
# NOTE(review): `pipe` is whichever pipeline was fitted most recently, because
# the SVC, LogisticRegression and LDA cells all bind the same name. Read top to
# bottom, that is the LDA pipeline, yet the accuracy recorded further down
# (0.8275276973749875) equals the SVC cell's score, so this cell was evidently
# executed right after the SVC fit. A fresh Run All will predict with the LDA
# pipeline instead.
y_pred = pipe.predict(X_test)

Wall time: 0 ns


In [154]:
y_test

20810    0
85520    4
25424    0
78403    2
87126    4
        ..
89594    4
1999     8
88064    4
80184    4
2905     8
Name: target, Length: 10019, dtype: int64

In [155]:
from sklearn.metrics import accuracy_score

In [157]:
accuracy_score(y_test, y_pred)

0.8275276973749875