In [None]:
import plotly.express as px
import plotly
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from nltk.tokenize.toktok import ToktokTokenizer
import spacy
import nltk
import re
from spacy.cli import download
import unicodedata
from nltk import regexp_tokenize

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import matutils, models, corpora, similarities
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim_models as gensimvis
from wordcloud import WordCloud
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [None]:
# Load the small English spaCy pipeline. spacy.load raises OSError when the
# model package is not installed, so the download only runs in that case
# instead of on every execution of this cell.
try:
    nlp=spacy.load('en_core_web_sm')
except OSError:
    download('en_core_web_sm')
    nlp=spacy.load('en_core_web_sm')
# Toktok word tokenizer instance (not referenced by the cells shown around it).
tokenizer=ToktokTokenizer()

In [None]:
# Load the pickled object stored in 'preprocessed_stack' into stack_df.
# The context manager closes the file even if unpickling fails.
# NOTE(review): the following cell rebinds stack_df from a different pickle,
# so on a top-to-bottom run this load is superseded.
# pickle.load can execute arbitrary code: only open files this project produced.
with open('preprocessed_stack','rb') as picklefile:
    stack_df=pickle.load(picklefile)

In [None]:
# Load the pickled object stored in 'stack_dataframe_cleaned-1' into stack_df.
# The context manager closes the file even if unpickling fails.
# pickle.load can execute arbitrary code: only open files this project produced.
with open('stack_dataframe_cleaned-1','rb') as picklefile:
    stack_df=pickle.load(picklefile)

In [None]:
# Drop rows whose 'title' repeats an earlier row's title; stack_df is modified in place.
stack_df.drop_duplicates(subset='title',inplace=True)

In [None]:
def split(txt):
    """Break a pipe-delimited string into the list of its pieces.

    Parameters
    ----------
    txt : str
        Text whose fields are separated by '|'.

    Returns
    -------
    list of str
        The fields in their original order.
    """
    return txt.split('|')

In [None]:
# Quick look: the 'tags' value selected by [0], split on the '|' separator.
stack_df['tags'][0].split('|') 

In [None]:
# Store each row's '|'-separated 'tags' string as a list in a new 'list_tags' column.
stack_df['list_tags']=stack_df['tags'].apply(split)

In [None]:
# Preview the first three rows, including the new 'list_tags' column.
stack_df.head(3)

In [None]:
# Display the rows whose 'views' value is greater than 1000.
stack_df.loc[stack_df['views'].gt(1000)]

In [None]:
# List the column labels currently present in stack_df.
stack_df.columns

In [None]:
# Histogram of 'reputation', restricted to rows with reputation above 5000.
fig=px.histogram(stack_df[stack_df['reputation']>5000],x='reputation',title='Reputation Counts')
fig.show()
# plotly.offline.plot writes a standalone HTML page, so the file gets an .html
# name. The former '.jpg' name only triggered a warning and had '.html'
# appended to it; no JPEG image was ever produced.
plotly.offline.plot(fig, filename='reputation_hist.html')

In [None]:
# Histogram over the 'qa' column, titled 'Text Counts'; rebinds fig.
# NOTE(review): 'qa' is later fed to full_clean as text, which suggests it holds
# free text -- confirm a histogram of the raw values is what is wanted here.
fig=px.histogram(stack_df,x='qa',title='Text Counts')
fig.show()

In [None]:
# Persist the current stack_df to 'stack_dataframe_cleaned-1'.
# The context manager flushes and closes the file even if pickling fails.
# NOTE(review): this overwrites the same file an earlier cell loads stack_df
# from, so the on-disk input changes once this cell has run.
with open('stack_dataframe_cleaned-1','wb') as picklefile:
    pickle.dump(stack_df,picklefile)
#stack_df.info()

In [None]:
# Part-of-speech labels to discard: the token filters compare each token's
# pos_ attribute against this list and drop the token on a match.
parts_of_speech=[
    'POS','ADP','ADV','AUX','CONJ',
    'CCONJ','DET','INTJ','NUM','PART',
    'PUNCT','SCONJ','SYM','X','SPACE',
]

In [None]:
def clean_html(txt):
    """Strip markup from an HTML fragment and return the text that remains.

    The fragment is parsed with the "lxml" parser. Every <code>, <script>,
    <style>, <meta> and <noscript> element, and every HTML comment, is
    removed from the parse tree before the text is extracted.

    NOTE(review): BeautifulSoup and Comment are not imported by the import
    cells shown in this notebook; they presumably come from bs4
    (`from bs4 import BeautifulSoup, Comment`) -- confirm and add that import
    before calling this function on a fresh kernel.

    Parameters
    ----------
    txt : str
        HTML markup.

    Returns
    -------
    str
        Text content of the tree after the elements above were removed.
    """
    soup=BeautifulSoup(txt,"lxml")
    # Plain loops: the removals are side effects, so building throwaway lists
    # with comprehensions only obscured the intent.
    for tag_name in ('code','script','style','meta','noscript'):
        for node in soup.find_all(tag_name):
            node.extract()
    # `string=` is the current spelling of the filter formerly passed as `text=`.
    for comment in soup.find_all(string=lambda text:isinstance(text, Comment)):
        comment.extract()
    return soup.get_text()
def remove1(txt,nlp):
    """Return the texts of the tokens in `txt` that survive the POS filter.

    `nlp` is called on `txt` and the result is iterated token by token. A
    token is kept when its `pos_` is not listed in the module-level
    `parts_of_speech` and its text is not exactly 'I'.

    Parameters
    ----------
    txt : str
        Text to analyse.
    nlp : callable
        Pipeline returning an iterable of tokens exposing `pos_` and `text`.

    Returns
    -------
    list of str
        Kept token texts, in document order.
    """
    return [
        token.text
        for token in nlp(txt)
        if token.pos_ not in parts_of_speech and token.text != 'I'
    ]
def remove2(txt,nlp):
    """Return `txt` rebuilt from only the tokens that survive the POS filter.

    `nlp` is called on `txt` and the result is iterated token by token. A
    token is kept when its `pos_` is not listed in the module-level
    `parts_of_speech` and its text is not exactly 'I'. The kept token texts
    are joined with single spaces.

    Parameters
    ----------
    txt : str
        Text to analyse.
    nlp : callable
        Pipeline returning an iterable of tokens exposing `pos_` and `text`.

    Returns
    -------
    str
        Space-separated kept token texts ('' when nothing is kept).
    """
    kept_texts=(
        token.text
        for token in nlp(txt)
        if token.pos_ not in parts_of_speech and token.text != 'I'
    )
    return " ".join(kept_texts)

def remove_accented_chars(txt):
    """Fold accented characters to their plain ASCII base letters.

    The text is NFKD-normalised, encoded to ASCII with unencodable code
    points ignored, and decoded back to str.

    Parameters
    ----------
    txt : str
        Text to fold.

    Returns
    -------
    str
        ASCII-only version of `txt`.
    """
    decomposed=unicodedata.normalize('NFKD', txt)
    ascii_bytes=decomposed.encode('ascii','ignore')
    return ascii_bytes.decode('utf-8','ignore')

def full_clean(txt,nlp):
    """Turn raw text into a list of cleaned, lemmatised tokens.

    Steps, in order:
      1. POS-filter the text with remove2 (uses `nlp`).
      2. Lower-case it.
      3. Fold accents / drop non-ASCII with remove_accented_chars.
      4. Remove http:// and https:// links.
      5. Tokenise on whitespace and the characters . , ; '
      6. Drop English stop words (NLTK list) and lemmatise what is left with
         NLTK's WordNetLemmatizer.

    Parameters
    ----------
    txt : str
        Text to clean.
    nlp : callable
        Pipeline handed through to remove2.

    Returns
    -------
    list of str
        Cleaned tokens in their original order.
    """
    txt=remove2(txt,nlp)
    txt=txt.lower()
    # Fold accents BEFORE discarding non-ASCII characters. Stripping non-ASCII
    # first (as was done previously) deleted accented letters outright
    # ("café" -> "caf") and left the folding step with nothing to do.
    # remove_accented_chars already returns ASCII-only text, so no separate
    # encode/decode pass is needed afterwards.
    txt=remove_accented_chars(txt)
    # Remove links. The old pattern 'http*\S+' meant "htt" + any number of "p",
    # so it also deleted ordinary words starting with "htt" (e.g. "http").
    txt=re.sub(r'https?://\S+','',txt)
    tokens=regexp_tokenize(txt,pattern=r"\s|[\.,;']", gaps=True)

    # A set gives constant-time stop-word lookups for every token.
    stop_words=set(stopwords.words("english"))
    lemma=nltk.WordNetLemmatizer()
    return [lemma.lemmatize(word) for word in tokens if word not in stop_words]

In [None]:
#txt1=nlp(stack_df['qa'][0])
# Run the full cleaning pipeline on every 'qa' entry; nlp is passed through as
# the second positional argument of full_clean. Results go into a 'text' column.
stack_df['text']=stack_df['qa'].apply(full_clean,args=(nlp,))

In [None]:
# Bag-of-words vectorizer: purely alphabetic tokens, English stop words removed,
# terms kept when they occur in at least 5 documents and in at most 80% of them.
# Earlier settings noted by the author: n_topics=20, min_df=10, max_features=1000.
tf_vect=CountVectorizer(
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    stop_words='english',
    min_df=5,
    max_df=0.8,
)

In [None]:
# Fit the count vectorizer on the 'qa' column and keep the document-term matrix as a dense array.
# NOTE(review): this reads 'qa' rather than the cleaned 'text' column, and
# .toarray() densifies the whole matrix -- confirm both are intended.
tf=tf_vect.fit_transform(stack_df['qa']).toarray()
#tf_vect.get_feature_names()

In [None]:
# Model inputs: the cleaned token lists as X, and a list built from each row's
# 'list_tags' value as y.
X=stack_df['text']
y=stack_df['list_tags'].apply(list)

In [None]:
# TF-IDF over the token lists in X: the "preprocessor" joins each list back into
# one space-separated string, which the vectorizer then tokenises itself.
# Terms must appear in at least 1% and at most 90% of the documents.
vect=TfidfVectorizer(analyzer='word', max_df=.9,min_df=0.01,tokenizer=None,
                    preprocessor=' '.join,stop_words=None,lowercase=False)
# fit_transform learns the vocabulary and IDF weights and transforms X in one
# pass; a separate vect.fit(X) beforehand only repeated the fitting work.
X_tfidf=vect.fit_transform(X).toarray()
#print(X_tfidf)
#len(vect.get_feature_names())
#print(f'shape of X for text: {X_tfidf.shape}')

In [None]:
#X_train,X_test,y_train,y_test=train_test_split(X_tfidf,y_bin,test_size=0.3,random_state=42)
#print(f'X_train shape: {X_train.shape}')
#print(f'X_test shape: {X_test.shape}')
#print(f'y_train shape: {y_train.shape}')
#print(f'y_test shape: {y_test.shape}')

In [None]:
# Fit a 10-component LDA model (online learning, 5 iterations, fixed seed) on
# the dense TF-IDF matrix.
n_topics=10
tfidf=X_tfidf
lda=LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=42,
)
lda.fit(tfidf)

In [None]:
# Display top n words for each topic identified
def display_topics(model, features, words_count):
    """Print the highest-weighted terms of every topic in `model`.

    Parameters
    ----------
    model : object
        Fitted model exposing `components_`, one weight array per topic.
    features : sequence of str
        Term for each column index of `components_`.
    words_count : int
        Number of top terms to print per topic.
    """
    for topic_no, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_no))
        # argsort is ascending, so the slice walks backwards from the end to
        # pick the `words_count` largest weights, biggest first.
        top_ids = weights.argsort()[:-words_count - 1:-1]
        top_terms = [features[idx] for idx in top_ids]
        print(" ".join(top_terms))
def display_topics2(model, feature_names, no_top_words):
    """Print, per topic, the indices of its top terms followed by the terms.

    Parameters
    ----------
    model : object
        Fitted model exposing `components_`, one weight array per topic.
    feature_names : sequence of str
        Term for each column index of `components_`.
    no_top_words : int
        Number of top terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        # Indices of the largest weights, biggest first (reverse walk over the
        # ascending argsort).
        top_ids = weights.argsort()[:-no_top_words - 1:-1]
        print(top_ids)
        print(" ".join(feature_names[idx] for idx in top_ids))
   # return pd.DataFrame(topic_dict)
              
# Number of top words to print per topic: 10 for display_topics, 20 for display_topics2.
words_count,no_top_words=10,20
#display_topics(lda, vect.get_feature_names(), words_count)
#display_topics2(lda, vect.get_feature_names_out(), no_top_words)