In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import nltk
import spacy
import re
import string
import gc

In [49]:
# Read test.csv from the dataset folder (path is relative to the notebook's working directory).
test_data = pd.read_csv('transfer-learning-on-stack-exchange-tags/test.csv')
# Last expression of the cell: display the first rows of the raw frame.
test_data.head()

Unnamed: 0,id,title,content
0,1,What is spin as it relates to subatomic partic...,<p>I often hear about subatomic particles havi...
1,2,What is your simplest explanation of the strin...,<p>How would you explain string theory to non ...
2,3,"Lie theory, Representations and particle physics",<p>This is a question that has been posted at ...
3,7,Will Determinism be ever possible?,<p>What are the main problems that we need to ...
4,9,Hamilton's Principle,<p>Hamilton's principle states that a dynamic ...


In [50]:
def remove_html_tags(html):
    """Return the text content of an HTML fragment with the markup stripped.

    The fragment is parsed with BeautifulSoup using the 'lxml' parser and
    only the text extracted by ``get_text()`` is returned.
    """
    return BeautifulSoup(html, 'lxml').get_text()

# English stop words from NLTK's stopwords corpus.
# Stored as a frozenset so that the per-token membership test in
# remove_stopwords is a hash lookup instead of a linear scan over a list.
stopwords = frozenset(nltk.corpus.stopwords.words('english'))
#print(stopwords)
def remove_urls(text):
    """Remove every http:// or https:// URL from ``text``.

    A URL is taken to be the scheme followed by all characters up to the
    next whitespace. The surrounding text is left untouched.

    The previous pattern was anchored with ``^`` (without re.MULTILINE), so
    it only matched a URL at the very start of the string, and its ``.*``
    then deleted the rest of that line as well.

    Parameters
    ----------
    text : str
        Input text, possibly containing URLs.

    Returns
    -------
    str
        ``text`` with all URLs deleted.
    """
    return re.sub(r'https?://\S+', '', text)

def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols)
    are kept in their original order.
    """
    punctuation_marks = string.punctuation
    kept = filter(lambda symbol: symbol not in punctuation_marks, text)
    return ''.join(kept)

def remove_stopwords(text):
    """Lower-case ``text``, tokenize it and drop the tokens found in ``stopwords``.

    Returns the remaining tokens joined by single spaces.
    """
    # Lower-case one character at a time, then tokenize with NLTK.
    lowered = ''.join(map(str.lower, text))
    surviving = filter(lambda token: token not in stopwords,
                       nltk.word_tokenize(lowered))
    return ' '.join(surviving)

def lemmatize_text(text):
    """Tokenize ``text`` and replace each token by its WordNet lemma.

    A new ``WordNetLemmatizer`` is created on every call and applied with
    its default arguments. The lemmas are returned joined by single spaces.
    """
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return ' '.join(lemmatize(token) for token in nltk.word_tokenize(text))

def remove_useless_words(text):
    """Keep only the tokens whose part-of-speech tag is in a whitelist.

    The text is tokenized with NLTK, the whole token list is POS-tagged in
    one call, and only tokens tagged as cardinal numbers, foreign words,
    adjectives, nouns or adverbs (Penn Treebank tags) are retained.

    Parameters
    ----------
    text : str
        Raw or pre-cleaned text.

    Returns
    -------
    str
        The retained tokens, in their original order, joined by single spaces.
    """
    filter_tags = {'CD', 'FW', 'JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNPS',
                   'NNS', 'RB', 'RBR', 'RBS'}
    tokens = nltk.word_tokenize(text)
    # pos_tag expects a list of tokens. Passing a single string (as the
    # previous per-token loop did) tags each *character* of that string and
    # returns a list of pairs that cannot be unpacked as (word, tag).
    tagged_tokens = nltk.pos_tag(tokens)
    return ' '.join(word for word, tag in tagged_tokens if tag in filter_tags)
    

In [51]:
#test_data['content'] = test_data['content'].apply(lambda x:remove_useless_words(x))

In [52]:
# Clean the 'content' column in four passes, applied in this order:
# strip HTML, drop punctuation, remove stop words, lemmatize.
test_data['content'] = (
    test_data['content']
    .apply(remove_html_tags)
    .apply(remove_punctuation)
    .apply(remove_stopwords)
    .apply(lemmatize_text)
)

In [53]:
# Clean the 'title' column with the same four passes used for 'content':
# strip HTML, drop punctuation, remove stop words, lemmatize.
test_data['title'] = (
    test_data['title']
    .apply(remove_html_tags)
    .apply(remove_punctuation)
    .apply(remove_stopwords)
    .apply(lemmatize_text)
)

In [55]:
# Work on the first 4000 rows only. Take an explicit copy: `data` gets new
# columns assigned later in the notebook, and doing that on a bare slice of
# `test_data` raises SettingWithCopyWarning and makes it unclear which frame
# is actually being modified.
data = test_data[:4000].copy()
# Persist the full cleaned frame (all rows, not just the 4000-row sample).
test_data.to_pickle('processed_df.pkl')

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model over unigrams and bigrams of the cleaned 'content' column
# and keep the resulting document-term matrix.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(data['content'].tolist())

In [73]:
# def get_freq(text):
#     vector = vectorizer.transform([text])
#     vector = vector[vector.nonzero()]
#     return vector
# data['vectors'] = data['content'].apply(lambda x:get_freq(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [10]:
import gensim

# Load the GoogleNews word2vec vectors from their binary file, reading at
# most the first 600000 entries.
embeddings = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=600000,
)



In [80]:
# Vocabulary terms of the fitted vectorizer, with two-word terms rewritten
# as 'first_second'. Only the terms are needed, so iterate over the keys
# rather than (term, index) pairs. Built as a set because get_tf_words tests
# every token for membership, which is a linear scan on a list.
keywords = {
    '_'.join(term.split()) if len(term.split()) == 2 else term
    for term in vectorizer.vocabulary_
}
    
    
def get_tf_words(text):
    """Return the tokens of ``text`` that are present in the module-level ``keywords``.

    The text is tokenized with NLTK. Matching tokens are returned as a list
    in their original order, with repeated tokens kept.
    """
    return [token for token in nltk.word_tokenize(text) if token in keywords]

In [81]:
# For each document, keep the content tokens that are known vocabulary terms.
data['keywords'] = data['content'].apply(get_tf_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [85]:
# Minimum TF-IDF weight a term must have in a document to be kept as a keyword.
TFIDF_THRESHOLD = 0.1

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back to the old name only on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

sampled_keywords = []
# Iterate over the rows actually present in the matrix instead of a
# hard-coded 4000, which fails when the frame holds fewer documents.
for doc_index in range(tfidf_matrix.shape[0]):
    row = tfidf_matrix.getrow(doc_index)
    # row.indices / row.data are the column ids and weights stored for this document.
    sampled_keywords.append([
        feature_names[column]
        for column, score in zip(row.indices, row.data)
        if score >= TFIDF_THRESHOLD
    ])

# Replace the column in one assignment. The per-cell chained form
# data['keywords'][doc_index] = ... may write to a temporary copy and is
# not guaranteed to update `data`.
data['keywords'] = pd.Series(sampled_keywords, index=data.index)

In [None]:
# Reference vector that every candidate keyword is scored against.
physics_vector = embeddings['physics']
def get_tags(keywords):
    """Pick up to three keywords that score highest against the 'physics' vector.

    Each keyword that has an embedding is scored by the dot product of its
    vector with ``physics_vector``. Duplicate keywords are collapsed, and
    the three best-scoring ones are returned in ascending score order.

    Parameters
    ----------
    keywords : iterable of str
        Candidate keywords for one document.

    Returns
    -------
    list of str
        At most three keywords; empty when no keyword has an embedding.
    """
    scored = set()
    for keyword in keywords:
        # `embeddings` is already a KeyedVectors object, so test membership
        # on it directly: the `.wv` indirection is deprecated and `.vocab`
        # was removed in gensim 4.
        if keyword in embeddings:
            # NOTE(review): this is a raw dot product, not cosine similarity;
            # confirm whether the vectors should be normalised first.
            score = np.dot(physics_vector, embeddings[keyword])
            scored.add((score, keyword))
    return [keyword for _, keyword in sorted(scored)[-3:]]

In [56]:
# Derive the predicted tags of each document from its keyword list.
data['tags'] = data['keywords'].apply(get_tags)

  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [57]:
# Display the first rows of `data` as the cell output.
data.head()

Unnamed: 0,id,title,content,keywords,tags
0,1,spin relates subatomic particle,often hear subatomic particle property called ...,"[often, hear, subatomic, particle, property, c...","[spin, particle, subatomic]"
1,2,simplest explanation string theory,would explain string theory non physicist im s...,"[would, explain, string, theory, non, physicis...","[plausible, theory, physicist]"
2,3,lie theory representation particle physic,question posted many different forum thought m...,"[question, posted, many, different, forum, tho...","[symmetry, invariant, physicist]"
3,7,determinism ever possible,main problem need solve prove laplace determin...,"[main, problem, need, solve, prove, laplace, d...","[principle, solve, determinism]"
4,9,hamilton principle,hamilton principle state dynamic system always...,"[hamilton, principle, state, dynamic, system, ...","[dynamic, stationary, principle]"
