In [90]:
# Python Modules
import os, math, random, copy, re
from collections import Counter, defaultdict

# SQL Modules
from sqlalchemy.orm import sessionmaker

# Scientific Modules
import pandas as pd
pd.set_option('max_colwidth', 20000)
import numpy as np

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import gensim
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.test.utils import common_texts

# My Modules
import scholar
from db import db_connect

import warnings
warnings.filterwarnings('ignore')

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\justincohler\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [92]:
engine = db_connect()
Session = sessionmaker(bind=engine)
session = Session()

In [85]:
papers_df = pd.read_sql_table("paper", engine)

In [86]:
papers_df.head()

Unnamed: 0,id,title,abstract,links,search_term
0,1,Scikit-learn: Machine learning in Python,Scikit-learn is a Python module integrating a wide range of state-of-the-art machine learning algorithms for medium-scale supervised and unsupervised problems. This package focuses on bringing machine learning to non-specialists using a general-purpose high-level …,16626,
1,2,Pattern recognition and machine learning,1.2 Probability Theory . . . . . . . . . . . . . . . . . . . . . . . . . . 12 1.2.1 Probabilitydensities . . . . . . . . . . . \n. . . . . . . . . . 17 1.2.2 Expectationsand covariances . . . . . . . . . . . . . . . . 19 1.2.3 Bayesianprobabilities \n. . . . . . . . . . . . . . . . . . . . 21 1.2.4 The Gaussian distribution . . . . . . . . . . . . . . . . . . 24 1.2.5 …,35819,
2,3,Gaussian processes in machine learning,We give a basic introduction to Gaussian Process regression models. We focus on understanding the role of the stochastic process and how it is used to define a distribution over functions. We present the simple equations for incorporating training data and examine …,14392,
3,4,Machine learning in automated text categorization,"The automated categorization (or classification) of texts into predefined categories has witnessed a booming interest in the last 10 years, due to the increased availability of documents in digital form and the ensuing need to organize them. In the research …",9207,
4,5,Machine learning,"If computers could loam from experiencetheirusefulness would be increased. When I write a clumsy program for a contemporary computer a thousand runs on the machine do not re-educate my handiwork. On every execution, each time-wastingblemish and crudity, each …",3591,


In [93]:
def lemmatize(wordnet, sentence):
       
    sentence = re.sub('[!?:.,;@#$]', '', sentence)
    words = nltk.word_tokenize(sentence)
    
    new_sentence = ""
    for word in words:
        new_sentence += wordnet.lemmatize(word) + " "
    
    return new_sentence.strip()     

In [94]:
# nltk.download() # Download models, corpus, etc.
porter = PorterStemmer()
wordnet = WordNetLemmatizer()

papers_df["cleaned"] = papers_df.abstract.apply(lambda x: porter.stem(x))
papers_df.cleaned = papers_df.cleaned.apply(lambda x: lemmatize(wordnet, x))


In [89]:
papers_df.tail()

Unnamed: 0,id,title,abstract,links,search_term,cleaned
640,652,Toward harnessing user feedback for machine learning,"There has been little research into how end users might be able to communicate advice to machine learning systems. If this resource--the users themselves--could somehow work hand-in-hand with machine learning systems, the accuracy of learning systems could be …",104,,there ha been little research into how end user might be able to communicate advice to machine learning system if this resource -- the user themselves -- could somehow work hand-in-hand with machine learning system the accuracy of learning system could be …
641,653,Recent advances in predictive (machine) learning,Prediction involves estimating the unknown value of an attribute of a system under study given the values of other measured attributes. In prediction (machine) learning the prediction rule is derived from data consisting of previously solved cases. Most methods for predictive …,100,,prediction involves estimating the unknown value of an attribute of a system under study given the value of other measured attribute in prediction ( machine ) learning the prediction rule is derived from data consisting of previously solved case most method for predictive …
642,654,Credit rating by hybrid machine learning techniques,"It is very important for financial institutions to develop credit rating systems to help them to decide whether to grant credit to consumers before issuing loans. In literature, statistical and machine learning techniques for credit rating have been extensively studied. Recent studies …",100,,it is very important for financial institution to develop credit rating system to help them to decide whether to grant credit to consumer before issuing loan in literature statistical and machine learning technique for credit rating have been extensively studied recent study …
643,655,Machine learning algorithms for damage detection under operational and environmental variability,"The goal of this article is to detect structural damage in the presence of operational and environmental variations using vibration-based damage identification procedures. For this purpose, four machine learning algorithms are applied based on the auto-associative neural …",142,,the goal of this article is to detect structural damage in the presence of operational and environmental variation using vibration-based damage identification procedure for this purpose four machine learning algorithm are applied based on the auto-associative neural …
644,656,ADVISOR: A machine learning architecture for intelligent tutor construction,"We have constructed ADVISOR, a two-agent machine learning architecture for intelligent tutoring systems (ITS). The purpose of this architecture is to centralize the reasoning of an ITS into a single component to allow customization of teaching goals and to simplify …",118,,we have constructed advisor a two-agent machine learning architecture for intelligent tutoring system ( it ) the purpose of this architecture is to centralize the reasoning of an it into a single component to allow customization of teaching goal and to simplify …


In [96]:
papers_df.shape

(645, 6)

In [115]:
# Attribution to https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456

X_train = papers_df.loc[:400, 'cleaned'].values
y_train = papers_df.loc[:400, 'links'].values
X_test = papers_df.loc[400:, 'cleaned'].values
y_test = papers_df.loc[400:, 'links'].values

In [118]:
tokenizer = Tokenizer()
all_abstracts = np.hstack((X_train, X_test))
tokenizer.fit_on_texts(all_abstracts)

max_length = max([len(s.split()) for s in all_abstracts])

vocab_size = len(tokenizer.word_index) + 1

# Tokenize
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')

# Create Sentence Embedding From Training Corpus

In [13]:

def tag_documents(row):
    return TaggedDocument(row["cleaned"], row.index)

documents = papers_df.apply(lambda row: tag_documents(row), axis=1)


In [18]:
embedding_model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [75]:
def to_abstract_vectors(x):
    words = word_tokenize(x)
    return embedding_model.infer_vector(words)
    
papers_df["abstract_vector"] = papers_df.cleaned.apply(lambda x: to_abstract_vectors(x))

papers_df["abstract_vector_0"] = papers_df.abstract_vector.apply(lambda x: x[0])
papers_df["abstract_vector_1"] = papers_df.abstract_vector.apply(lambda x: x[1])
papers_df["abstract_vector_2"] = papers_df.abstract_vector.apply(lambda x: x[2])
papers_df["abstract_vector_3"] = papers_df.abstract_vector.apply(lambda x: x[3])
papers_df["abstract_vector_4"] = papers_df.abstract_vector.apply(lambda x: x[4])



In [76]:
papers_df.tail()

Unnamed: 0,id,title,abstract,links,search_term,cleaned,abstract_vector,abstract_vector_0,abstract_vector_1,abstract_vector_2,abstract_vector_3,abstract_vector_4
640,652,Toward harnessing user feedback for machine learning,"There has been little research into how end users might be able to communicate advice to machine learning systems. If this resource--the users themselves--could somehow work hand-in-hand with machine learning systems, the accuracy of learning systems could be …",104,,there ha been little research into how end user might be able to communicate advice to machine learning system if this resource -- the user themselves -- could somehow work hand-in-hand with machine learning system the accuracy of learning system could be …,"[-0.017252007, -0.099757805, -0.09665446, -0.005327185, 0.050420996]",-0.017252,-0.099758,-0.096654,-0.005327,0.050421
641,653,Recent advances in predictive (machine) learning,Prediction involves estimating the unknown value of an attribute of a system under study given the values of other measured attributes. In prediction (machine) learning the prediction rule is derived from data consisting of previously solved cases. Most methods for predictive …,100,,prediction involves estimating the unknown value of an attribute of a system under study given the value of other measured attribute in prediction ( machine ) learning the prediction rule is derived from data consisting of previously solved case most method for predictive …,"[-0.069251284, -0.09920634, 0.027527021, 0.11843491, -0.13849878]",-0.069251,-0.099206,0.027527,0.118435,-0.138499
642,654,Credit rating by hybrid machine learning techniques,"It is very important for financial institutions to develop credit rating systems to help them to decide whether to grant credit to consumers before issuing loans. In literature, statistical and machine learning techniques for credit rating have been extensively studied. Recent studies …",100,,it is very important for financial institution to develop credit rating system to help them to decide whether to grant credit to consumer before issuing loan in literature statistical and machine learning technique for credit rating have been extensively studied recent study …,"[0.052388754, -0.046408102, 0.030473558, 0.05587802, -0.003418506]",0.052389,-0.046408,0.030474,0.055878,-0.003419
643,655,Machine learning algorithms for damage detection under operational and environmental variability,"The goal of this article is to detect structural damage in the presence of operational and environmental variations using vibration-based damage identification procedures. For this purpose, four machine learning algorithms are applied based on the auto-associative neural …",142,,the goal of this article is to detect structural damage in the presence of operational and environmental variation using vibration-based damage identification procedure for this purpose four machine learning algorithm are applied based on the auto-associative neural …,"[-0.06589643, -0.070495375, -0.0664193, -0.058899924, -0.12803854]",-0.065896,-0.070495,-0.066419,-0.0589,-0.128039
644,656,ADVISOR: A machine learning architecture for intelligent tutor construction,"We have constructed ADVISOR, a two-agent machine learning architecture for intelligent tutoring systems (ITS). The purpose of this architecture is to centralize the reasoning of an ITS into a single component to allow customization of teaching goals and to simplify …",118,,we have constructed advisor a two-agent machine learning architecture for intelligent tutoring system ( it ) the purpose of this architecture is to centralize the reasoning of an it into a single component to allow customization of teaching goal and to simplify …,"[0.045146883, -0.06757521, -0.08072267, 0.0010922585, -0.031782392]",0.045147,-0.067575,-0.080723,0.001092,-0.031782


# Create Model

In [77]:
def split_XY(df):
    Y = df.links
    X = df[["abstract_vector_0", "abstract_vector_1", "abstract_vector_2", "abstract_vector_3", "abstract_vector_4"]]
    return X, Y

In [79]:
X, Y = split_XY(papers_df)
X.shape

(645, 5)

In [83]:
def baseline_model():
    model = Sequential()
    model.add(Dense(5, input_dim=5, kernel_initializer='normal', activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [84]:
seed = 42
np.random.seed(seed)

estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)

kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(estimator, X, Y, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Results: -8517819.90 (20604223.79) MSE
