In [5]:
# Python Modules
import os, math, random, copy, re
from collections import Counter, defaultdict

# SQL Modules
from sqlalchemy.orm import sessionmaker

# Scientific Modules
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import gensim
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.test.utils import common_texts

# My Modules
import scholar
from db import db_connect

In [6]:
# Download the NLTK "punkt" package (the captured output reports it as already up-to-date).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\justincohler\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Connect to the project database and open a SQLAlchemy session bound to it.
# db_connect comes from the local `db` module; presumably it returns an Engine — confirm.
engine = db_connect()
Session = sessionmaker(bind=engine)
session = Session()

In [8]:
# Load the whole "paper" table into a DataFrame via the engine created above.
papers_df = pd.read_sql_table("paper", engine)

In [9]:
# Preview the first rows of the loaded table.
papers_df.head()

Unnamed: 0,id,title,abstract,links,search_term
0,1,Scikit-learn: Machine learning in Python,Scikit-learn is a Python module integrating a ...,16626,
1,2,Pattern recognition and machine learning,1.2 Probability Theory . . . . . . . . . . . ....,35819,
2,3,Gaussian processes in machine learning,We give a basic introduction to Gaussian Proce...,14392,
3,4,Machine learning in automated text categorization,The automated categorization (or classificatio...,9207,
4,5,Machine learning,If computers could loam from experiencetheirus...,3591,


In [10]:
def lemmatize(wordnet, sentence):
    """Strip selected punctuation from `sentence` and lemmatize each token.

    Args:
        wordnet: Object exposing ``lemmatize(word)``; the notebook passes a
            WordNetLemmatizer instance.
        sentence: Text to normalize.

    Returns:
        The lemmatized tokens joined by single spaces, with no leading or
        trailing whitespace.
    """
    # Remove these punctuation characters outright before tokenizing.
    stripped = re.sub('[!?:.,;@#$]', '', sentence)
    lemmas = [wordnet.lemmatize(token) for token in nltk.word_tokenize(stripped)]
    return " ".join(lemmas).strip()

In [11]:
# nltk.download() # Download models, corpus, etc.
porter = PorterStemmer()
wordnet = WordNetLemmatizer()

# Build the "cleaned" column: run the stemmer over each abstract, then
# lemmatize the result token by token.
# NOTE(review): porter.stem is handed the entire abstract as one string, not
# individual words. The "cleaned" preview still contains unstemmed words
# (e.g. "little", "research"), so this step appears to mostly lowercase the
# text — confirm whether per-word stemming was intended.
papers_df["cleaned"] = (
    papers_df.abstract
    .apply(porter.stem)
    .apply(lambda text: lemmatize(wordnet, text))
)


In [12]:
# Inspect the last rows, including the "cleaned" column.
papers_df.tail()

Unnamed: 0,id,title,abstract,links,search_term,cleaned
640,652,Toward harnessing user feedback for machine le...,There has been little research into how end us...,104,,there ha been little research into how end use...
641,653,Recent advances in predictive (machine) learning,Prediction involves estimating the unknown val...,100,,prediction involves estimating the unknown val...
642,654,Credit rating by hybrid machine learning techn...,It is very important for financial institution...,100,,it is very important for financial institution...
643,655,Machine learning algorithms for damage detecti...,The goal of this article is to detect structur...,142,,the goal of this article is to detect structur...
644,656,ADVISOR: A machine learning architecture for i...,"We have constructed ADVISOR, a two-agent machi...",118,,we have constructed advisor a two-agent machin...


# Create Word Embeddings

In [13]:
# NOTE(review): VOCAB_SIZE is not referenced in any of the visible cells — confirm it is still needed.
VOCAB_SIZE=5000

def tag_documents(row):
    """Build a gensim TaggedDocument from one row of the papers frame.

    Args:
        row: A row Series (as passed by ``DataFrame.apply(..., axis=1)``)
            that has a "cleaned" string entry.

    Returns:
        A TaggedDocument whose words are the whitespace-separated tokens of
        ``row["cleaned"]`` and whose single tag is the row's index label.
    """
    # Doc2Vec expects a list of tokens; a raw string would be consumed
    # character by character.
    tokens = row["cleaned"].split()
    # row.name is this row's label. row.index would be the column labels of
    # the row Series, which gives every document the same set of tags.
    return TaggedDocument(words=tokens, tags=[row.name])

# Tag every paper row; the result is one TaggedDocument per row.
documents = papers_df.apply(tag_documents, axis=1)


In [15]:
# Train a Doc2Vec model over the tagged documents, producing 5-dimensional vectors.
model = Doc2Vec(
    documents,
    vector_size=5,
    window=2,
    min_count=1,
    workers=4,
)

  "C extension not loaded, training will be slow. "


In [16]:
# Sanity check: infer an embedding for an ad-hoc two-token document.
model.infer_vector(["hello", "world"])

array([ 0.03058965,  0.02694721,  0.01240169, -0.01793071,  0.08863635],
      dtype=float32)

# Create Model

In [9]:
def split_XY(df):
    """Split a frame into features and the `links` target.

    Args:
        df: DataFrame containing a "links" column.

    Returns:
        Tuple ``(X, Y)``: X is a copy of `df` without the "links" column,
        Y is the "links" column itself.
    """
    target = df.links
    features = df.drop(columns='links')
    return features, target

In [10]:
# Separate the "links" target (Y) from the remaining columns (X).
# NOTE(review): X is taken straight from papers_df, so it appears to hold the
# raw table columns rather than the Doc2Vec vectors — confirm before training on it.
X, Y = split_XY(papers_df)

In [14]:
# Check the dimensions of the feature frame.
X.shape

(645, 5)

In [16]:
def baseline_model():
    """Build and compile the baseline regression network.

    Returns:
        A compiled Sequential model: a 5-unit ReLU Dense layer taking 5 input
        features, followed by a 1-unit Dense output layer with no activation
        specified. Compiled with mean squared error loss and the Adam
        optimizer.
    """
    layers = [
        Dense(5, input_dim=5, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ]
    network = Sequential(layers)
    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

In [None]:
# Evaluate the baseline network with 10-fold cross-validation.
seed = 42
# NumPy is imported as `np` in this notebook; the bare name `numpy` is never
# bound and would raise NameError.
np.random.seed(seed)

estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)

# random_state only takes effect when shuffling is enabled; recent
# scikit-learn versions raise ValueError if it is set while shuffle=False.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X, Y, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))