In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import numpy as np
import pandas as pd

In [4]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
df = pd.read_csv('/content/drive/MyDrive/MyTasks/30_days_projects/papers.csv')

In [6]:
df.shape

(7241, 7)

In [11]:
df.head(2)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...


In [8]:
# df['paper_text'][0]

In [9]:
stop_words = set(stopwords.words('english'))
##Creating a list of custom stopwords
new_words = ["fig","figure","image","sample","using",
             "show", "result", "large",
             "also", "one", "two", "three",
             "four", "five", "seven","eight","nine"]
stop_words = list(stop_words.union(new_words))

In [10]:
def pre_process(text):

    # lowercase
    text=text.lower()

    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)

    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)

    ##Convert to list from string
    text = text.split()

    # remove stopwords
    text = [word for word in text if word not in stop_words]

    # remove words less than three letters
    text = [word for word in text if len(word) >= 3]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    text = [lmtzr.lemmatize(word) for word in text]

    return ' '.join(text)

In [12]:
docs = df['paper_text'].iloc[:1000].apply(lambda x:pre_process(x))

In [13]:
docs.head()

Unnamed: 0,paper_text
0,self organization associative database applica...
1,mean field theory layer visual cortex applicat...
2,storing covariance associative long term poten...
3,bayesian query construction neural network mod...
4,neural network ensemble cross validation activ...


In [14]:
docs[0]

'self organization associative database application hisashi suzuki suguru arimoto osaka university toyonaka osaka japan abstract efficient method self organizing associative database proposed together application robot eyesight system proposed database associate input output first half part discussion algorithm self organization proposed aspect hardware produce new style neural network latter half part applicability handwritten letter recognition autonomous mobile robot system demonstrated introduction let mapping given finite infinite set another finite infinite set learning machine observes set pair sampled randomly mean cartesian product computes estimate make small estimation error measure usually say faster decrease estimation error increase number sample better learning machine however expression performance incomplete since lack consideration candidate assumed preliminarily find good learning machine clarify conception let discus type learning machine let advance understanding s

In [12]:
len(docs[0])

13350

In [None]:
Doc1 = 'The cat sat on the mat'
Doc2 = 'The dog sat on the mat'
Doc3 = 'The cat chased the mouse'

In [15]:
# TF-IDF

In [16]:
TF(cat)= 1/6
TF(the) = 2/6
TF(mat) = 1/6
TF(sat) = 1/6

SyntaxError: cannot assign to function call here. Maybe you meant '==' instead of '='? (<ipython-input-16-83ccb17cc115>, line 1)

In [18]:
# IDF(t,D) = log(Total number of doc)/number of docs contains the term t

In [19]:
# IDF(the) = log(3/3) = log(1) = 0
# - if the idf is less means the word is less important
# IDF(cat) = log(3/2) = log(1.5) = 0.18

# # IDF(mouse) = log(3/1) = log(3) 1.10

In [20]:
# TF-IDF(cat): TF(cat) * IDF(cat) = (1/6) * 0.18 = 0.03

In [17]:
(1/6) * 0.18

0.03

In [21]:
# Using TF-IDF
from sklearn.feature_extraction.text import CountVectorizer

#docs = docs.tolist()
#create a vocabulary of words,
cv=CountVectorizer(max_df=0.85,         # ignore words that appear in 85% of documents
                   max_features=1500,  # the size of the vocabulary
                   ngram_range=(1,3)    # vocabulary contains single words, bigrams, trigrams
                  )
cv.fit(docs)
word_count_vector= cv.transform(docs)

In [24]:
# word_count_vector.toarray()

In [25]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [26]:
# Saving our class objects used for feature engineering:
import pickle

pickle.dump(cv, open('/content/drive/MyDrive/MyTasks/30_days_projects/keywords-count-vectorizer.pkl', 'wb'))
pickle.dump(tfidf_transformer, open('/content/drive/MyDrive/MyTasks/30_days_projects/keywords-tfidf-model.pkl', 'wb'))

In [27]:
def sort_coo(coo_matrix):
  tuples = zip(coo_matrix.col, coo_matrix.data)
  return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

In [29]:
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""

    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]

        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]

    return results

In [31]:
# get feature names
feature_names=cv.get_feature_names_out()

In [32]:
pickle.dump(feature_names, open('/content/drive/MyDrive/MyTasks/30_days_projects/keywords-feature-names.pkl', 'wb'))

In [33]:
feature_names[230:234]

array(['computing', 'concept', 'conclusion', 'condition'], dtype=object)

In [34]:
def get_keywords(idx, docs):

    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([docs[idx]]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)

    return keywords

In [35]:
def get_keywords_text(docs):

    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([docs]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)

    return keywords

In [36]:
print(docs[225])

valid generalization size weight important size network peter bartlett department system engineering research school information science engineering australian national university canberra australia peter bartlettclanu edu abstract paper show neural network used pattern classification problem learning algorithm find network small weight small squared error training pattern generalization performance depends size weight rather number weight specifically consider layer feed forward network sigmoid unit sum magnitude weight associated unit bounded misclassification probability converges error estimate closely related squared error training set rate log ignoring log factor number training pattern input dimension constant may explain generalization performance neural network particularly number training example considerably smaller number weight support heuristic weight decay early stopping attempt keep weight small training introduction result statistical learning theory give bound number 

In [37]:
# This is my testing data.
testing = "valid generalization size weight important size network peter bartlett department system engineering research school information science engineering australian national university canberra australia peter bartlettclanu edu abstract paper show neural network used pattern classification problem learning algorithm find network small weight small squared error training pattern generalization performance depends size weight rather number weight specifically consider layer feed forward network sigmoid unit sum magnitude weight associated unit bounded misclassification probability converges error estimate closely related squared error training set rate log ignoring log factor number training pattern input dimension constant may explain generalization performance neural network particularly number training example considerably smaller number weight support heuristic weight decay early stopping attempt keep weight small training introduction result statistical learning theory give bound number training example necessary satisfactory generalization performance classification problem term vapnik chervonenkis dimension class function used learning system see example baum haussler used result give size bound multi layer threshold network generalization size weight neural network grow least quickly number weight see however pattern classification application bound seem loose neural network often perform successfully training set considerably smaller number weight paper show classification problem neural network perform well weight big size weight determines generalization performance contrast function class algorithm considered theory neural network used binary classification problem real valued output learning algorithm typically attempt minimize squared error network output training set well encouraging correct classification tends push output away zero towards target value easy see total squared error hypothesis example example hypothesis either incorrect sign magnitude le next section give misclassification probability bound hypothesis distinctly correct way example bound term scale sensitive version dimension called fat shattering dimension section give bound dimension feedforward sigmoid network imply main result proof sketched section full proof found full version notation bound misclassification probability denote space input pattern space label assume probability distribution product space reflects relative frequency different input pattern relative frequency expert classification pattern learning algorithm us class real valued function called hypothesis class hypothesis correct example sgn sgn take value iff misclassification probability error defined erp sgn crucial quantity determining misclassification probability fat shattering dimension hypothesis class say sequence point shattered iffunctions give classification sequence satisfying sgn dimension defined size largest shattered sequence given scale parameter say sequence point shattered sequence real value satisfying rdb fat shattering dimension denoted fath size largest shattered sequence dimension reflects complexity function class examined scale notice fath nonincreasing function following theorem give generalization error bound term fath related applies case error training set appear theorem define input space hypothesis class probability distribution let probability training sequence labelled fact according usual definition dimension class thresholded version function bartlett example every hypothesis satisfies erp xdl sgn log fathb comment informative compare standard bound case bound misclassification probability erp sgn ydl dlog log vcdim constant shall see next section function class vcdim infinite fathb finite example class function computed layer neural network arbitrary number parameter constraint size parameter known learning algorithm error estimate constrained make use considering proportion training example hypothesis misclassify distribution second term bound cannot improved log factor theorem show improved learning algorithm make use considering proportion training example correctly classified xdl possible give lower bound see full paper function class considered show theorem cannot improved log factor idea magnitude value give precise estimate generalization performance first proposed vapnik developed vapnik worker used case linear hypothesis class result give bound misclassification probability test term value training test data extended give bound misclassification probability unseen data term value training example extended general function class give error bound applicable hypothesis error training example lugosi pinter obtained bound misclassification probability term similar property class function containing true regression function conditional expectation given however result extend case true regression function class real valued function used estimator seems unnatural quantity specified advance theorem since depends example full paper give similar statement made uniform value quantity fat shattering dimension neural network bound dimensionofvarious neural network class established see review least linear number parameter section give bound fat shattering dimension several neural network class generalization size ofthe weight neural network assume input space subset define sigmoid unit function parametrized vector weight unit computes fixed bounded function satisfying lipchitz condition simplicity ignore offset parameter equivalent including extra input constant value multi layer feed forward sigmoid network depth network sigmoid unit single output unit arranged layered structure layer output unit pass input unit later layer consider network weight bounded relevant norm norm vector define iiwl iwil following give bound fatshattering dimension bounded linear combination real valued function term fat shattering dimension basis function class apply recursive fashion give bound single output feed forward network theorem let class function map define class weight bounded linear combination function wdi suppose fatfb log constant fathb gurvits koiran shown fat shattering dimension class layer network bounded output weight linear threshold hidden unit log lrn special case theorem improves notice fat shattering dimension function class changed constant factor compose function fixed function satisfying lipschitz condition like standard sigmoid function fathb logn finally fathb observation together theorem give following corollary notation suppresses log factor formally corollary class layer sigmoid network weight outp unit satisfying iiwlh fathb ilxli hidden unit weight bounded fathb log applying theorem give following deeper network notice constraint number hidden unit layer total magnitude weight associated processing unit corollary constant class depth sigmoid network weight vector associated unit beyond first layer satisfies iiwlll fathb iixlioo weight first layer unit satisfy iiwll fathb llog first part corollary network fat shattering dimension similar dimension linear network formalizes intuition weight small network operates linear part sigmoid behaves like linear network bartlett comment consider depth sigmoid network bounded weight last corollary theorem imply training size grows roughly misclassification probability network within proportion training example network classifies distinctly correct result give plausible explanation generalization performance neural network application network many unit small weight small squared error training example dimension hence number parameter important magnitude weight generalization performance possible give version theorem probability bound uniform value complexity parameter indexing function class technique mentioned end section case sigmoid network class indexed weight bound minimizing resulting bound misclassification probability equivalent minimizing sum error term penalty term involving weight bound support use popular heuristic technique weight decay early stopping see example aim minimize squared error maintaining small weight technique give bound fat shattering dimension hence generalization performance function class expressed bounded number composition either bounded weight linear combination scalar lipschitz function function class finite fat shattering dimension includes example radial basis function network proof proof sketch theorem pseudometric space set cover tin define size smallest cover define pseudometric dloo set function defined dloo max set function denote maxxex dloo noo alon obtained following bound noo term fat shattering dimension lemma class function map fatf log noo log log provided log define piecewise linear squashing function satisfying otherwise class real valued function define set composition function lemma theorem erp cnoo orsgn generalization size weight neural network proof lemma relies observation erp sgn ydl ydl use standard symmetrization argument permutation argument introduced vapnik chervonenkis bound probability probability random permutation double length related property hold fixed use pollard approach approximating hypothesis class cover except case appropriate cover respect pseudometric applying hoeffding inequality give lemma prove theorem need bound covering number term fatshattering dimension easy apply lemma quantized version function class get bound taking advantage range constraint imposed squashing function proof sketch theorem define pseudometric class function defined similarly define set function defined denote maxxexm similarly idea proof theorem first derive general upper bound covering number class apply following implicit proof theorem give bound fat shattering dimension lemma class valued function satisfying fatf log derive upper bound start bound lemma implies covering number noo class hidden unit function since implies following bound covering number provided satisfies condition required lemma turn theorem trivial otherwise log dlog emm log next use following approximation barron attribute maurey lemma maurey suppose hilbert space let element convex closure function iii lil implies element approximated particular accuracy respect fixed linear combination small number element follows construct cover cover lemma inequality show log emma dlog log bartlett jensen inequality implies give bound comparing lower bound given lemma solving give refined analysis neural network case involves bounding successive layer solving give bound fat shattering dimension network acknowledgement thanks andrew barron jonathan baxter mike jordan adam kowalczyk wee sun lee phil long john shawe taylor robert slaviero helpful discussion comment reference alon ben david cesa bianchi haussler scale sensitive mensions uniform convergence learn ability proceeding ieee symposium foundation computer science ieee press bartlett complexity pattern classification neural network size weight important size network technical report department system engineering australian national university available anonymous ftp syseng anu edu pub peter bartlett kulkarni posner covering number realvalued function class technical report australian national university princeton university baum haussler size net give valid generalization neural computation blumer ehrenfeucht haussler warmuth learnability vapnik chervorienkis dimension acm gurvits koiran approximation learning convex superposition computational learning theory eurocolt haussler decision theoretic generalization pac model neural net learning application inform comput hertz krogh palmer introduction theory neural computation addison wesley lugosi pinter data dependent skeleton estimate learning proc annu conference comput learning theory acm press new york maass vapnik chervonenkis dimension neural net arbib editor handbook brain theory neural network page mit press cambridge shawe taylor bartlett williamson anthony framework structural risk minimisation proc annu conference comput learning theory acm press new york shawe taylor bartlett williamson anthony structural risk minimization data dependent hierarchy technical report vapnik estimation dependence based empirical data springerverlag new york"

keywords_text = get_keywords_text(testing)

for k in keywords_text:
        print(k, keywords_text[k])

bound 0.38
class 0.301
dimension 0.25
weight 0.242
network 0.239
theorem 0.214
give 0.192
lemma 0.189
log 0.176
sigmoid 0.154


In [38]:
def print_results(idx,keywords, df):
    # now print the results
    print("Title")
    print(df['title'][idx])
    print("\nAbstract")
    print(df['abstract'][idx])
    print("\nKeywords for this text")
    for k in keywords:
        print(k,keywords[k])

In [41]:
idx=225
keywords=get_keywords(idx, docs)
print_results(idx,keywords, df)

Title
For Valid Generalization the Size of the Weights is More Important than the Size of the Network

Abstract
Abstract Missing

Keywords for this text
bound 0.38
class 0.301
dimension 0.25
weight 0.242
network 0.239
theorem 0.214
give 0.192
lemma 0.189
log 0.176
sigmoid 0.154
