In [None]:
# General libraries
import re, os, string
import pandas as pd

# Scikit-learn imports
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from google.colab import files

# Upload kaggle.json (the Kaggle API token) into the working directory.
# The result is bound to a name on purpose: a bare `files.upload()` as the
# last expression makes the notebook echo the uploaded bytes, which writes
# the secret API key into the saved cell output.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"ppaarxx","key":"<REDACTED>"}'}

In [None]:
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

rm: cannot remove '/root/.kaggle': No such file or directory


In [None]:
# Download the NIPS papers (1987-2019) dataset archive with the kaggle CLI;
# the zip file lands in the current working directory.
!kaggle datasets download -d rowhitswami/nips-papers-1987-2019-updated

Downloading nips-papers-1987-2019-updated.zip to /content
 87% 93.0M/106M [00:01<00:00, 86.8MB/s]
100% 106M/106M [00:01<00:00, 93.7MB/s] 


In [None]:
# Download the stop-word list archive with the kaggle CLI;
# the zip file lands in the current working directory.
!kaggle datasets download -d rowhitswami/stopwords

Downloading stopwords.zip to /content
  0% 0.00/2.10k [00:00<?, ?B/s]
100% 2.10k/2.10k [00:00<00:00, 6.95MB/s]


In [None]:
import zipfile

# Unpack the downloaded papers archive into the ./dataset folder.
with zipfile.ZipFile('nips-papers-1987-2019-updated.zip', mode='r') as zip_ref:
    zip_ref.extractall(path='dataset')

In [None]:
import zipfile

# Unpack the downloaded stop-word archive into the ./stop_file folder.
with zipfile.ZipFile('stopwords.zip', mode='r') as zip_ref:
    zip_ref.extractall(path='stop_file')

In [None]:
def get_stopwords_list(stop_file_path):
    """Load stop words from a text file holding one stop word per line.

    Args:
        stop_file_path: Path to a UTF-8 encoded text file.

    Returns:
        list[str]: The unique stop words with surrounding whitespace
        removed, in sorted order. Blank lines are ignored.

    Raises:
        OSError: If the file cannot be opened.
    """
    unique_words = set()
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        for line in f:
            word = line.strip()
            # Skip blank lines so the empty string never becomes a stop word.
            if word:
                unique_words.add(word)

    # Set iteration order varies between runs; sorting keeps the result
    # deterministic.
    return sorted(unique_words)

In [None]:
def clean_text(text):
    """Normalise a document before TF-IDF vectorisation.

    The text is lower-cased, every character listed in the module-level
    ``PUNCTUATION`` constant is deleted (not replaced by a space), and each
    run of whitespace, newlines included, is collapsed to a single space.

    Args:
        text (str): Raw document text.

    Returns:
        str: The cleaned text. Leading and trailing whitespace is collapsed
        to a single space but not stripped.
    """
    # Lowering text
    text = text.lower()

    # Removing punctuation in a single pass with a deletion table instead of
    # testing every character against PUNCTUATION one by one.
    text = text.translate(str.maketrans('', '', PUNCTUATION))

    # Removing whitespace and newlines. The pattern is a raw string because
    # '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)

    return text

In [None]:
def sort_coo(coo_matrix):
    """Rank the entries of a COO-style matrix from highest to lowest score.

    Pairs each value in ``coo_matrix.col`` with the value at the same
    position in ``coo_matrix.data`` and returns the ``(col, data)`` tuples
    as a list ordered by data value descending, ties broken by the larger
    column index first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` scored features to their rounded scores.

    Args:
        feature_names: Indexable collection giving the feature name for
            each feature index.
        sorted_items: Sequence of ``(feature_index, score)`` pairs; only the
            first ``topn`` entries are used, in the order given.
        topn: Maximum number of entries to keep.

    Returns:
        dict: Feature name -> score rounded to 3 decimals, inserted in the
        same order as ``sorted_items``.
    """
    results = {}
    for feature_idx, tfidf_score in sorted_items[:topn]:
        rounded_score = round(tfidf_score, 3)
        results[feature_names[feature_idx]] = rounded_score
    return results

In [None]:
def get_keywords(vectorizer, feature_names, doc):
    """Return the highest-scoring keywords of ``doc`` as a list of names.

    ``doc`` is passed to ``vectorizer.transform`` as a one-element list. The
    result is converted with ``tocoo()``, ranked with ``sort_coo`` and cut
    to the module-level ``TOP_K_KEYWORDS`` entries by
    ``extract_topn_from_vector``. Only the feature names are returned.
    """
    # Score the single document with the supplied vectorizer.
    doc_scores = vectorizer.transform([doc]).tocoo()

    # Best-scoring entries first.
    ranked = sort_coo(doc_scores)

    # Keep the top TOP_K_KEYWORDS names; the scores are dropped.
    top_terms = extract_topn_from_vector(feature_names, ranked, TOP_K_KEYWORDS)
    return list(top_terms)

In [None]:
# Characters deleted by clean_text(). Written as a raw string so the
# backslash is literal instead of relying on the invalid "\]" escape
# sequence; the value is unchanged.
PUNCTUATION = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
TOP_K_KEYWORDS = 10 # top k number of keywords to retrieve in a ranked document
# stopwords.zip is extracted into ./stop_file. This path previously pointed at
# papers.csv, which made every line of the papers CSV a "stop word".
# NOTE(review): "stopwords.txt" is the expected file name inside the archive;
# confirm with `!ls stop_file`.
STOPWORD_PATH = "/content/stop_file/stopwords.txt"
PAPERS_PATH = "/content/dataset/papers.csv"

In [None]:
# Load the papers CSV into a DataFrame and preview the first rows.
data = pd.read_csv(PAPERS_PATH)
data.head()

Unnamed: 0,source_id,year,title,abstract,full_text
0,27,1987,Bit-Serial Neural Networks,,573 \n\nBIT - SERIAL NEURAL NETWORKS \n\nAlan...
1,63,1987,Connectivity Versus Entropy,,1 \n\nCONNECTIVITY VERSUS ENTROPY \n\nYaser S...
2,60,1987,The Hopfield Model with Multi-Level Neurons,,278 \n\nTHE HOPFIELD MODEL WITH MUL TI-LEVEL N...
3,59,1987,How Neural Nets Work,,442 \n\nAlan Lapedes \nRobert Farber \n\nThe...
4,69,1987,Spatial Organization of Neural Networks: A Pro...,,740 \n\nSPATIAL ORGANIZATION OF NEURAL NEn...


In [None]:
# Drop rows whose full_text is missing: clean_text() calls .lower() on each
# value and would fail on NaN. This mutates `data` in place.
data.dropna(subset=['full_text'], inplace=True)

In [None]:
# Overwrite full_text with its cleaned form (lower-cased, punctuation removed,
# whitespace collapsed; see clean_text). The raw text is not kept.
data['full_text'] = data['full_text'].apply(clean_text)

In [None]:
# Preview the frame again to inspect the cleaned full_text column.
data.head()

Unnamed: 0,source_id,year,title,abstract,full_text
0,27,1987,Bit-Serial Neural Networks,,573 bit serial neural networks alan f murray a...
1,63,1987,Connectivity Versus Entropy,,1 connectivity versus entropy yaser s abumosta...
2,60,1987,The Hopfield Model with Multi-Level Neurons,,278 the hopfield model with mul tilevel neuron...
3,59,1987,How Neural Nets Work,,442 alan lapedes robert farber theoretical div...
4,69,1987,Spatial Organization of Neural Networks: A Pro...,,740 spatial organization of neural nenorks a p...


In [None]:
# Plain Python list of the cleaned documents, in DataFrame row order.
corpora = data['full_text'].to_list()

In [None]:
stopwords=get_stopwords_list(STOPWORD_PATH)

# Initializing TF-IDF Vectorizer with stopwords
vectorizer = TfidfVectorizer(stop_words=stopwords, smooth_idf=True, use_idf=True)

# Learning the vocabulary and IDF weights from our corpora.
# Excluding the first 10 docs, which are held out for testing.
# fit() is sufficient here: the matrix returned by fit_transform() was never
# used, so building it only cost time and memory.
vectorizer.fit(corpora[10:])

# Storing vocab
feature_names = vectorizer.get_feature_names_out()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Extract keywords for the 10 held-out documents (excluded from fitting) and
# collect one record per document.
result = [
    {
        'full_text': doc,
        'top_keywords': get_keywords(vectorizer, feature_names, doc),
    }
    for doc in corpora[0:10]
]

# Build the frame once from the list of records and display it.
final = pd.DataFrame(result)
final

Unnamed: 0,full_text,top_keywords
0,573 bit serial neural networks alan f murray a...,"[arithmetic, 1987, analogdigital, accelerator,..."
1,1 connectivity versus entropy yaser s abumosta...,"[h2k, en2, edvb, limnoo, prnl, connectivity, n..."
2,278 the hopfield model with mul tilevel neuron...,"[qnn, hopfields, hopfield, defme, cnn, defmiti..."
3,442 alan lapedes robert farber theoretical div...,"[bumps, bump, eqn, chaotic, ridged, symbolic, ..."
4,740 spatial organization of neural nenorks a p...,"[queueing, stimulations, looping, propagative,..."
5,775 a neuralnetwork solution to the concentrat...,"[subarray, amplifier, hopfield, slack, assignc..."
6,642 learning by st ate recurrence detecfion br...,"[aseace, ase, cartpole, automaton, e2it, eligi..."
7,554 stability results for neural networks a n ...,"[attraction, subsystems, lyapunov, uit, satisf..."
8,804 introduction to a system for implementing ...,"[processors, simd, paths, slots, arc, routing,..."
9,474 optimiza non with artificial neural networ...,"[dipole, settling, dynamical, parasite, extrem..."


In [None]:
def user_input_test():
    """Prompt the user for free text and return exactly what was entered."""
    return input("Enter a sentence or paragraph for keyword extraction: ")

if __name__ == "__main__":
    user_text = user_input_test()
    # Run the user's text through clean_text(), the same normalisation the
    # training corpus received, so its tokens line up with the fitted
    # vocabulary (e.g. hyphenated words are joined, not split).
    keywords = get_keywords(vectorizer, feature_names, clean_text(user_text))
    print("Extracted Keywords:", keywords)

Enter a sentence or paragraph for keyword extraction: RESIDUAL-CONCATENATE NEURAL NETWORK WITH DEEP REGULARIZATION LAYERS FOR BINARY CLASSIFICATION A PREPRINT Abhishek Gupta Research Scholar University of Mumbai Mumbai, MH, India abhishek.gupta20001@gmail.com Sruthi Nair Master of Engineering Vidyalankar Institute of Technology Mumbai, MH, India sruthi.rk.nair@gmail.com Raunak Joshi Mentor University of Mumbai Mumbai, MH, India raunakjoshi.m@gmail.com Vidya Chitre Assistant Professor - Department of IT Vidyalankar Institute of Technology Mumbai, MH, India vidya.chitre@vit.edu.in May 26, 2022 ABSTRACT Many complex Deep Learning models are used with different variations for various prognostication tasks. The higher learning parameters not necessarily ensure great accuracy. This can be solved by considering changes in very deep models with many regularization based techniques. In this paper we train a deep neural network that uses many regularization layers with residual and concatenation