In [1]:
import re
import spacy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
df = pd.read_csv('/kaggle/input/nips-papers/papers.csv')

In [4]:
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [5]:
df['paper_text'][0]

'767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABASE\nAND ITS APPLICATIONS\nHisashi Suzuki and Suguru Arimoto\nOsaka University, Toyonaka, Osaka 560, Japan\nABSTRACT\nAn efficient method of self-organizing associative databases is proposed together with\napplications to robot eyesight systems. The proposed databases can associate any input\nwith some output. In the first half part of discussion, an algorithm of self-organization is\nproposed. From an aspect of hardware, it produces a new style of neural network. In the\nlatter half part, an applicability to handwritten letter recognition and that to an autonomous\nmobile robot system are demonstrated.\n\nINTRODUCTION\nLet a mapping f : X -+ Y be given. Here, X is a finite or infinite set, and Y is another\nfinite or infinite set. A learning machine observes any set of pairs (x, y) sampled randomly\nfrom X x Y. (X x Y means the Cartesian product of X and Y.) And, it computes some\nestimate j : X -+ Y of f to make small, the estimation erro

In [6]:
df.shape

(7241, 7)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7241 entries, 0 to 7240
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          7241 non-null   int64 
 1   year        7241 non-null   int64 
 2   title       7241 non-null   object
 3   event_type  2422 non-null   object
 4   pdf_name    7241 non-null   object
 5   abstract    7241 non-null   object
 6   paper_text  7241 non-null   object
dtypes: int64(2), object(5)
memory usage: 396.1+ KB


In [8]:
df.duplicated().any()

False

In [9]:
nlp = spacy.load('en_core_web_sm')

custom_stop_words = {
    "fig", "figure", "image", "sample", "using", "result", "large", "also", "one",
    "two", "three", "four", "five", "six", "seven", "eight", "nine"
}

for word in custom_stop_words:
    nlp.vocab[word].is_stop = True

In [10]:
def preprocess_tokenize(text):

    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^a-zA-Z]", " ", text)
    
    text = nlp(text)
    text = [
        token.text for token in text
        if not token.is_stop            # Remove stop words
        and not token.is_punct          # Remove punctuation
        and not token.like_url          # Remove URLs
        and not token.like_email        # Remove email addresses
        and not token.is_space          # Remove newlines and extra spaces
        and len(token.text) >= 3
    ]

    return " ".join(text)

In [11]:
example = "Contact me loving at example@example.com. Visit https://example.com. NLP and AI are amazing! #AI #NLP @user123. Price is 12.34 or 1.23e10."

In [12]:
preprocess_tokenize(example)

'contact loving example example com visit https example com nlp amazing nlp user price'

In [13]:
preprocess_tokenize(df['paper_text'][38])

'vlsi model primate visual smooth pursuit ralph etienne cummings jan van der spiegel department electrical engineering southern illinois university carbondale moore school electrical engineering university pennsylvania philadelphia paul mueller corticon incorporated market str philadelphia abstract dimensional model primate smooth pursuit mechanism implemented cmos vlsi model consolidates robinson negative feedback model wyatt pola positive feedback scheme produce smooth pursuit system zero velocity target retina furthermore system uses current eye motion predictor future target motion analysis stability biological correspondence system discussed implementation focal plane local correlation based visual motion detection technique velocity measurements ranging orders magnitude variation provides input smooth pursuit system system performed successful velocity tracking high contrast scenes circuit design performance complete smooth pursuit system presented introduction smooth pursuit mec

In [14]:
doc = df['paper_text'].apply(lambda x:preprocess_tokenize(x))

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df = 0.95, max_features = 7242, ngram_range = (1,3))
word_count_vectors = cv.fit_transform(doc)

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True)
tfidf_transformer = tfidf_transformer.fit(word_count_vectors)

In [17]:
feature_names = cv.get_feature_names_out()

In [26]:
def get_keywords(idx,doc, topN = 10):
    
    docs_words_count = tfidf_transformer.transform(cv.transform([doc[idx]]))
    docs_words_count = docs_words_count.tocoo()
    
    tuples = zip(docs_words_count.col, docs_words_count.data)
    sorted_items = sorted(tuples, key = lambda x: (x[1], x[0]), reverse = True)
    
    sorted_items = sorted_items[:topN]
    
    score_vals = []
    features_vals = []
    
    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        features_vals.append(feature_names[idx])
    
    results = {}
    
    for idx in range(len(features_vals)):
        results[features_vals[idx]] = score_vals[idx]
        
    return results
        



def print_keywords(idx, keywords, df):
    print("\n==========title=========")
    print(df['title'][idx])
    print('\n==========Abstract======')
    print(df['abstract'][idx])
    print('\n==========keywords======')
    for k in keywords:
        print(k, keywords[k])
    
idx = 3338
keywords = get_keywords(idx, doc)
print_keywords(idx, keywords, df)


Latent Variable Models for Predicting File Dependencies in Large-Scale Software Development

When software developers modify one or more files in a large code base, they must also identify and update other related files. Many file dependencies can be detected by mining the development history of the code base: in essence, groups of related files are revealed by the logs of previous workflows. From data of this form, we show how to detect dependent files by solving a problem in binary matrix completion. We explore different latent variable models (LVMs) for this problem, including Bernoulli mixture models, exponential family PCA, restricted Boltzmann machines, and fully Bayesian approaches. We evaluate these models on the development histories of three large, open-source software systems: Mozilla Firefox, Eclipse Subversive, and Gimp. In all of these applications, we find that LVMs improve the performance of related file prediction over current leading methods.

files 0.778
file 0.19
s