# 1. Importing libraries and dataset 

In [24]:
import pandas as pd 
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords
import warnings

In [3]:
data_df=pd.read_csv("nips-papers/papers.csv")

In [4]:
data_df.drop(['pdf_name','event_type'],axis=1,inplace=True)

In [5]:
author_df = pd.read_csv('nips-papers/authors.csv')

In [6]:
data_df['title'] = data_df['title'].map(lambda x: x.lower())

In [7]:
data_df.head()

Unnamed: 0,id,year,title,abstract,paper_text
0,1,1987,self-organization of associative database and ...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,a mean field theory of layer iv of visual cort...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,storing covariance by the associative long-ter...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,bayesian query construction for neural network...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"neural network ensembles, cross validation, an...",Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [8]:
author_df.head()

Unnamed: 0,id,name
0,1,Hisashi Suzuki
1,10,David Brady
2,100,Santosh S. Venkatesh
3,1000,Charles Fefferman
4,10000,Artur Speiser


# 2. Processing the text 

In [9]:
def text_processing(df,col):
    temp_df = df[col]
    # 1.Remove punctuation
    temp_df = temp_df.apply(lambda x: re.sub('[[^a-zA-Z]]',' ',x))
    # 2. converting lower case
    temp_df = temp_df.apply(lambda x: x.lower())
    # 3. removing special character and digit
    temp_df = temp_df.apply(lambda x: re.sub("(\\d|\\W)+"," ",x))
    # 4. removing the single letter word
    temp_df = temp_df.apply(lambda x: re.sub(r"\b[a-zA-Z]\b", "",x))
    return temp_df

In [10]:
data_df['paper_text'] = text_processing(data_df,'paper_text')

In [11]:
def tokenize_lemmatize(df,col):
    temp_df =df[col]
    #1. Word Tokenization:
    temp_df = temp_df.apply(lambda x : word_tokenize(x))
    word_no_pre = temp_df.apply(lambda x: len(x))
    temp_df = temp_df.apply(lambda x : [i for i in x if not i in stopwords.words('english')])
    #2. Word Lemmatization:
    lemmatize =WordNetLemmatizer()
    temp_df = temp_df.apply(lambda x: [lemmatize.lemmatize(i) for i in x])
    word_no_post =temp_df.apply(lambda x: len(x))
    temp_df = temp_df.apply(lambda x: " ".join(x))
    return temp_df,word_no_pre,word_no_post

In [12]:
data_df['text'],data_df['word_count_pre'],data_df['word_count_post']=tokenize_lemmatize(data_df,'paper_text')

In [13]:
data_df.head()

Unnamed: 0,id,year,title,abstract,paper_text,text,word_count_pre,word_count_post
0,1,1987,self-organization of associative database and ...,Abstract Missing,self organization of associative database and...,self organization associative database applica...,3068,1830
1,10,1987,a mean field theory of layer iv of visual cort...,Abstract Missing,mean field theory of layer iv of visual cort...,mean field theory layer iv visual cortex appli...,2268,1320
2,100,1988,storing covariance by the associative long-ter...,Abstract Missing,storing covariance by the associative long te...,storing covariance associative long term poten...,2703,1834
3,1000,1994,bayesian query construction for neural network...,Abstract Missing,bayesian query construction for neural network...,bayesian query construction neural network mod...,2560,1672
4,1001,1994,"neural network ensembles, cross validation, an...",Abstract Missing,neural network ensembles cross validation and ...,neural network ensemble cross validation activ...,2983,1663


# 3. Saving the processed data  

In [14]:
def name_extraction(data,author):
    name =[]
    for data_id in data['id']:
        try:
            name.append(list(author[(author['id']==data_id)]['name'])[0])
        except:
            name.append(str('Author Missing'))
            continue
    return name

In [15]:
author_name = name_extraction(data_df,author_df)

In [16]:
data_df['author_name'] = pd.DataFrame(author_name)

In [17]:
data_df.drop(['paper_text','abstract'],axis=1,inplace=True)

In [18]:
data_df.head(10)

Unnamed: 0,id,year,title,text,word_count_pre,word_count_post,author_name
0,1,1987,self-organization of associative database and ...,self organization associative database applica...,3068,1830,Hisashi Suzuki
1,10,1987,a mean field theory of layer iv of visual cort...,mean field theory layer iv visual cortex appli...,2268,1320,David Brady
2,100,1988,storing covariance by the associative long-ter...,storing covariance associative long term poten...,2703,1834,Santosh S. Venkatesh
3,1000,1994,bayesian query construction for neural network...,bayesian query construction neural network mod...,2560,1672,Charles Fefferman
4,1001,1994,"neural network ensembles, cross validation, an...",neural network ensemble cross validation activ...,2983,1663,Scott Markel
5,1002,1994,using a neural net to instantiate a deformable...,sing neural net instantiate deformable model c...,3304,1979,Gregory J. Wolff
6,1003,1994,plasticity-mediated competitive learning,plasticity mediated competitive learning terre...,1558,1014,Timothy W. Cacciatore
7,1004,1994,iceg morphology classification using an analog...,iceg morphology classification using analogue ...,2531,1635,Paul Sajda
8,1005,1994,real-time control of a tokamak plasma using ne...,real time control tokamak plasma using neural ...,2472,1560,Leif H. Finkel
9,1006,1994,pulsestream synapses with non-volatile analogu...,real time control tokamak plasma using neural ...,4334,2799,Arun K. Jagota


# Detecting missing values 

In [26]:
data_df.fillna(np.nan,inplace=True)

In [27]:
data_df.isnull().sum()

id                 0
year               0
title              0
text               0
word_count_pre     0
word_count_post    0
author_name        0
dtype: int64

In [28]:
data_df.to_csv('process_data.csv',index=False)

# Summary:
* Firstly, importing the required libraries
* Importing the dataset 
* Removing the unrequired columns and removing rows which has missing values
* Removing punctuations,stopwords,whitespace and single letter word while processing the text
* Tokenizing and lemmatizing the tword tokens 
* Count the words,extracting the author name and embedding the following information in the dataframe.
* creating the CSV file to store the processed text