# 1. Importing libraries and dataset 

In [1]:
import pandas as pd 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords
import warnings

In [2]:
# Load the papers table from the local "nips-papers" directory into a DataFrame.
data_df = pd.read_csv(filepath_or_buffer="nips-papers/papers.csv")

In [3]:
# Discard the two columns that the rest of the notebook never uses.
data_df.drop(columns=['pdf_name', 'event_type'], inplace=True)

# 2. Processing the text 

In [4]:
def text_processing(df,col):
    """Normalise a text column to lower-case, letters-only, single-spaced text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the raw text.
    col : str
        Name of the column in ``df`` to clean.

    Returns
    -------
    pandas.Series
        A new Series with the cleaned text; ``df`` itself is not modified.
        Missing values are passed through as missing instead of raising.
    """
    temp_df = df[col]
    # 1. Remove punctuation: replace every character that is not an ASCII
    #    letter with a space.  The pattern must be '[^a-zA-Z]'; the earlier
    #    '[[^a-zA-Z]]' parsed as "one of [, ^ or a letter, followed by ']'"
    #    and therefore ate the letter in front of a closing bracket while
    #    leaving real punctuation in place.
    temp_df = temp_df.str.replace(r'[^a-zA-Z]', ' ', regex=True)
    # 2. Convert to lower case.
    temp_df = temp_df.str.lower()
    # 3. Collapse every run of digits / non-word characters (after step 1
    #    these are runs of spaces) into a single space.
    temp_df = temp_df.str.replace(r'(\d|\W)+', ' ', regex=True)
    return temp_df

In [5]:
# Overwrite the raw paper text with its cleaned version.
data_df['paper_text'] = text_processing(df=data_df, col='paper_text')

In [6]:
def tokenize_lemmatize(df,col):
    """Tokenize a text column, drop English stop words and lemmatize the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the text to process.
    col : str
        Name of the column in ``df`` containing one string per row.

    Returns
    -------
    tuple of three pandas.Series, all indexed like ``df[col]``
        * the lemmatized tokens of each row joined back with single spaces,
        * the token count of each row before stop-word removal,
        * the token count of each row after stop-word removal and
          lemmatization.
    """
    # Load the stop-word list once and keep it as a set: calling
    # stopwords.words('english') inside the per-token filter reloads the
    # list and scans it linearly for every single token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # 1. Word tokenization.
    tokens = df[col].apply(word_tokenize)
    word_no_pre = tokens.apply(len)

    # 2. Stop-word removal followed by lemmatization of the surviving tokens.
    lemmas = tokens.apply(
        lambda words: [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    )
    word_no_post = lemmas.apply(len)

    temp_df = lemmas.apply(" ".join)
    return temp_df,word_no_pre,word_no_post

In [7]:
# Store the lemmatized text together with the token counts measured
# before and after stop-word removal.
(
    data_df['text'],
    data_df['word_count_pre'],
    data_df['word_count_post'],
) = tokenize_lemmatize(df=data_df, col='paper_text')

In [14]:
# Preview the first five rows of the processed frame.
data_df.head(n=5)

Unnamed: 0,id,year,title,abstract,paper_text,text,word_count_pre,word_count_post
0,1,1987,Self-Organization of Associative Database and ...,Abstract Missing,self organization of associative database and...,self organization associative database applica...,3539,2057
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,Abstract Missing,a mean field theory of layer iv of visual cor...,mean field theory layer iv visual cortex appli...,2504,1417
2,100,1988,Storing Covariance by the Associative Long-Ter...,Abstract Missing,storing covariance by the associative long te...,storing covariance associative long term poten...,3003,1998
3,1000,1994,Bayesian Query Construction for Neural Network...,Abstract Missing,bayesian query construction for neural network...,bayesian query construction neural network mod...,3245,2070
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",Abstract Missing,neural network ensembles cross validation and ...,neural network ensemble cross validation activ...,3242,1820


# 3. Saving the processed data  

In [16]:
# Remove the 'paper_text' and 'abstract' columns before the frame is saved.
data_df.drop(columns=['paper_text', 'abstract'], inplace=True)

In [19]:
# Write the processed frame to disk without the row index.
data_df.to_csv(path_or_buf='process_data.csv', index=False)