## Cleaning Dataset

In [1]:
import pandas as pd
import numpy as np
import re
import string

import pickle

In [None]:
# Optional one-time NLTK setup, left commented out (nltk is not imported in this notebook).
# nltk.data.path.append('/Users/jhonsen/Documents/DS/nltk_data/')
# nltk.download('wordnet', download_dir='/Users/jhonsen/Documents/DS/nltk_data/')

### Importing Pickled Dataframe

In [2]:
# Load the raw dataframe that was pickled to ../data/dfraw.pkl.
# NOTE: pickle.load can execute arbitrary code, so only use it on trusted files.
with open('../data/dfraw.pkl', mode='rb') as raw_file:
    df = pickle.load(raw_file)

In [3]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,title,summary
0,1001 Gaussia,Gaussia (minor planet designation: 1001 Gaussi...
1,List of minor planets: 13001–14000,The following is a partial list of minor plane...
2,1332 Marconia,"1332 Marconia, provisional designation 1934 AA..."
3,1548 capture of Aden,== Background ==\n\nThe Capture of Aden of 154...
4,1691 Oort,"1691 Oort, provisional designation 1956 RB, is..."


In [4]:
# Report (rows, columns) before any cleaning is applied
print('original dimension: ', df.shape, sep=' ')

original dimension:  (15490, 2)


#### Pre-processing entries in `summary` and `title` columns in dataframe
- [x] Remove articles containing "Lists_of_" in the `title`
- [x] Remove first 24 entries, unrelated to scientists
- [x] Drop any duplicates
- [ ] Remove additional textual inconsistencies

In [5]:
# Remove list-style articles ("List of ..." / "Lists_of_..."), which are not about a person.
# Matching the whole "List(s) of" phrase (space- or underscore-separated) rather than the bare
# substring 'List' keeps scientists whose names merely contain it (e.g. "Joseph Lister").
df = df[~df['title'].str.contains(r'Lists?[ _]of[ _]', regex=True)]

In [6]:
# Discard the leading entries that are unrelated to scientists.
# Only rows whose index label is greater than 24 are kept, i.e. labels 0 through 24 are dropped.
# NOTE(review): that is 25 labels on a 0-based index, while the checklist says "first 24" -
# confirm whether label 24 is meant to be removed.
df = df.loc[df.index > 24]

In [7]:
# Drop fully duplicated rows, printing the shape before and after for comparison.
# The result is rebound instead of using inplace=True, so a frame that came out of
# boolean filtering is never mutated in place.
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

(15044, 2)
(15042, 2)


In [8]:
# Replace every newline in `summary` with a single space.
# The vectorised .str accessor passes missing values through untouched (the per-row lambda
# would raise AttributeError on them), and assign() builds a new frame rather than writing a
# column into a filtered one. regex=False makes the literal-string replacement explicit.
df = df.assign(summary=df['summary'].str.replace('\n', ' ', regex=False))

In [9]:
# Temporary measure: discard every observation whose `summary` contains '==',
# the marker used by headings such as '== Background =='
df = df.loc[~df['summary'].str.contains('==')]

In [10]:
# List the articles whose `summary` is an empty string
df.loc[df['summary'] == '']

Unnamed: 0,title,summary
13119,"Seok-Hyun_\Andy\""_Yun""",
13873,"Temperance_\Bones\""_Brennan""",
14137,"Thomas_J._\Long_Tom\""_Roberts""",
14944,"William_\Bill\""_Ralph_Merton""",


In [11]:
# Remove every row whose summary is the empty string.
# Filtering on the condition itself, instead of dropping hard-coded index labels, removes the
# rows found by the empty-summary check, keeps working if the upstream data changes, and does
# not raise KeyError when the cell is run a second time.
df = df[df['summary'] != '']

In [None]:
# TODO: remove observations that are not about scientists, e.g. titles such as Aristotle's works or Albert Einstein's Theory of Relativity

#

In [None]:
# TODO: remove observations whose title contains 'Theory_', since these are not persons (scientists)

#

In [12]:
# Report (rows, columns) once all cleaning steps have run
print('dimension after processing: ', df.shape, sep=' ')

dimension after processing:  (13625, 2)


In [14]:
# Keep a CSV copy of the cleaned frame as a fallback; the index is not written out
df.to_csv(path_or_buf='../data/df_processed.csv', index=False)

In [15]:
# Serialize the cleaned dataframe to ../data/fclean.pkl
with open('../data/fclean.pkl', mode='wb') as clean_file:
    pickle.dump(df, clean_file)

Next step:
- Topic modeling of these articles in `Step3_Modeling.ipynb`

---