In [11]:
#File: dataPreprocessing.ipynb
#Purpose: data preprocessing for steve mapping dataset
#Author: Quan Gan
import pandas as pd
import numpy as np

In [12]:
# Load the raw Steve-mapping export. skiprows=1 discards the first line of the
# file, so the column names are read from the second line.
df = pd.read_csv(
    '../data/Raw_data/steve_mapping_1000.csv',
    skiprows=1,
)

In [13]:
# Keep only the descriptive columns that are later concatenated into the
# record text, plus the 'Steve label' column that holds the target labels.
info = df.loc[
    :,
    [
        'collectionCode',
        'habitat',
        'higherGeography',
        'locality',
        'higherClassification',
        'Steve label',
    ],
]

In [14]:
# Clean the records' labels:
#   1. turn missing values into empty strings,
#   2. drop records whose 'Steve label' is empty,
#   3. replace newlines with spaces,
#   4. remove the comma after "Lake" (presumably so that the later ', ' split
#      does not cut a label containing "Lake, ..." in two -- confirm).
# Steps 1, 3 and 4 apply to every column of the frame, not just the label.
info = (
    info
    .replace(np.nan, "", regex=True)
    .loc[lambda frame: frame['Steve label'] != ""]
    .replace('\n', " ", regex=True)
    .replace('Lake,', "Lake", regex=True)
)

In [15]:
# Split each record's comma-separated 'Steve label' into one column per label
# and add the "__label__" prefix to the first two labels.
labels = info['Steve label'].str.split(r', ', expand=True)
labels = labels.rename(columns={0: "label1", 1: "label2"})
# The split only produces as many columns as the longest label list. If no
# record has a second label (or there are no records at all), create the
# missing column(s) so the code below and later cells can rely on them.
for label_column in ("label1", "label2"):
    if label_column not in labels.columns:
        labels[label_column] = ""
# NOTE(review): a third or later label stays in an unnamed, unprefixed column
# (2, 3, ...); only label1 and label2 are renamed and prefixed here.
labels['label1'] = "__label__" + labels['label1']
labels = labels.fillna("")
# Prefix only the rows that actually have a second label, so empty cells stay
# empty. A single .loc[rows, column] is used for both the read and the write.
has_second_label = labels['label2'] != ""
labels.loc[has_second_label, 'label2'] = "__label__" + labels.loc[has_second_label, 'label2']

In [16]:
# Replace every space inside the label columns with an underscore
# (presumably so each label stays a single whitespace-free token -- confirm).
result = labels.replace(
    to_replace=" ",
    value="_",
    regex=True,
)

In [17]:
# Clean the records' text: join the descriptive columns into one
# space-separated string, strip punctuation, then lowercase.
info['text'] = (
    info['collectionCode']
    + " " + info['habitat']
    + " " + info['higherGeography']
    + " " + info['locality']
    + " " + info['higherClassification']
)
# Raw string: \w and \s are regex classes, not Python string escapes. Without
# the r-prefix Python emits an invalid-escape warning (SyntaxWarning on 3.12+).
# The compiled pattern is unchanged: remove everything that is neither a word
# character nor whitespace.
info['text'] = info['text'].str.replace(r'[^\w\s]', '', regex=True)
info['text'] = info['text'].str.lower()

In [18]:
# Build one "<labels> <text>" line per record and split the lines 70/30 into
# a training set and a test set.
result = result.assign(
    label=lambda frame: frame['label1'] + " " + frame['label2'],
    text=info['text'],  # aligned on the shared row index
)
ds = result['label'] + " " + result['text']
# Randomly pick 70% of the records (fixed seed) as the training set ...
train = ds.sample(frac=0.7, random_state=99)
# ... and keep every record that was not picked as the test set.
test = ds.drop(index=train.index)

In [19]:
# Write the full dataset, the training split and the test split, one record
# per line, without header or index.
# NOTE(review): to_csv applies CSV quoting, so a line containing a comma or a
# double quote would be wrapped in quotes -- confirm the cleaned data cannot
# contain either.
for series, path in (
    (ds, r'../data/cleanedData.txt'),
    (train, r'../data/steve_696.train'),
    (test, r'../data/steve_299.valid'),
):
    series.to_csv(path, header=False, index=False)

In [105]:
# Down-sample to 100 records with a fixed seed.
# NOTE(review): this rebinds `result`, so every cell below sees only the
# sample, and re-running this cell re-samples the already reduced frame.
result = result.sample(
    n=100,
    random_state=2,
)

In [None]:
# Save the current `result` frame (including its index and header row) to the
# working directory.
result.to_csv(path_or_buf='DwC_cleaned.csv')

In [107]:
# Split `result` 80/20 with a fixed seed: small_80 is the random 80%,
# small_20 is every row whose index label was not drawn into small_80.
small_80 = result.sample(frac=0.8, random_state=2)
small_20 = result.drop(index=small_80.index)

In [108]:
# Write each split as "<labels> <text>" lines, without header or index.
for subset, path in (
    (small_80, 'small_80.txt'),
    (small_20, 'small_20.txt'),
):
    lines = subset['label'] + ' ' + subset['text']
    lines.to_csv(path, header=False, index=False)

In [109]:
#Method: trainModel
#Purpose: train a supervised fastText model
#Parameters: trainSet -> the training-set file path
#            input_word_vector -> the pretrained word-vector file path
#            LR -> the learning rate
#            Epoch -> the number of training epochs
import fasttext
def trainModel(trainSet, input_word_vector, LR, Epoch, dim=300, loss='ova'):
    """Train a supervised fastText model initialised from pretrained vectors.

    Parameters
    ----------
    trainSet : str
        Path of the training file, passed to fastText as ``input``.
    input_word_vector : str
        Path of the pretrained word-vector file, passed as
        ``pretrainedVectors``.
    LR : float
        Learning rate, passed as ``lr``.
    Epoch : int
        Number of training epochs, passed as ``epoch``.
    dim : int, optional
        Vector dimension, passed as ``dim``. Defaults to 300, the value that
        was previously hard-coded. NOTE(review): presumably this has to match
        the dimension of the pretrained vectors -- confirm against the
        fastText documentation before changing it.
    loss : str, optional
        Loss name, passed as ``loss``. Defaults to ``'ova'``, the value that
        was previously hard-coded.

    Returns
    -------
    The model object returned by ``fasttext.train_supervised``.
    """
    return fasttext.train_supervised(
        input=trainSet,
        dim=dim,
        lr=LR,
        epoch=Epoch,
        loss=loss,
        pretrainedVectors=input_word_vector,
    )

In [110]:
# Train on the 80% split, starting from the pretrained crawl vectors, with a
# learning rate of 0.5 for 20 epochs.
model = trainModel(
    trainSet='small_80.txt',
    input_word_vector='../data/crawl-300d-2M-subword.vec',
    LR=0.5,
    Epoch=20,
)

In [111]:
# Predict labels for the held-out 20% split and collect them in `output`.
pre = small_20['text'].tolist()
# Run inference once and reuse the result. The previous version called
# model.predict(pre) twice on the same input, once per column.
predictions = model.predict(pre)
output = pd.DataFrame(columns=['test-label', 'Probability'])
output['test-label'] = predictions[0]   # first element of the predict result
output['Probability'] = predictions[1]  # second element of the predict result

# Evaluate on the held-out file with k=1; the value is shown as the cell output.
model.test('small_20.txt', k=1)

(19, 0.9473684210526315, 0.9)

In [125]:

# Per-label metrics on the held-out file with k=1.
# NOTE(review): the recorded output of this cell shows recall = nan and
# f1score = 2.0 for several labels; an F1 score cannot exceed 1, so treat
# these numbers with suspicion and verify them independently.
testlabel = model.test_label(
    'small_20.txt',
    k=1,
)
testlabel


{'__label__Marine_water_body_bottom': {'precision': 0.0,
  'recall': nan,
  'f1score': 0.0},
 '__label__Subaerial_surface_environment': {'precision': 1.0,
  'recall': nan,
  'f1score': 2.0},
 '__label__Lake_river_or_stream_bottom': {'precision': 1.0,
  'recall': nan,
  'f1score': 2.0},
 '__label__Marine_water_body': {'precision': 1.0,
  'recall': nan,
  'f1score': 2.0},
 '__label__Active_human_occupation_site': {'precision': nan,
  'recall': nan,
  'f1score': nan}}

In [113]:
# Put the true labels next to the predictions and save them. The index of the
# true labels is reset so that both frames line up row by row on 0..n-1.
true_labels = small_20[['label']].reset_index(drop=True)
pd.concat([true_labels, output], axis=1).to_csv('result.csv')