-
Notifications
You must be signed in to change notification settings - Fork 2
/
7_createDatasets.py
77 lines (59 loc) · 3.48 KB
/
7_createDatasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
def execute():
path = "/mnt/data/datasets/patents/patent_matching"
positives =pd.read_csv(path+"/positives_satellite.csv",header=0,dtype={'application_claim_text': str,'patent_searchReport_paragraph':str})
negatives =pd.read_csv(path+"/negatives_satellite.csv",header=0,dtype={'application_claim_text': str,'patent_searchReport_paragraph':str})
sample_size=1.0
positives = positives[['application_claim_text', 'patent_searchReport_paragraph']]
positives["label"]="1"
positives = positives.rename(columns={"application_claim_text": "text", "patent_searchReport_paragraph": "text_b"})
negatives = negatives[['application_claim_text', 'patent_searchReport_paragraph']]
negatives["label"]="0"
negatives = negatives.rename(columns={"application_claim_text": "text", "patent_searchReport_paragraph": "text_b"})
allSamples = positives.append(negatives).dropna()
# Remove "</p" at the end of paragraphs
allSamples['text_b'] = allSamples['text_b'].str.replace('<\/p', '', regex=True)
allSamples['text'] = allSamples['text'].str.replace('<\/p', '', regex=True)
# Remove everything written in "<...>"
allSamples['text_b'] = allSamples['text_b'].str.replace('\<.+?\>', '', regex=True)
allSamples['text'] = allSamples['text'].str.replace('\<.+?\>', '', regex=True)
# Remove --->
allSamples['text_b'] = allSamples['text_b'].str.replace('--\>', '', regex=True)
allSamples['text'] = allSamples['text'].str.replace('--\>', '', regex=True)
# Remove remaining "
allSamples['text_b'] = allSamples['text_b'].str.replace('\"', '', regex=True)
allSamples['text'] = allSamples['text'].str.replace('\"', '', regex=True)
#For the rest: Keep only letter, numbers, whitespace, dot
allSamples['text_b'] = allSamples['text_b'].str.replace('[^A-Za-z0-9\s.]+', '', regex=True)
allSamples['text'] = allSamples['text'].str.replace('[^A-Za-z0-9\s.]+', '', regex=True)
#Remove leading whitespaces
allSamples['text_b'].replace("^\s", '', regex=True, inplace=True)
allSamples['text'].replace("^\s", '', regex=True, inplace=True)
#Remove space characters
allSamples['text_b'].replace('\B\s+|\s+\B', '', regex = True, inplace=True)
allSamples['text'].replace('\B\s+|\s+\B', '', regex=True, inplace=True)
#Replace empty strings with Nan, so dropna removes it
allSamples['text_b'].replace("^[\s]*$", np.nan, regex=True, inplace=True)
allSamples['text'].replace("^[\s]*$", np.nan, regex=True, inplace=True)
#train,test,dev=train_test_dev_split(allSamples,0.4,0.3,0.3)
allSamples = allSamples.sort_values(by=['text']).dropna()
train, test_dev = train_test_split(allSamples,test_size=0.2, shuffle=False)
test,dev = train_test_split(test_dev,test_size=0.5,shuffle=False)
#Shuffle and sample from data
train = train.sample(frac=sample_size)
test = test.sample(frac=sample_size)
dev = dev.sample(frac=sample_size)
print("Check for intersection values:")
print("Train in Test")
print((train['text'].isin(test['text'])).value_counts())
print("Train in Dev")
print((train['text'].isin(dev['text'])).value_counts())
print("Test in Dev")
print((test['text'].isin(dev['text'])).value_counts())
train.to_csv(path+"/train.tsv", sep="\t", index=False)
test.to_csv(path+"/test.tsv", sep="\t", index=False)
dev.to_csv(path+"/dev.tsv", sep="\t", index=False)
if __name__ == '__main__':
execute()