#!/usr/bin/python
# -*- coding: utf-8 -*-
# POSTagRDR.py -- forked from AshokR/TamilNLP
import os
import re

import nltk.data
from RDRPOSTagger import *
# Single-character substitutions applied to every sentence before tagging.
# The tagger is unable to handle quotes, '!', ';' or ':', so quotes are
# dropped and the other marks are mapped to '.' or ','.
_CHAR_REPLACEMENTS = (
    ('"', ''),
    ('\'', ''),
    ('!', '.'),
    (';', ','),
    (':', ','),
)

# A dot that is the very last character of the sentence.
_FINAL_DOT = re.compile(r'\.\Z')


def _normalize_sentence(sent):
    """Return *sent* with its punctuation rewritten for the POS tagger.

    Tagging is done based on whitespace, so the punctuation that is kept
    ('.', '?', ',') is detached from the preceding word with a space.
    The steps run in the same order as the original script: the '!' -> '.'
    mapping must happen before the sentence-final dot is detached.
    """
    for old, new in _CHAR_REPLACEMENTS:
        sent = sent.replace(old, new)
    # Add a space in front of the sentence ending dot
    sent = _FINAL_DOT.sub(' .', sent)
    # Add a space in front of every ?
    sent = sent.replace('?', ' ?')
    # Add a space in front of a paragraph ending dot; the newline that
    # followed the dot is dropped by this replacement.
    sent = sent.replace('.\n', ' .')
    # Add a space in front of every comma
    sent = sent.replace(',', ' ,')
    return sent


rdr = RDRPOSTagger()

# Load the POS tagging model for Tamil
trainedmodel = os.path.join(os.path.dirname(__file__), 'Resources/TrainedModel.RDR')
rdr.constructSCRDRtreeFromRDRfile(trainedmodel)

# Load the lexicon for Tamil
generatedlexicon = os.path.join(os.path.dirname(__file__), 'Resources/GeneratedLexicon.DICT')
DICT = readDictionary(generatedlexicon)

# Load the file that contains the text to be tagged
tobetagged = nltk.data.load('/home/ashok/TamilCorpora/Fiction/WikiSource.txt')

# Segment the text into sentences using the pickled sentence tokenizer
sentencesegment = os.path.join(os.path.dirname(__file__), 'Resources/SentenceSegment_Python2.pickle')
tokenizer = nltk.data.load(sentencesegment)
seg = tokenizer.tokenize(tobetagged)

# Open the target file for writing the tagged text. The with-block closes
# (and therefore flushes) the file even if tagging a sentence raises.
with open('/home/ashok/Machlearn/Sentseg/test_sentences.txt', 'wt') as targetfile:
    # Read sentences one-by-one
    for sent in seg:
        sent = _normalize_sentence(sent)
        # Tag the sentence and write it to the file
        tagged = rdr.tagRawSentence(DICT, sent.encode('utf8'))
        # Parenthesised single-argument form prints the same text under
        # Python 2 and is also valid syntax under Python 3.
        print(tagged)
        targetfile.write(tagged + '\n')