In [1]:
import re
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
trainfile = open("training_docs.txt",encoding='utf-8')
traindata = trainfile.readlines()
trainfile.close()

In [3]:
testfile = open("testing_docs.txt",encoding='utf-8')
testdata = testfile.readlines()
testfile.close()

In [4]:
labelfile = open("training_labels_final.txt",encoding='utf-8')
trainlabel = labelfile.readlines()
labelfile.close()

In [5]:
test_id = []
for i in range(0,len(testdata),4):
    test_id.append(testdata[i])
    
test_text = []
for i in range(1,len(testdata),4):
    test_text.append(testdata[i])

In [6]:
train_id = []
for i in range(0,len(traindata),4):
    train_id.append(traindata[i])
    
train_text = []
for i in range(1,len(traindata),4):
    train_text.append(traindata[i])

In [7]:
train_df = pd.DataFrame(
    {   'ID': train_id,
        'text': train_text
    })

train_df.head()

Unnamed: 0,ID,text
0,ID tr_doc_1\n,TEXT Two German tourists have been found safe ...
1,ID tr_doc_2\n,TEXT ACT police have seized a rare drug during...
2,ID tr_doc_3\n,TEXT A 50-year-old Brisbane man has been charg...
3,ID tr_doc_4\n,TEXT In-depth discussions are continuing to re...
4,ID tr_doc_5\n,TEXT Homicide detectives are still questioning...


In [8]:
testing_df = pd.DataFrame(
    {   'ID': test_id,
        'text': test_text
    })

testing_df.head()

Unnamed: 0,ID,text
0,ID te_doc_1\n,TEXT The Police Royal Commission in Western Au...
1,ID te_doc_2\n,TEXT The Northern Territory Government says it...
2,ID te_doc_3\n,"TEXT A group of hepatitis C sufferers, who wer..."
3,ID te_doc_4\n,TEXT The crew of the North Korean vessel Pong ...
4,ID te_doc_5\n,TEXT The New South Wales Supreme Court has bee...


In [9]:
train_df['ID'] = train_df['ID'].map(lambda x: x.lstrip('ID').rstrip('\n'))
train_df['text'] = train_df['text'].map(lambda x: x.lstrip('TEXT').rstrip('\n'))

testing_df['ID'] = testing_df['ID'].map(lambda x: x.lstrip('ID').rstrip('\n'))
testing_df['text'] = testing_df['text'].map(lambda x: x.lstrip('TEXT').rstrip('\n'))

train_doc_id = train_df['ID']
test_doc_id = testing_df['ID']

In [10]:
trainlabel = [re.sub("tr_doc_\d*","",elem) for elem in trainlabel]
trainlabel = [t.rstrip().lstrip() for t in trainlabel]

In [11]:
frames = [train_df, testing_df]
combined = pd.concat(frames)

In [12]:
lemmatizer = nltk.stem.WordNetLemmatizer()

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

stop = stopwords.words('english')

combined['text']=combined['text'].str.lower()

tokenise = RegexpTokenizer(r'\w+')

In [13]:
combined['text']=combined['text'].apply(lambda x: tokenise.tokenize(x))

In [14]:
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in text]

combined['text'] = combined.text.apply(lemmatize_text)

In [15]:
def removeNonAplhabet(text):
    return [w for w in text if w.isalpha()]

In [16]:
combined.text = combined.text.apply(removeNonAplhabet)

In [17]:
combined.shape

(133055, 2)

In [18]:
combined['text']=combined['text'].apply(lambda x: [item for item in x if item not in stop])

In [19]:
combined["text_ngram3"] = ""
combined["text_ngram3"] = combined["text"].apply(lambda x: list(ngrams(x,3)))

In [20]:
trigram = {}
for i in combined["text_ngram3"]:
    for j in i:
        if j in trigram:
            trigram[j] += 1
        else:
            trigram[j] = 1

In [21]:
selected_trigrams = []
for k,v in trigram.items():
    if v > 1064:
        selected_trigrams.append(k)

In [22]:
tokens_trigrams = MWETokenizer(selected_trigrams)

combined["text"] = combined["text"].apply(tokens_trigrams.tokenize)

In [23]:
combined["text_ngram2"] = ""
combined["text_ngram2"] = combined["text"].apply(lambda x: list(ngrams(x,2)))

In [24]:
bigram = {}
for i in combined["text_ngram2"]:
    for j in i:
        if j in bigram:
            bigram[j] += 1
        else:
            bigram[j] = 1

In [25]:
selected_bigram = []
for k,v in bigram.items():
    if v > 2112 and v < 104313:
        selected_bigram.append(k)
        

In [26]:
tokens_bigram = MWETokenizer(selected_bigram)

combined["text"] = combined["text"].apply(tokens_bigram.tokenize)

In [27]:
combined["text_ngram1"] = ""
combined["text_ngram1"] = combined["text"].apply(lambda x: list(ngrams(x,1)))

In [28]:
unigram = {}
for i in combined["text_ngram1"]:
    for j in i:
        if j in unigram:
            unigram[j] += 1
        else:
            unigram[j] = 1

In [29]:
selected_unigrams = []
for k,v in unigram.items():
    if v > 2539 and v < 103600:
        selected_unigrams.append(k)

In [30]:
tokens_unigrams = MWETokenizer(selected_unigrams)

combined["text"] = combined["text"].apply(tokens_unigrams.tokenize)

In [31]:
final_list = []
for i in range(len(selected_unigrams)):
    final_list.append(selected_unigrams[i][0])
    
for i in range(len(selected_bigram)):
    final_list.append(selected_bigram[i][0])

for i in range(len(selected_trigrams)):
    final_list.append(selected_trigrams[i][0])
    
len(final_list)

1377

In [32]:
combined['text_final'] = ''
combined['text_final'] = combined['text'].apply(lambda x: [item for item in x if item in final_list])

In [33]:
vectorizer = TfidfVectorizer()

In [34]:
combined['final'] = combined['text_final'].apply(lambda x: " ".join(x))

In [35]:
combined_df = pd.DataFrame(vectorizer.fit_transform(combined['final']).toarray(),columns=vectorizer.get_feature_names())

In [36]:
train_df = combined_df[0:106445]
test_df = combined_df[106445:]

In [37]:
train_doc_id = list(train_doc_id)
test_doc_id = list(test_doc_id)

In [38]:
train_df['doc_id'] = ""
train_df['label'] = ""

train_df['doc_id'] = train_doc_id
train_df['label'] = trainlabel


test_df['doc_id'] = ""
test_df['doc_id'] = test_doc_id

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://

In [39]:
train_df.to_csv("train_dataset.csv",encoding='utf8',sep=",")
test_df.to_csv("test_dataset.csv",encoding='utf8',sep=",")