In [1]:
''' define Feature class '''
%run Features.ipynb
features = Features('./DefaultFiles/train_bodies.csv','./DefaultFiles/train_stances.csv')
print(features.df.head(10))

   Body ID                                        articleBody  \
0        0  A small meteorite crashed into a wooded area i...   
1        0  A small meteorite crashed into a wooded area i...   
2        0  A small meteorite crashed into a wooded area i...   
3        0  A small meteorite crashed into a wooded area i...   
4        0  A small meteorite crashed into a wooded area i...   
5        0  A small meteorite crashed into a wooded area i...   
6        0  A small meteorite crashed into a wooded area i...   
7        0  A small meteorite crashed into a wooded area i...   
8        0  A small meteorite crashed into a wooded area i...   
9        0  A small meteorite crashed into a wooded area i...   

                                            Headline     Stance  
0  Soldier shot, Parliament locked down after gun...  unrelated  
1  Tourist dubbed ‘Spider Man’ after spider burro...  unrelated  
2  Luke Somers 'killed in failed rescue attempt i...  unrelated  
3   BREAKING: Soldie

In [2]:
'''
    For downloading for nltk
    import 
    on first time download the following packages
'''
import nltk
# nltk.download()
# d (Download) then enter 'punkt', wordnet', 'stopwords' (individually)

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
import re

'''
    Process involves
    1. Split into sentences
    2. Split into words i.e [[word,word],[word,word,word]]
    3. Stem - chop of ends
    4. Lemmatise - remove inflection endings and return to base citionary
    5. remove stopwards
    6. only take words containing only letters
'''
def process( text ):
    out = []
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    for sentence in sent_tokenize(text):
        withoutStop = []
        for word in word_tokenize(sentence):
            word = word.strip()
            word = stemmer.stem(word)
            word = lemmatizer.lemmatize(word, wordnet.VERB)
            # major speed gain only testing for letters
            word = word.replace("n't", 'not')
            word = word.replace("'m", 'am')
            word = word.replace("'ve'", 'have')
            word = word.replace("'d", 'would')
            word = word.replace("'ll", "will")
            if word != '' and word.isalpha():
                withoutStop.append(word.lower())
        out.append(withoutStop)
    return out



In [8]:
example = features.df.iloc[0]['articleBody']
example

'A small meteorite crashed into a wooded area in Nicaragua\'s capital of Managua overnight, the government said Sunday. Residents reported hearing a mysterious boom that left a 16-foot deep crater near the city\'s airport, the Associated Press reports. \n\nGovernment spokeswoman Rosario Murillo said a committee formed by the government to study the event determined it was a "relatively small" meteorite that "appears to have come off an asteroid that was passing close to Earth." House-sized asteroid 2014 RC, which measured 60 feet in diameter, skimmed the Earth this weekend, ABC News reports. \nMurillo said Nicaragua will ask international experts to help local scientists in understanding what happened.\n\nThe crater left by the meteorite had a radius of 39 feet and a depth of 16 feet,  said Humberto Saballos, a volcanologist with the Nicaraguan Institute of Territorial Studies who was on the committee. He said it is still not clear if the meteorite disintegrated or was buried.\n\nHumbe

In [9]:
for sentence in process( example ):
    print(" ".join(sentence))

a small meteorit crash into a wood area in nicaragua capit of managua overnight the govern say sunday
resid report hear a mysteri boom that leave a deep crater near the citi airport the associ press report
govern spokeswoman rosario murillo say a committe form by the govern to studi the event determin it wa a rel small meteorit that appear to have come off an asteroid that wa pass close to earth
asteroid rc which measur feet in diamet skim the earth thi weekend abc new report
murillo say nicaragua will ask intern expert to help local scientist in understand what happen
the crater leave by the meteorit have a radiu of feet and a depth of feet say humberto saballo a volcanologist with the nicaraguan institut of territori studi who wa on the committe
he say it be still not clear if the meteorit disintegr or wa buri
humberto garcia of the astronomi center at the nation autonom univers of nicaragua say the meteorit could be relat to an asteroid that wa forecast to pass by the planet saturda

In [21]:
''' Takes some time to completely process  ~ 12 minutes on my pc'''
import time
start_time = time.time()
print("total rows: ", len(features.df))

features.df['headProcessed'] = ''
features.df['bodyProcessed'] = ''
headProcessed = features.df.columns.get_loc("headProcessed")
bodyProcessed = features.df.columns.get_loc("bodyProcessed")
for index, row in features.df.iterrows():
    ''' by replacing outer join you can regain sentence structure if desired '''
    features.df.iat[index, headProcessed] = " ".join(" ".join(sent) for sent in process(row['Headline']))
    features.df.iat[index, bodyProcessed] = " ".join(" ".join(sent) for sent in process(row['articleBody']))
    if index % 1000 == 0:
        print("processed row: ", index, " time taken::", round(time.time()-start_time, 2), " seconds")

total rows:  49972
processed row:  0  time taken:: 0.04  seconds
processed row:  1000  time taken:: 31.99  seconds
processed row:  2000  time taken:: 55.79  seconds
processed row:  3000  time taken:: 80.17  seconds
processed row:  4000  time taken:: 102.48  seconds
processed row:  5000  time taken:: 127.3  seconds
processed row:  6000  time taken:: 146.84  seconds
processed row:  7000  time taken:: 173.4  seconds
processed row:  8000  time taken:: 191.21  seconds
processed row:  9000  time taken:: 215.94  seconds
processed row:  10000  time taken:: 231.97  seconds
processed row:  11000  time taken:: 248.22  seconds
processed row:  12000  time taken:: 264.3  seconds
processed row:  13000  time taken:: 284.35  seconds
processed row:  14000  time taken:: 307.57  seconds
processed row:  15000  time taken:: 329.67  seconds
processed row:  16000  time taken:: 345.47  seconds
processed row:  17000  time taken:: 365.84  seconds
processed row:  18000  time taken:: 383.22  seconds
processed row:

In [22]:
features.df.to_csv('dataProcessed.csv',index=False)

In [25]:
features = Features('./DefaultFiles/test_bodies.csv','./DefaultFiles/test_stances_unlabeled.csv')
import time
start_time = time.time()
print("total rows: ", len(features.df))

features.df['headProcessed'] = ''
features.df['bodyProcessed'] = ''
headProcessed = features.df.columns.get_loc("headProcessed")
bodyProcessed = features.df.columns.get_loc("bodyProcessed")
for index, row in features.df.iterrows():
    ''' by replacing outer join you can regain sentence structure if desired '''
    features.df.iat[index, headProcessed] = " ".join(" ".join(sent) for sent in process(row['Headline']))
    features.df.iat[index, bodyProcessed] = " ".join(" ".join(sent) for sent in process(row['articleBody']))
    if index % 1000 == 0:
        print("processed row: ", index, " time taken::", round(time.time()-start_time, 2), " seconds")

total rows:  25413
processed row:  0  time taken:: 0.02  seconds
processed row:  1000  time taken:: 17.45  seconds
processed row:  2000  time taken:: 44.77  seconds
processed row:  3000  time taken:: 62.88  seconds
processed row:  4000  time taken:: 81.8  seconds
processed row:  5000  time taken:: 103.29  seconds
processed row:  6000  time taken:: 124.16  seconds
processed row:  7000  time taken:: 142.55  seconds
processed row:  8000  time taken:: 160.21  seconds
processed row:  9000  time taken:: 183.49  seconds
processed row:  10000  time taken:: 205.65  seconds
processed row:  11000  time taken:: 229.53  seconds
processed row:  12000  time taken:: 244.02  seconds
processed row:  13000  time taken:: 265.63  seconds
processed row:  14000  time taken:: 286.03  seconds
processed row:  15000  time taken:: 302.01  seconds
processed row:  16000  time taken:: 317.28  seconds
processed row:  17000  time taken:: 335.89  seconds
processed row:  18000  time taken:: 357.57  seconds
processed row

In [26]:
features.df.to_csv('testProcessed.csv',index=False)