Load values, normalize the text, and perform word analytics.

In [9]:
import torch
import pandas as pd
import re
import csv 
import string
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK corpora this cell relies on: the English stop-word list
# and WordNet (used by WordNetLemmatizer).
for _corpus in ('stopwords', 'wordnet'):
    nltk.download(_corpus)

# Patterns are compiled once at import time instead of on every call.
_HTML_SPACE = re.compile(r"%20")
# A newline followed by any character that is not a digit.
_NEWLINE_NOT_BEFORE_DIGIT = re.compile(r"\n(?=[^0-9])")
# A comma sitting between a digit and a group of three digits.
_THOUSANDS_SEPARATOR = re.compile(r"(?<=[0-9]),(?=[0-9]{3})")
_PUNCTUATION = re.compile(r"[^\w\s]")
_URL = re.compile(
    r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b"
    r"([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
)


def normalise_text(text):
    """Normalise raw CSV text and parse it into rows of cleaned fields.

    The steps are applied in this order:

    1. Every ``%20`` is replaced by a space.
    2. Every newline that is followed by a non-digit character is replaced
       by a space.  (Presumably this joins multi-line fields so that only
       line breaks in front of a numeric record id survive -- this depends
       on the layout of the input file, which is not visible here.)
    3. Thousands separators are removed from numbers, so that
       ``1,234,567`` becomes ``1234567``.
    4. ``http``/``https`` URLs are removed and the text is lower-cased.
    5. The text is split on newlines and parsed with ``csv.reader``;
       empty rows are dropped.
    6. Every character that is neither a word character nor whitespace is
       stripped from each field.

    NOTE(review): steps 1-4 run on the raw text *before* CSV parsing, so a
    bare ``digit,ddd`` sequence spanning two unquoted columns would also be
    merged by step 3.  That matches the original behaviour.

    Args:
        text: The complete contents of a CSV file as one string.

    Returns:
        A list of rows, each row being a list of normalised field strings.
        The header row, if any, is the first element.
    """
    normalized_text = _HTML_SPACE.sub(' ', text)

    # The lookahead leaves the following character in place, so one pass
    # handles runs of consecutive newlines without a search/replace loop.
    normalized_text = _NEWLINE_NOT_BEFORE_DIGIT.sub(' ', normalized_text)

    # Remove each separator individually.  The previous pattern used a
    # repeated capture group, which only keeps its last repetition and
    # therefore dropped digits from numbers with two or more separators
    # ("1,234,567" became "1567").
    normalized_text = _THOUSANDS_SEPARATOR.sub('', normalized_text)

    normalized_text = _URL.sub('', normalized_text)
    normalized_text = normalized_text.lower()

    rows = csv.reader(
        normalized_text.split('\n'),
        quotechar='"',
        delimiter=',',
        quoting=csv.QUOTE_ALL,
        skipinitialspace=True,
    )
    return [
        [_PUNCTUATION.sub('', field) for field in row]
        for row in rows
        if len(row) > 0
    ]



# Lazily-populated cache of the NLTK helpers used by clean_text.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the shared tokenizer, lemmatizer and stop-word set, building them once."""
    if not _TEXT_TOOLS:
        # All three values are created before the dict is touched, so a
        # failure (e.g. a missing corpus) never leaves a half-filled cache.
        _TEXT_TOOLS.update(
            tokenizer=TweetTokenizer(),
            lemmatizer=WordNetLemmatizer(),
            # A frozenset gives O(1) membership tests; the corpus reader
            # returns a list.
            stopwords=frozenset(stopwords.words('english')),
        )
    return _TEXT_TOOLS


def clean_text(text):
    """Tokenize ``text``, drop English stop words and lemmatize what is left.

    Args:
        text: The string to clean.  Any falsy value (``""``, ``None``)
            yields an empty list.

    Returns:
        A list of lemmatized tokens in their original order, excluding
        tokens found in NLTK's English stop-word list.
    """
    # Guard first so that empty input never touches the NLTK resources.
    if not text:
        return []

    tools = _get_text_tools()
    blacklist = tools['stopwords']
    lemmatize = tools['lemmatizer'].lemmatize
    return [
        lemmatize(token)
        for token in tools['tokenizer'].tokenize(text)
        if token not in blacklist
    ]
    
def clean_data(dataframe):
    """Replace the ``keyword`` and ``text`` columns with lists of cleaned tokens.

    Each value of the two columns is passed through ``clean_text``.  All
    other columns (``id``, ``location`` and, when present, ``target``) are
    left untouched.

    The columns are transformed as a whole rather than by assigning one row
    at a time: a row-wise ``iloc`` assignment of a list that itself contains
    token lists makes NumPy build a ragged array (a deprecation warning on
    older NumPy, an error on newer versions), and it also used the index
    label as a position, which is only valid for a default RangeIndex.

    Args:
        dataframe: A pandas DataFrame with at least ``keyword`` and ``text``
            columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    for column in ('keyword', 'text'):
        dataframe[column] = dataframe[column].map(clean_text)
    return dataframe

def load_text(file):
    """Read a CSV file, normalise it and return it as a cleaned DataFrame.

    The file is read in one go, passed through ``normalise_text`` (whose
    first row is used as the column names) and then through ``clean_data``.

    Args:
        file: Path of the CSV file to load.

    Returns:
        The DataFrame returned by ``clean_data``.

    Raises:
        IndexError: If normalisation yields no rows at all (e.g. an empty
            file), because there is then no header row to read.
    """
    # Decode explicitly as UTF-8 rather than with the platform default, so
    # non-ASCII text is read the same way on every OS.
    # NOTE(review): assumes the input files are UTF-8 encoded -- confirm.
    with open(file, encoding="utf-8") as f:
        raw_text = f.read()

    rows = normalise_text(raw_text)
    data = pd.DataFrame(rows[1:], columns=rows[0])
    return clean_data(data)

# Run the pipeline on the training split, write the result next to the
# input data (without the index column), and print the frame.
train_data = load_text('./data/train.csv')
# The test split is not processed here; the call is left disabled.
#test_data = load_text('./data/test.csv')
train_data.to_csv('./data/normalized_train_data.csv', index=False)
print(train_data)



[nltk_data] Downloading package stopwords to /Users/jack/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jack/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  arr_value = np.array(value)


         id keyword location  \
0         1      []            
1         4      []            
2         5      []            
3         6      []            
4         7      []            
...     ...     ...      ...   
7608  10869      []            
7609  10870      []            
7610  10871      []            
7611  10872      []            
7612  10873      []            

                                                   text target  
0     [deed, reason, earthquake, may, allah, forgive...      1  
1         [forest, fire, near, la, ronge, sask, canada]      1  
2     [resident, asked, shelter, place, notified, of...      1  
3     [13000, people, receive, wildfire, evacuation,...      1  
4     [got, sent, photo, ruby, alaska, smoke, wildfi...      1  
...                                                 ...    ...  
7608  [two, giant, crane, holding, bridge, collapse,...      1  
7609  [aria_ahrary, thetawniest, control, wild, fire...      1  
7610            [m194, 0104, u