In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
fl = pd.read_csv('../data/coordinates_florida_hurricane.csv', index_col = 0)
lu = pd.read_csv('../data/coordinates_louisiana_hurricane.csv', index_col = 0)
pr = pd.read_csv('../data/coordinates_puertorico_hurricane.csv', index_col = 0)
sc = pd.read_csv('../data/coordinates_southcarolina_hurricane.csv', index_col = 0)
tx = pd.read_csv('../data/coordinates_texas_hurricane.csv', index_col = 0)

In [3]:
df = pd.concat([fl, lu, pr, sc, tx], axis = 0)

In [4]:
df.head()

Unnamed: 0_level_0,Text,Date,lat,long
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ShamarlaB_,A customer service rep told me Friday “there i...,2020-08-31 23:15:25+00:00,25.525284,-80.60692
saborhavanacig,Tomorrow at 7pm After Hours with Sabor Havana ...,2020-08-31 22:39:25+00:00,27.686273,-80.934588
RauschInsurance,Hurricane Laura wallops areas with high mortga...,2020-08-31 22:31:09+00:00,27.701712,-75.255859
mitchylovesyou,I never wish bad on anyone but I think we need...,2020-08-31 20:25:29+00:00,29.114762,-84.339632
__ashleycarla,Wth is pricemart so full????? Is their a hurri...,2020-08-31 19:51:39+00:00,28.506867,-89.67809


In [5]:
df.shape

(41101, 4)

In [6]:
df.isnull().sum()

Text    0
Date    0
lat     0
long    0
dtype: int64

In [8]:
# instantiate lemmatizer and tokenizer

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

In [9]:
## tokenize and lemmatize

# create a column 'text_clean'

df['text_clean'] = ''

# iterate through each row in the column text_all
    
for i in range(len(df['Text'])):
        
    # tokenize each word in text into its own string
    text_token = []
    text_token.extend(tokenizer.tokenize(df['Text'][i].lower()))
    text_tokens = []
    [text_tokens.append(word) for word in text_token if word not in text_tokens]
        
    # lemmatize the words
    text_lemmatize = []
    for j in range(len(text_tokens)):
        text_lemmatize.append(lemmatizer.lemmatize(text_tokens[j]))
        
    # remove characters and numbers
    clean_text = []
    for k in range(len(text_lemmatize)):
        clean_text.append(re.sub('[^a-zA-Z]', '', text_lemmatize[k]))    
        
    # group them together
    texts_collection = [text for text in clean_text]
    
    # put the words back to one long string for vectorization
    texts_collection = ' '.join(texts_collection)

    # fill new column
    df['text_clean'][i] = texts_collection

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
df.head()

Unnamed: 0_level_0,Text,Date,lat,long,text_clean
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ShamarlaB_,A customer service rep told me Friday “there i...,2020-08-31 23:15:25+00:00,25.525284,-80.60692,a customer service rep told me friday there is...
saborhavanacig,Tomorrow at 7pm After Hours with Sabor Havana ...,2020-08-31 22:39:25+00:00,27.686273,-80.934588,tomorrow at pm after hour with sabor havana ci...
RauschInsurance,Hurricane Laura wallops areas with high mortga...,2020-08-31 22:31:09+00:00,27.701712,-75.255859,hurricane laura wallop area with high mortgage...
mitchylovesyou,I never wish bad on anyone but I think we need...,2020-08-31 20:25:29+00:00,29.114762,-84.339632,i never wish bad on anyone but think we need a...
__ashleycarla,Wth is pricemart so full????? Is their a hurri...,2020-08-31 19:51:39+00:00,28.506867,-89.67809,wth is pricemart so full their a hurricane id...


In [11]:
df= df[['Date', 'Text', 'text_clean', 'lat', 'long']]

In [12]:
df.head()

Unnamed: 0_level_0,Date,Text,text_clean,lat,long
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ShamarlaB_,2020-08-31 23:15:25+00:00,A customer service rep told me Friday “there i...,a customer service rep told me friday there is...,25.525284,-80.60692
saborhavanacig,2020-08-31 22:39:25+00:00,Tomorrow at 7pm After Hours with Sabor Havana ...,tomorrow at pm after hour with sabor havana ci...,27.686273,-80.934588
RauschInsurance,2020-08-31 22:31:09+00:00,Hurricane Laura wallops areas with high mortga...,hurricane laura wallop area with high mortgage...,27.701712,-75.255859
mitchylovesyou,2020-08-31 20:25:29+00:00,I never wish bad on anyone but I think we need...,i never wish bad on anyone but think we need a...,29.114762,-84.339632
__ashleycarla,2020-08-31 19:51:39+00:00,Wth is pricemart so full????? Is their a hurri...,wth is pricemart so full their a hurricane id...,28.506867,-89.67809


In [13]:
df.to_csv('../data/clean_hurricane.csv', index = False)