In [8]:
import pandas as pd
import spacy
from tqdm import tqdm

In [16]:
nlp = spacy.load("en_core_web_sm", disable=['parser', 'tagger', 'ner'])
stops = nlp.Defaults.stop_words

In [3]:
# Constants
TRAIN_DATA = "./data/train.csv"
TEST_DATA = "./data/test.csv"

In [4]:
train_data = pd.read_csv(TRAIN_DATA)
train_data.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
test_data = pd.read_csv(TEST_DATA)
test_data.head()

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
train_tweets = train_data["text"].values
train_tweets[0:10]

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       '13,000 people receive #wildfires evacuation orders in California ',
       'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ',
       '#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires',
       '#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas',
       "I'm on top of the hill and I can see a fire in the woods...",
       "There's an emergency evacuation happening now in the building across the street",
       "I'm afraid that the tornado is coming to our area..."],
      dtype=object)

In [7]:
test_tweets = test_data["text"].values
test_tweets[0:10]

array(['Just happened a terrible car crash',
       'Heard about #earthquake is different cities, stay safe everyone.',
       'there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all',
       'Apocalypse lighting. #Spokane #wildfires',
       'Typhoon Soudelor kills 28 in China and Taiwan',
       "We're shaking...It's an earthquake",
       "They'd probably still show more life than Arsenal did yesterday, eh? EH?",
       'Hey! How are you?', 'What a nice hat?', 'Fuck off!'], dtype=object)

In [23]:
def preprocess(tweet):
    tweet = tweet.lower()
    
    tweet_nlp = nlp(tweet)
    lemmatized = []
    
    for word in tweet_nlp:
        lemma = word.lemma_.strip()
        if lemma:
            if lemma not in stops:
                lemmatized.append(lemma)
    
    return " ".join(lemmatized)

In [24]:
train_processed_tweets = []

for tweet in tqdm(train_tweets):
    train_processed_tweets.append(preprocess(tweet))

100%|█████████████████████████████████████████████████████████████████████████████| 7613/7613 [00:16<00:00, 452.68it/s]


In [25]:
test_processed_tweets = []

for tweet in tqdm(test_tweets):
    test_processed_tweets.append(preprocess(tweet))

100%|█████████████████████████████████████████████████████████████████████████████| 3263/3263 [00:07<00:00, 449.28it/s]


In [26]:
train_processed_tweets[0:10]

['deeds reason # earthquake allah forgive',
 'forest fire near la ronge sask . canada',
 "residents asked ' shelter place ' notified officers . evacuation shelter place orders expected",
 '13,000 people receive # wildfires evacuation orders california',
 'got sent photo ruby # alaska smoke # wildfires pours school',
 '# rockyfire update = > california hwy . 20 closed directions lake county fire - # cafire # wildfires',
 '# flood # disaster heavy rain causes flash flooding streets manitou , colorado springs areas',
 'hill fire woods ...',
 'emergency evacuation happening building street',
 'afraid tornado coming area ...']

In [27]:
test_processed_tweets[0:10]

['happened terrible car crash',
 'heard # earthquake different cities , stay safe .',
 'forest fire spot pond , geese fleeing street , save',
 'apocalypse lighting . # spokane # wildfires',
 'typhoon soudelor kills 28 china taiwan',
 'shaking ... earthquake',
 'probably life arsenal yesterday , eh ? eh ?',
 'hey ! ?',
 'nice hat ?',
 'fuck !']

In [28]:
train_data["processed_text"] = train_processed_tweets
train_data.head()

Unnamed: 0,text,target,processed_text
0,Our Deeds are the Reason of this #earthquake M...,1,deeds reason # earthquake allah forgive
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask . canada
2,All residents asked to 'shelter in place' are ...,1,residents asked ' shelter place ' notified off...
3,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive # wildfires evacuation o..."
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby # alaska smoke # wildfires...


In [30]:
test_data["processed_text"] = test_processed_tweets
test_data.head()

Unnamed: 0,text,processed_text
0,Just happened a terrible car crash,happened terrible car crash
1,"Heard about #earthquake is different cities, s...","heard # earthquake different cities , stay safe ."
2,"there is a forest fire at spot pond, geese are...","forest fire spot pond , geese fleeing street ,..."
3,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting . # spokane # wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills 28 china taiwan


In [31]:
train_data.to_csv("./data/processed_train.csv", index=False)
test_data.to_csv("./data/processed_test.csv", index=False)