In [1]:
import pandas as pd
import numpy as np
import regex as re
import pickle
import string
from langdetect import detect

pd.set_option('display.max_colwidth', -1)

pd.set_option('display.max_rows', 1000)
#pd.set_option('display.max_columns', 500)

## Data cleaning hurricane tweets

### Read in all hurricane tweets from pickle

In [2]:
hurricanes = pd.read_csv("../Data/hurricane_tweets.csv")
floods = pd.read_csv("../Data/df_floods.csv")
fires = pd.read_csv("../Data/all_fires.csv")

  interactivity=interactivity, compiler=compiler, result=result)


#### Merge dataframes into one

In [6]:
print(hurricanes['disaster'].value_counts())
print(hurricanes.shape)
print(floods['disaster'].value_counts())
print(floods.shape)
print(fires['disaster'].value_counts())
print(fires.shape)

hurricane    36436
Name: disaster, dtype: int64
(36444, 22)
floods    896
Name: disaster, dtype: int64
(896, 23)
fire    40382
Name: disaster, dtype: int64
(40382, 23)


In [56]:
# merge dataframes together
df = pd.concat([hurricanes, floods, fires], sort = False)

In [57]:
df.drop(columns = "Unnamed: 0", inplace = True)

In [58]:
df["text"] = df['text'].astype(str)

In [225]:
# for text in hurricanes["text"]:
#     try:
#         detect(text)
#     except:
#         noLang

In [156]:
# for key, valu in onlyText.items():
#         try:
#             if detect(val[0]) !="en":
#                 foreignLangs[key]= val
#                 foreignLangs[key].append(detect(val[0]))

#         except:
#             noLang[key] = val

In [157]:
# hurricanes["language"] = hurricanes["text"].apply(detect)

## Clean text columns

In [59]:
df = df[["text", "disaster"]]

In [54]:
df["text"].head(250)

0      OFFICALLY TROPICAL STORM DORIAN Where is it Going? Tropical Depression 5 Hurricane Dorian Track 2019 https://youtu.be/SKCqARFvsQw  The latest on the STORM'S TRACK!  in the above YOUTUBE LINK!!! @FlyRts @FearRTs @GFXCoach #dorian #florida #hurricane #hurricanedorian #tropicalstormdorianpic.twitter.com/RpMN7ewuLs                                                                                         
1      Tropical Storm Dorian Projected Path, Spaghetti Models #Dorian #TropicalStormDorian #HurricaneDorian #SpaghettiModels http://www.brevardtimes.com/2019/08/noaa-tropical-storm-dorian-projected-path-spaghetti-models/amp/ …pic.twitter.com/t4O2L6wEr3                                                                                                                                                            
2      Futura tormenta tropical #Dorian pasando por el sur de Puerto Rico.\n#DorianPR #Caturrito #TormentaTropical #Temporadadehuracanes #Boletin #hurricane #HurricaneDorian #StormDo

In [36]:
df.iloc[0,0]

"OFFICALLY TROPICAL STORM DORIAN Where is it Going? Tropical Depression 5 Hurricane Dorian Track 2019 https://youtu.be/SKCqARFvsQw\xa0 The latest on the STORM'S TRACK!  in the above YOUTUBE LINK!!! @FlyRts @FearRTs @GFXCoach #dorian #florida #hurricane #hurricanedorian #tropicalstormdorianpic.twitter.com/RpMN7ewuLs"

In [37]:
df.iloc[6,0]

'#TDFIVE TO BECOME A #Hurricane THIS WEEK\n\nA system, located hundreds of miles from the Lesser Antilles, is expected to become #TropicalStormDorian tomorrow.  It is also forecast to become #HurricaneDorian later this week!  Start preparing now!\n\n#apexwx #tropics #Atlantic #stormpic.twitter.com/MsRpq4mRRZ'

In [38]:
#df[df["text"].str.contains("blog")].head(100)

In [30]:
# # this code was adapted from this stackoverflow answer
# # https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression
# def strip_all_entities(text):
#     entity_prefixes = ['@','#']
#     for separator in  string.punctuation:
#         if separator not in entity_prefixes :
#             text = text.replace(separator,'')
#     words = []
#     for word in text.split():
#         word = word.strip()
#         if word:
#             if word[0] not in entity_prefixes:
#                 words.append(word)
#     return ' '.join(words)

In [60]:
# lowercase text
df["text"] = df["text"].str.lower()



# remove URLs
df['text'] = df['text'].map(lambda x: re.sub('http[s]?:\/\/[^\s]*', ' ', x))


# remove URL cutoffs
df['text'] = df['text'].map(lambda x: re.sub('\\[^\s]*', ' ', x))



# remove spaces
df['text'] = df['text'].map(lambda x: re.sub('\n', ' ', x))


# remove picture URLs
df['text'] = df['text'].map(lambda x: re.sub('pic.twitter.com\/[^\s]*', ' ', x))

# remove blog/map type
df['text'] = df['text'].map(lambda x: re.sub('blog\/maps\/info\/[^\s]*', ' ', x))



# remove hashtags =
df["text"] = df["text"].map(lambda x: re.sub("\#[\w]*", "", x))


# remove AT users
df["text"] = df["text"].map(lambda x: re.sub("\@[\w]*", "", x))

#df['text'] = df['text'].apply(strip_all_entities)



# remove single quotations
df["text"] = df["text"].map(lambda x: re.sub("'", "", x))
df["text"] = df["text"].map(lambda x: re.sub("'", "", x))




# remove characters that are not word characters or digits
df["text"] = df["text"].map(lambda x: re.sub("[^\w\d]", " ", x))

# remove all characters that are not letters
df['text'] = df['text'].map(lambda x: re.sub("[^a-zA-Z]", " ", x))

# remove multiple spaces
df['text'] = df['text'].map(lambda x: re.sub("\s{2,6}", " ", x))

In [62]:
# remove tweets with this url type
df = df[~df["text"].str.contains("blogmapsinfo")]

In [63]:
df.shape

(77722, 2)

In [64]:
# drop duplicate rows
df.drop_duplicates(subset='text', keep='first', inplace=True)

In [103]:
# remove multiple spaces
df['text'] = df['text'].map(lambda x: re.sub("\s{3,20}", "", x))

In [104]:
# drop row with only one space
df = df[~(df["text"]== " ")]

In [105]:
# drop row with multiple spaces
df = df[~(df["text"]== "  ")]

In [106]:
# drop row with multiple spaces
df = df[~(df["text"]== " ")]

In [107]:
# drop empty row
df = df[~(df["text"]== "")]

## Detect languages of tweets

In [21]:
# this code was used to test for errors that would prevent the detect function from running
# languages = []
# for i in range(101,150):
#     try:
#         languages.append(detect(df.iloc[i, 0]))
#     except:
#         print(f"error in row {i}")

In [127]:
# apply detect function on text column
df["languages"] = df["text"].apply(detect)

In [128]:
df.shape

(63711, 3)

In [130]:
## Select for tweets that are English only
## this dropped 3_335 rows 
df_en = df[df["languages"] == "en"]

In [131]:
df_en.shape

(60352, 3)

## Continue cleaning on english column

Here we are removing multiple copies of the same letter. For example "thanksssssssss" is updated to "thanks".

In [132]:
# Wrote this as a function but it took forever to run, so breaking it out individually
# Saving code for future reference

# # list of all English letters
# letters = list(string.ascii_lowercase)

# # list of letters that typically don't repeat twice in an English word
# double_letters = ["q", "u", "w", "y"]

# def remove_repeats(letters):
#     for letter in letters:
#         if letter in double_letters:
#             df_en["text"].map(lambda x: re.sub(re.escape(letter)+"{2,10}", re.escape(letter), x))
#         else:
#             df_en["text"].map(lambda x: re.sub(re.escape(letter)+"{3,10}", re.escape(letter), x))


# df_en.loc[:, "text"] = df_en["text"].map(remove_repeats)

df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("a{3,10}", "a", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("b{3,10}", "b", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("c{3,10}", "c", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("d{3,10}", "d", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("e{3,10}", "e", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("f{3,10}", "f", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("g{3,10}", "g", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("h{3,10}", "h", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("i{3,10}", "i", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("j{3,10}", "j", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("k{3,10}", "k", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("l{3,10}", "l", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("m{3,10}", "m", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("n{3,10}", "n", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("o{3,10}", "o", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("p{3,10}", "p", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("q{2,10}", "q", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("r{3,10}", "r", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("s{3,10}", "s", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("t{3,10}", "t", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("u{2,10}", "u", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("v{3,10}", "v", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("x{3,10}", "x", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("y{2,10}", "y", x))
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("z{3,10}", "z", x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [133]:
df_en.shape

(60352, 3)

## Write to CSV

In [134]:
# write to csf
df_en.to_csv("../Data/all_tweets_clean.csv", index = False)