In [1]:
import pandas as pd
import nltk
# Make sure the NLTK 'stopwords' and 'wordnet' resources are available locally
# (the call is a no-op download when the package is already up to date).
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): imported after the downloads — presumably text_mining relies on
# the resources fetched above; confirm before moving this import to the top.
from text_mining import TextMining

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/antoine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/antoine/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the training tweets; the path is relative to the notebook's working directory.
data_train = pd.read_csv('../data/train_tweets.csv')

In [3]:
# Preview the first rows of the freshly loaded frame before any preprocessing.
data_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Text-preprocessing pipeline, unrolled so each stage can be inspected on its own.
tm = TextMining(data_train)

pipeline = tm.lowercase()
pipeline = pipeline.extract_target_char("#", "hashtags")
pipeline = pipeline.extract_target_char("@", "mentions")
pipeline = pipeline.extract_url()
pipeline = pipeline.clean_regex()
pipeline = pipeline.tokenize()
pipeline = pipeline.remove_stopwords()
pipeline = pipeline.apply_lemmatizer()       # alternative: .apply_stemmer()
pipeline = pipeline.vectorize(mode="tfidf")  # alternative: mode="bow"

# NOTE(review): `data_train` is rebound to the processed frame here, so re-running
# this cell without re-running the load cell feeds already-processed data back in.
data_train = pipeline.get_df()

# The file name mirrors the choices made above (lemmatizer + tfidf).
tm.export_csv("lemmatizer_tfidf.csv")
data_train.head(15)

Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,urls,tokens,vector
0,1,,,our deeds are the reason of this may allah for...,1,earthquake,,,"[deed, reason, may, allah, forgive, u]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,4,,,forest fire near la ronge sask canada,1,,,,"[forest, fire, near, la, ronge, sask, canada]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,5,,,all residents asked to shelter in place are ...,1,,,,"[resident, asked, shelter, place, notified, of...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,6,,,13 000 people receive evacuation orders in cal...,1,wildfires,,,"[13, 000, people, receive, evacuation, order, ...","[0.0, 0.4503689519347456, 0.0, 0.0, 0.0, 0.0, ..."
4,7,,,just got sent this photo from ruby as smoke fr...,1,"alaska, wildfires",,,"[got, sent, photo, ruby, smoke, pours, school]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,8,,,update california hwy 20 closed in both dir...,1,"rockyfire, cafire, wildfires",,,"[update, california, hwy, 20, closed, directio...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,10,,,heavy rain causes flash flooding of streets in...,1,"flood, disaster",,,"[heavy, rain, cause, flash, flooding, street, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,13,,,i m on top of the hill and i can see a fire in...,1,,,,"[top, hill, see, fire, wood]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,14,,,there s an emergency evacuation happening now ...,1,,,,"[emergency, evacuation, happening, building, a...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,15,,,i m afraid that the tornado is coming to our a...,1,,,,"[afraid, tornado, coming, area]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [5]:
def _has_text(column: pd.Series) -> pd.Series:
    """Return a boolean mask that is True where ``column`` holds a non-blank string.

    Values that are not ``str`` instances (e.g. NaN) and strings that are empty
    after stripping whitespace are both treated as "no value".
    """
    return column.apply(lambda value: isinstance(value, str) and value.strip() != "")


# Build each mask once and reuse it for both the counts and the row filter.
has_mentions = _has_text(data_train['mentions'])
has_hashtags = _has_text(data_train['hashtags'])
has_urls = _has_text(data_train['urls'])

# Summing a boolean mask counts the rows where it is True.
mentions_count = has_mentions.sum()
hashtags_count = has_hashtags.sum()
urls_count = has_urls.sum()

print(f"Mentions : {mentions_count}")
print(f"Hashtags : {hashtags_count}")
print(f"URLs : {urls_count}")

# Preview the rows that carry at least one mention, hashtag or URL.
data_train[has_mentions | has_hashtags | has_urls].head()


Mentions : 2009
Hashtags : 1743
URLs : 3971


Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,urls,tokens,vector
0,1,,,our deeds are the reason of this may allah for...,1,earthquake,,,"[deed, reason, may, allah, forgive, u]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,6,,,13 000 people receive evacuation orders in cal...,1,wildfires,,,"[13, 000, people, receive, evacuation, order, ...","[0.0, 0.4503689519347456, 0.0, 0.0, 0.0, 0.0, ..."
4,7,,,just got sent this photo from ruby as smoke fr...,1,"alaska, wildfires",,,"[got, sent, photo, ruby, smoke, pours, school]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,8,,,update california hwy 20 closed in both dir...,1,"rockyfire, cafire, wildfires",,,"[update, california, hwy, 20, closed, directio...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,10,,,heavy rain causes flash flooding of streets in...,1,"flood, disaster",,,"[heavy, rain, cause, flash, flooding, street, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [6]:
# Rows with at least one mention. Uses the same "non-blank string" predicate as
# the counting cell: with `.str.strip() != ""`, a NaN entry compares as True and
# would be kept by mistake (e.g. after reloading the exported CSV).
data_train[
    data_train['mentions'].apply(lambda x: isinstance(x, str) and x.strip() != "")
].head()

Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,urls,tokens,vector
31,48,ablaze,birmingham,wholesale markets ablaze,1,,bbcmtd,http://t.co/lhyxeohy6c,"[wholesale, market, ablaze]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
36,54,ablaze,pretoria,they ve built so much hype around new acquisit...,0,mufc,phdsquares,,"[built, much, hype, around, new, acquisition, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
43,63,ablaze,,soooo pumped for ablaze,0,,southridgelife,,"[soooo, pumped, ablaze]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
54,78,ablaze,abuja,noches el bestia happy to see my teammates a...,0,,alexis_sanchez,http://t.co/uc4j4jhvgr',"[noches, el, bestia, happy, see, teammate, tra...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
63,91,ablaze,"concord, ca",steve these fires out here are something else ...,1,,"navista7, news24680",,"[steve, fire, something, else, california, tin...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [7]:
# Rows with at least one hashtag. Uses the same "non-blank string" predicate as
# the counting cell: with `.str.strip() != ""`, a NaN entry compares as True and
# would be kept by mistake (e.g. after reloading the exported CSV).
data_train[
    data_train['hashtags'].apply(lambda x: isinstance(x, str) and x.strip() != "")
].head()

Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,urls,tokens,vector
0,1,,,our deeds are the reason of this may allah for...,1,earthquake,,,"[deed, reason, may, allah, forgive, u]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,6,,,13 000 people receive evacuation orders in cal...,1,wildfires,,,"[13, 000, people, receive, evacuation, order, ...","[0.0, 0.4503689519347456, 0.0, 0.0, 0.0, 0.0, ..."
4,7,,,just got sent this photo from ruby as smoke fr...,1,"alaska, wildfires",,,"[got, sent, photo, ruby, smoke, pours, school]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,8,,,update california hwy 20 closed in both dir...,1,"rockyfire, cafire, wildfires",,,"[update, california, hwy, 20, closed, directio...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,10,,,heavy rain causes flash flooding of streets in...,1,"flood, disaster",,,"[heavy, rain, cause, flash, flooding, street, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [8]:
# Rows with at least one URL. Uses the same "non-blank string" predicate as
# the counting cell: with `.str.strip() != ""`, a NaN entry compares as True and
# would be kept by mistake (e.g. after reloading the exported CSV).
data_train[
    data_train['urls'].apply(lambda x: isinstance(x, str) and x.strip() != "")
].head()

Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,urls,tokens,vector
31,48,ablaze,birmingham,wholesale markets ablaze,1,,bbcmtd,http://t.co/lhyxeohy6c,"[wholesale, market, ablaze]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
32,49,ablaze,est. september 2012 - bristol,we always try to bring the heavy,0,"metal, rt",,http://t.co/yao1e0xngw,"[always, try, bring, heavy]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
33,50,ablaze,africa,breaking news nigeria flag set ablaze in aba,1,africanbaze,,http://t.co/2nndbgwyei,"[breaking, news, nigeria, flag, set, ablaze, aba]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
35,53,ablaze,"london, uk",on plus side look at the sky last night it was...,0,,,http://t.co/qqsmshaj3n,"[plus, side, look, sky, last, night, ablaze]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
37,55,ablaze,world wide!!,inec office in abia set ablaze,1,,,http://t.co/3imaomknna,"[inec, office, abia, set, ablaze]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
