In [1]:
import numpy as np
import pandas as pd
# NOTE(review): this binds `plt` to the top-level matplotlib package, but the
# `import matplotlib.pyplot as plt` two lines below rebinds the same name, so
# this line has no lasting effect on what `plt` refers to.
import matplotlib as plt
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import re
import nltk
from nltk.corpus import stopwords
# Runs at import time, between the import statements; the captured cell output
# shows NLTK reporting the stopwords package as already up to date.
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer
# Presumably a project-local module (not shown here) that provides the
# TextMining cleaning pipeline used below — TODO confirm.
from text_mining import TextMining

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/antoine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the train and test tweet CSVs (train first, then test) in a single
# unpacking assignment.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_tweets.csv', '../data/test_tweets.csv')
)

In [3]:
# Preview the first five rows of the freshly loaded training frame.
data_train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Run the TextMining pipeline over the training frame and rebind `data_train`
# to the frame it returns.
# NOTE(review): because `data_train` is overwritten with the processed frame,
# re-running this cell without first re-running the CSV load feeds the already
# processed frame back into the pipeline. Restart & Run All to reproduce.
tm = TextMining(data_train)
data_train = (
    tm
      # The extraction steps come before lowercasing/cleaning; presumably the
      # later steps strip the '#', '@' and URL text they rely on — TODO confirm
      # against text_mining.TextMining.
      .extract_target_char("#", "hashtags")   # terms marked with '#', into a 'hashtags' column
      .extract_target_char("@", "mentions")   # terms marked with '@', into a 'mentions' column
      .extract_url()                          # presumably fills the 'urls' column seen in later outputs
      .lowercase()
      .clean_regex()
      .tokenize()                             # presumably fills the 'tokens' column seen in later outputs
      .get_df()                               # hand back the resulting DataFrame
)


In [6]:
def non_blank_mask(column: pd.Series) -> pd.Series:
    """Flag the entries of ``column`` that are strings with non-whitespace content.

    Parameters
    ----------
    column : pd.Series
        Column to inspect; entries may be strings or anything else (e.g. NaN).

    Returns
    -------
    pd.Series
        One truth value per entry: True only for ``str`` entries that are not
        empty after stripping whitespace. Non-string entries are flagged False
        instead of raising.
    """
    return column.apply(lambda value: isinstance(value, str) and value.strip() != "")


# Build each mask once and reuse it for both the counts and the row filter,
# instead of re-evaluating the same predicate twice per column.
has_mentions = non_blank_mask(data_train['mentions'])
has_hashtags = non_blank_mask(data_train['hashtags'])
has_urls = non_blank_mask(data_train['urls'])

# Summing a boolean mask counts the True entries.
mentions_count = has_mentions.sum()
hashtags_count = has_hashtags.sum()
urls_count = has_urls.sum()

print(f"Mentions : {mentions_count}")
print(f"Hashtags : {hashtags_count}")
print(f"URLs : {urls_count}")

# Preview tweets carrying at least one mention, hashtag or URL.
data_train[has_mentions | has_hashtags | has_urls].head()


Mentions : 2009
Hashtags : 1743
URLs : 3971


Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,urls,tokens
0,1,,,our deeds are the reason of this earthquake ma...,1,earthquake,,,"[our, deeds, are, the, reason, of, this, earth..."
3,6,,,13 000 people receive wildfires evacuation ord...,1,wildfires,,,"[13, 000, people, receive, wildfires, evacuati..."
4,7,,,just got sent this photo from ruby alaska as s...,1,"alaska, wildfires",,,"[just, got, sent, this, photo, from, ruby, ala..."
5,8,,,rockyfire update california hwy 20 closed i...,1,"rockyfire, cafire, wildfires",,,"[rockyfire, update, california, hwy, 20, close..."
6,10,,,flood disaster heavy rain causes flash floodin...,1,"flood, disaster",,,"[flood, disaster, heavy, rain, causes, flash, ..."


In [12]:
# Preview rows whose 'mentions' entry is a non-blank string. Filling after the
# strip maps missing / non-string entries to "" so they are excluded, which
# matches the isinstance-guarded counts computed earlier in the notebook
# (a bare `!= ""` comparison would keep NaN rows, since NaN != "" is True).
data_train[data_train['mentions'].str.strip().fillna("").ne("")].head()

Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,urls,tokens
31,48,ablaze,birmingham,bbcmtd wholesale markets ablaze,1,,bbcmtd,http://t.co/lhyxeohy6c,"[bbcmtd, wholesale, markets, ablaze]"
36,54,ablaze,pretoria,phdsquares mufc they ve built so much hype aro...,0,mufc,phdsquares,,"[phdsquares, mufc, they, ve, built, so, much, ..."
43,63,ablaze,,soooo pumped for ablaze southridgelife,0,,southridgelife,,"[soooo, pumped, for, ablaze, southridgelife]"
54,78,ablaze,abuja,noches el bestia alexis sanchez happy to see...,0,,alexis_sanchez,http://t.co/uc4j4jhvgr',"[noches, el, bestia, alexis, sanchez, happy, t..."
63,91,ablaze,"concord, ca",navista7 steve these fires out here are someth...,1,,"navista7, news24680",,"[navista7, steve, these, fires, out, here, are..."


In [13]:
# Preview rows whose 'hashtags' entry is a non-blank string. Filling after the
# strip maps missing / non-string entries to "" so they are excluded, which
# matches the isinstance-guarded counts computed earlier in the notebook
# (a bare `!= ""` comparison would keep NaN rows, since NaN != "" is True).
data_train[data_train['hashtags'].str.strip().fillna("").ne("")].head()

Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,urls,tokens
0,1,,,our deeds are the reason of this earthquake ma...,1,earthquake,,,"[our, deeds, are, the, reason, of, this, earth..."
3,6,,,13 000 people receive wildfires evacuation ord...,1,wildfires,,,"[13, 000, people, receive, wildfires, evacuati..."
4,7,,,just got sent this photo from ruby alaska as s...,1,"alaska, wildfires",,,"[just, got, sent, this, photo, from, ruby, ala..."
5,8,,,rockyfire update california hwy 20 closed i...,1,"rockyfire, cafire, wildfires",,,"[rockyfire, update, california, hwy, 20, close..."
6,10,,,flood disaster heavy rain causes flash floodin...,1,"flood, disaster",,,"[flood, disaster, heavy, rain, causes, flash, ..."


In [14]:
# Preview rows whose 'urls' entry is a non-blank string. Filling after the
# strip maps missing / non-string entries to "" so they are excluded, which
# matches the isinstance-guarded counts computed earlier in the notebook
# (a bare `!= ""` comparison would keep NaN rows, since NaN != "" is True).
data_train[data_train['urls'].str.strip().fillna("").ne("")].head()

Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,urls,tokens
31,48,ablaze,birmingham,bbcmtd wholesale markets ablaze,1,,bbcmtd,http://t.co/lhyxeohy6c,"[bbcmtd, wholesale, markets, ablaze]"
32,49,ablaze,est. september 2012 - bristol,we always try to bring the heavy metal rt,0,"metal, rt",,http://t.co/yao1e0xngw,"[we, always, try, to, bring, the, heavy, metal..."
33,50,ablaze,africa,africanbaze breaking news nigeria flag set ab...,1,africanbaze,,http://t.co/2nndbgwyei,"[africanbaze, breaking, news, nigeria, flag, s..."
35,53,ablaze,"london, uk",on plus side look at the sky last night it was...,0,,,http://t.co/qqsmshaj3n,"[on, plus, side, look, at, the, sky, last, nig..."
37,55,ablaze,world wide!!,inec office in abia set ablaze,1,,,http://t.co/3imaomknna,"[inec, office, in, abia, set, ablaze]"
