In [109]:
import string
import time

import pandas as pd
from lightgbm import LGBMClassifier
from nltk import WordNetLemmatizer
from nltk import pos_tag
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

In [35]:
# Read the training CSV into a DataFrame (the path is relative, so it depends
# on the directory the notebook is run from).
df = pd.read_csv('../datasets/train.csv')

In [36]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [43]:
# Preview the last rows (DataFrame.tail shows five by default).
df.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [44]:
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        # stopwords.words() re-reads the corpus on every call, so keep the
        # result around; a frozenset also makes membership tests O(1).
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def TextPreprocessing(text):
    """Normalize one piece of text into a space-separated string of terms.

    Steps, in order:
      1. replace every ASCII punctuation character with a space and collapse
         runs of whitespace;
      2. tokenize and lowercase;
      3. drop English stop words and tokens shorter than 3 characters;
      4. Porter-stem each token;
      5. POS-tag the stems and lemmatize them (as verbs when tagged as a
         verb, as nouns otherwise).

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' when nothing is left).
    """
    # One translate() call instead of a per-character Python loop.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = ' '.join(text.translate(punct_to_space).split())

    tokens = [
        word.lower()
        for sent in sent_tokenize(text)
        for word in word_tokenize(sent)
    ]

    stop_words = _english_stopwords()
    tokens = [word for word in tokens if word not in stop_words and len(word) >= 3]

    # NOTE(review): stemming happens before POS tagging, so the tagger and the
    # lemmatizer operate on stems rather than on the original words. The order
    # is kept as-is so the produced features do not change.
    stemmer = PorterStemmer()
    tagged_corpus = pos_tag([stemmer.stem(word) for word in tokens])

    verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    lemmatizer = WordNetLemmatizer()

    # Verb-tagged tokens are lemmatized as verbs; everything else (nouns and
    # all remaining tags) is lemmatized as a noun.
    return ' '.join(
        lemmatizer.lemmatize(token, 'v' if tag in verb_tags else 'n')
        for token, tag in tagged_corpus
    )

In [138]:
# Run the preprocessing over every row of the text column and time it; the
# start timestamp gives the `finished` reading something to be measured against.
started = time.perf_counter()
preprocessed_text = df.text.apply(TextPreprocessing)
finished = time.perf_counter()
print('Preprocessing took %.1fs' % (finished - started))

In [139]:
# Attach the cleaned text as a new column (this mutates df in place).
df['preprocessed_text'] = preprocessed_text

In [140]:
# Check that the preprocessed_text column now sits next to the raw text.
df.head()

Unnamed: 0,id,keyword,location,text,target,preprocessed_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquak may allah forgiv
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near rong sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,000 peopl receiv wildfir evacu order california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,get sent photo rubi alaska smoke wildfir pour ...


In [141]:
# TF-IDF features over unigrams and bigrams of the preprocessed text: terms
# must occur in at least 2 documents, and only the 5000 highest-ranked terms
# are kept; rows are L2-normalized.
vectorizer = TfidfVectorizer(
    min_df=2,
    ngram_range=(1, 2),
    stop_words='english',
    max_features=5000,
    strip_accents='unicode',
    norm='l2',
)
# .toarray() yields a plain ndarray. The previous .todense() returned an
# np.matrix, which is deprecated and rejected by the input validation of
# recent scikit-learn releases.
X = vectorizer.fit_transform(preprocessed_text).toarray()

In [142]:
# LightGBM classifier with library-default hyperparameters.
model = LGBMClassifier()
# Fit on every row of X; no hold-out split is made in this notebook.
# As the last expression, the fitted estimator's repr is displayed.
model.fit(X, df.target)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [143]:
# Take column 1 of predict_proba (presumably the probability of target == 1;
# verify against model.classes_).
# NOTE(review): X is the same matrix the model was fitted on, so these are
# in-sample predictions.
pre = model.predict_proba(X)[:,1]

In [144]:
# roc_auc_score accepts the pandas Series directly, so no numpy conversion
# (and no `np` import, which this notebook never had) is needed.
# NOTE: `pre` was predicted on the rows the model was trained on, so this is
# the training-set ROC AUC, not an estimate of held-out performance.
roc_auc_score(df.target, pre)

0.9266250909511315