# Feature Engineering of the Tweets


## Import all libraries

In [1]:
import numpy as np 
import pandas as pd 
import string
import re
import spacy

## Load Data

In [2]:

# Read the train and test tweet CSVs into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/Tweets/2. Input/tweets train.csv',
        '/content/drive/My Drive/Tweets/2. Input/tweets test.csv',
    )
)

In [3]:
# Preview the first five rows of the training frame.
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Preview the first five rows of the test frame.
test.head(5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
# Drop the `id` column from both frames.
# Rebinding (instead of `inplace=True`) together with `errors='ignore'` keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

## Check for missing values

In [6]:
# Count missing values per column in the training frame.
train.isnull().sum()

keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
# Count missing values per column in the test frame.
test.isnull().sum()

keyword       26
location    1105
text           0
dtype: int64

# Handle Missing Values

### 1. Fill NaN values with the string 'missing'

### 2. Create a flag column indicating whether each value was missing

In [8]:
def missing_vals_clean(train): # ***** fn - 1 *****
  """Fill missing values with 'missing' and add 0/1 missing-value flag columns.

  Parameters
  ----------
  train : pandas.DataFrame
      Frame containing at least the `keyword` and `location` columns.

  Returns
  -------
  pandas.DataFrame
      A new frame (the input is not modified) in which every NaN is replaced
      by the string 'missing', plus two integer columns `keyword_missing` and
      `location_missing` (1 = value was absent, 0 = value was present).
  """
  flags = {}
  for col in ('keyword', 'location'):
    flag_col = col + '_missing'
    # Derive the flag from the real NaN mask *before* filling. Comparing against
    # the sentinel afterwards would also flag genuine values that happen to be
    # the literal string 'missing'.
    was_missing = train[col].isna()
    if flag_col in train.columns:
      # Frame was already processed (e.g. the cell was re-run): the NaNs are
      # gone by now, so keep the flags recorded on the first pass.
      was_missing = was_missing | train[flag_col].astype(bool)
    flags[flag_col] = was_missing.astype(int)

  train = train.fillna('missing')
  for flag_col, values in flags.items():
    train[flag_col] = values

  return train

# Extract Features from the text column

In [9]:
def add_features(train): # ***** fn - 2 *****
  """Add simple count-based features derived from the `text` column.

  The columns are added to `train` in place and the same frame is returned.

  Added columns
  -------------
  word_count        : number of whitespace-separated tokens
  unique_word_count : number of distinct whitespace-separated tokens
  mean_word_length  : average token length (0.0 when the text has no tokens)
  char_count        : number of characters
  punctuation_count : number of characters found in `string.punctuation`
  alphabets         : number of characters for which `str.isalpha()` is true
  digits            : number of characters for which `str.isdigit()` is true

  Parameters
  ----------
  train : pandas.DataFrame
      Frame with a `text` column. Every value is passed through `str()`
      first, so non-string entries do not raise.

  Returns
  -------
  pandas.DataFrame
      The same frame object, with the columns above added.
  """
  punctuation = set(string.punctuation)

  # Normalise and tokenise once, then reuse for every feature. Applying str()
  # up front keeps all seven features consistent for non-string values.
  texts = train['text'].apply(lambda x: str(x))
  tokens = texts.apply(lambda t: t.split())

  train['word_count'] = tokens.apply(len)
  train['unique_word_count'] = tokens.apply(lambda ws: len(set(ws)))
  # np.mean([]) yields NaN plus a RuntimeWarning; report 0.0 for token-less text.
  train['mean_word_length'] = tokens.apply(
      lambda ws: float(np.mean([len(w) for w in ws])) if ws else 0.0)
  train['char_count'] = texts.apply(len)
  train['punctuation_count'] = texts.apply(lambda t: sum(1 for c in t if c in punctuation))
  train['alphabets'] = texts.apply(lambda t: sum(c.isalpha() for c in t))
  train['digits'] = texts.apply(lambda t: sum(c.isdigit() for c in t))

  return train

# Feature Extraction Using spaCy
 

In [10]:
# Load the spaCy "en_core_web_sm" pipeline; the helper functions below call this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")

### spaCy Helper Functions

In [11]:
def noun_extract(ss):
    """Return the space-joined tokens of `ss` whose `pos_` is 'NOUN' or 'PROPN'."""
    wanted = ('NOUN', 'PROPN')
    picked = [str(token) for token in nlp(ss) if token.pos_ in wanted]
    return ' '.join(picked)

def propn_extract(ss):
    """Return the space-joined tokens of `ss` whose `pos_` is 'PROPN'."""
    picked = [str(token) for token in nlp(ss) if token.pos_ == 'PROPN']
    return ' '.join(picked)

def noun_only_extract(d):
    """Return the space-joined tokens of `d` whose `pos_` is 'NOUN'."""
    picked = [str(token) for token in nlp(d) if token.pos_ == 'NOUN']
    return ' '.join(picked)

def verbs_extract(d):
    """Return the space-joined tokens of `d` whose `pos_` is 'VERB'."""
    picked = [str(token) for token in nlp(d) if token.pos_ == 'VERB']
    return ' '.join(picked)

In [12]:
def noun_len(ss):
    """Return how many tokens of `ss` have `pos_` equal to 'NOUN' or 'PROPN'."""
    wanted = ('NOUN', 'PROPN')
    return sum(1 for token in nlp(ss) if token.pos_ in wanted)

def propn_len(ss):
    """Return how many tokens of `ss` have `pos_` equal to 'PROPN'."""
    return sum(1 for token in nlp(ss) if token.pos_ == 'PROPN')

def noun_only_len(d):
    """Return how many tokens of `d` have `pos_` equal to 'NOUN'."""
    return sum(1 for token in nlp(d) if token.pos_ == 'NOUN')

def verbs_len(d):
    """Return how many tokens of `d` have `pos_` equal to 'VERB'."""
    return sum(1 for token in nlp(d) if token.pos_ == 'VERB')

In [13]:
def spacy_features(train): # ***** fn - 3 *****
  """Add part-of-speech based text and count columns derived from `text`.

  Each text is parsed once with the module-level `nlp` pipeline (batched via
  `nlp.pipe`) and all eight columns are derived from that single parse,
  instead of re-parsing the same text once per output column.

  Added columns
  -------------
  all_nouns, special_nouns, normal_nouns, verbs :
      space-joined tokens whose `pos_` is NOUN/PROPN, PROPN, NOUN and VERB
      respectively.
  all_nouns_len, special_nouns_len, normal_nouns_len, verbs_len :
      the number of tokens in the matching column above.

  Parameters
  ----------
  train : pandas.DataFrame
      Frame with a string `text` column; modified in place.

  Returns
  -------
  pandas.DataFrame
      The same frame object, with the columns above added.
  """
  pos_groups = (
      ('all_nouns', ('NOUN', 'PROPN')),
      ('special_nouns', ('PROPN',)),
      ('normal_nouns', ('NOUN',)),
      ('verbs', ('VERB',)),
  )
  joined = {name: [] for name, _ in pos_groups}
  counts = {name: [] for name, _ in pos_groups}

  # Single pass over the corpus; only the derived strings and counts are kept,
  # so the parsed Doc objects do not accumulate in memory.
  for doc in nlp.pipe(train['text']):
    for name, tags in pos_groups:
      picked = [str(token) for token in doc if token.pos_ in tags]
      joined[name].append(' '.join(picked))
      counts[name].append(len(picked))

  # Assign text columns first and length columns second so the column order
  # stays: four text columns followed by four *_len columns.
  for name, _ in pos_groups:
    train[name] = joined[name]
  for name, _ in pos_groups:
    train[name + '_len'] = counts[name]

  return train

# CLEANING

In [14]:
def cleaning(r):
    """Lower-case `r`, then strip punctuation, 'http...' runs and digit runs, in that order."""
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = r.lower()
    without_punct = lowered.translate(strip_punctuation)  # drop every punctuation character
    without_urls = re.sub(r"http\S+", "", without_punct)  # drop 'http' followed by non-space characters
    return re.sub('[0-9]+', '', without_urls)  # drop runs of digits

In [15]:
from spacy.lang.en.stop_words import STOP_WORDS

# Load the pipeline by its full package name. The bare 'en' shortcut depended on a
# shortcut link, which spaCy v3 no longer supports; "en_core_web_sm" is also the
# model the POS-feature helpers earlier in this notebook load, so every step
# shares the same model.
nlp = spacy.load("en_core_web_sm")
# English stop words shipped with spaCy, as a list.
stop_words = list(STOP_WORDS)

def stopword_clean(r):
    """Return `r` with every whitespace-separated token found in `stop_words` removed."""
    kept = [word for word in r.split() if word not in stop_words]
    return ' '.join(kept)

In [16]:
def lemmatize(r):
    """Return the `lemma_` of every token `nlp` produces for `r`, joined by single spaces."""
    return ' '.join(token.lemma_ for token in nlp(r))

In [17]:
def pronoun_clean(r):
    """Remove every '-PRON-' marker from `r` and normalise the whitespace left behind.

    '-PRON-' is presumably the pronoun placeholder lemma produced by the
    lemmatizer (spaCy v2 behaviour - confirm against the installed version).

    A single `replace('  ', ' ')` pass only halves each run of spaces, so two
    adjacent markers left a double space behind and a marker at either end
    left a stray leading/trailing space. Splitting and re-joining collapses
    every whitespace run to one space and trims both ends.

    Parameters
    ----------
    r : str
        Lemmatized text.

    Returns
    -------
    str
        The text without '-PRON-' markers, tokens separated by single spaces.
    """
    return ' '.join(r.replace('-PRON-', '').split())

In [18]:
def all_clean(train): # ***** fn - 4 *****
  """Run the text-cleaning chain, storing each stage in its own column.

  Stages, each applied to the previous stage's column:
  `text` -> `simple_clean` -> `no_stopword` -> `lemmatized` -> `no_pronoun`.
  The frame is modified in place and returned.
  """
  stages = (
      ('text', 'simple_clean', cleaning),
      ('simple_clean', 'no_stopword', stopword_clean),
      ('no_stopword', 'lemmatized', lemmatize),
      ('lemmatized', 'no_pronoun', pronoun_clean),
  )
  for source_col, target_col, transform in stages:
    train[target_col] = train[source_col].apply(transform)

  return train

# Feature Engineering of Train data

In [19]:
# Run the four feature-engineering stages on the training frame, in order.
for stage in (missing_vals_clean, add_features, spacy_features, all_clean):
    train = stage(train)

# Feature Engineering of Test data

In [20]:
# Run the same four feature-engineering stages on the test frame, in order.
for stage in (missing_vals_clean, add_features, spacy_features, all_clean):
    test = stage(test)

# Save the Features in drive for future use

In [21]:
# Write the engineered training frame to a CSV in the current working directory
# (without the index), then copy it into the Drive folder with a shell `cp`.
# NOTE(review): the `cp` destination is a relative path, so this assumes the
# working directory is the one that contains `drive/` - confirm.
train.to_csv('tweets_train_features.csv', index=False)
!cp tweets_train_features.csv "drive/My Drive/Tweets/3. Feature Engineering/"

# Same two steps for the engineered test frame.
test.to_csv('tweets_test_features.csv', index=False)
!cp tweets_test_features.csv "drive/My Drive/Tweets/3. Feature Engineering/"

# New Features Added

In [25]:
# Show (rows, columns) for the engineered train and test frames.
tuple(frame.shape for frame in (train, test))

((7613, 25), (3263, 24))

In [26]:
# Preview the first five rows of the engineered training frame.
train.head(5)

Unnamed: 0,keyword,location,text,target,keyword_missing,location_missing,word_count,unique_word_count,mean_word_length,char_count,punctuation_count,alphabets,digits,all_nouns,special_nouns,normal_nouns,verbs,all_nouns_len,special_nouns_len,normal_nouns_len,verbs_len,simple_clean,no_stopword,lemmatized,no_pronoun
0,missing,missing,Our Deeds are the Reason of this #earthquake M...,1,1,1,13,13,4.384615,69,1,56,0,Deeds Reason # earthquake,,Deeds Reason # earthquake,May ALLAH Forgive,4,0,4,3,our deeds are the reason of this earthquake ma...,deeds reason earthquake allah forgive,deeds reason earthquake allah forgive,deeds reason earthquake allah forgive
1,missing,missing,Forest fire near La Ronge Sask. Canada,1,1,1,7,7,4.571429,38,1,31,0,Forest fire La Ronge Sask Canada,La Ronge Sask Canada,Forest fire,,6,4,2,0,forest fire near la ronge sask canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada
2,missing,missing,All residents asked to 'shelter in place' are ...,1,1,1,22,20,5.090909,133,3,109,0,residents place officers evacuation shelter pl...,,residents place officers evacuation shelter pl...,asked shelter notified expected,7,0,7,4,all residents asked to shelter in place are be...,residents asked shelter place notified officer...,resident ask shelter place notify officer evac...,resident ask shelter place notify officer evac...
3,missing,missing,"13,000 people receive #wildfires evacuation or...",1,1,1,8,8,7.125,65,2,50,5,people # wildfires evacuation orders California,California,people # wildfires evacuation orders,receive,6,1,5,1,people receive wildfires evacuation orders in...,people receive wildfires evacuation orders cal...,people receive wildfire evacuation order calif...,people receive wildfire evacuation order calif...
4,missing,missing,Just got sent this photo from Ruby #Alaska as ...,1,1,1,16,15,4.5,88,2,70,0,photo Ruby # Alaska smoke wildfires school,Ruby # Alaska,photo smoke wildfires school,got sent pours,7,3,4,3,just got sent this photo from ruby alaska as s...,got sent photo ruby alaska smoke wildfires pou...,get send photo ruby alaska smoke wildfires pou...,get send photo ruby alaska smoke wildfires pou...


In [27]:
# Preview the first five rows of the engineered test frame.
test.head(5)

Unnamed: 0,keyword,location,text,keyword_missing,location_missing,word_count,unique_word_count,mean_word_length,char_count,punctuation_count,alphabets,digits,all_nouns,special_nouns,normal_nouns,verbs,all_nouns_len,special_nouns_len,normal_nouns_len,verbs_len,simple_clean,no_stopword,lemmatized,no_pronoun
0,missing,missing,Just happened a terrible car crash,1,1,6,6,4.833333,34,0,29,0,car crash,,car crash,happened,2,0,2,1,just happened a terrible car crash,happened terrible car crash,happen terrible car crash,happen terrible car crash
1,missing,missing,"Heard about #earthquake is different cities, s...",1,1,9,9,6.222222,64,3,53,0,earthquake cities,,earthquake cities,Heard # stay,2,0,2,3,heard about earthquake is different cities sta...,heard earthquake different cities stay safe,hear earthquake different city stay safe,hear earthquake different city stay safe
2,missing,missing,"there is a forest fire at spot pond, geese are...",1,1,19,19,4.105263,96,2,76,0,forest fire spot pond geese street,spot pond geese,forest fire street,fleeing can save,6,3,3,3,there is a forest fire at spot pond geese are ...,forest fire spot pond geese fleeing street save,forest fire spot pond geese flee street save,forest fire spot pond geese flee street save
3,missing,missing,Apocalypse lighting. #Spokane #wildfires,1,1,4,4,9.25,40,3,34,0,lighting Spokane # wildfires,Spokane,lighting # wildfires,,4,1,3,0,apocalypse lighting spokane wildfires,apocalypse lighting spokane wildfires,apocalypse light spokane wildfire,apocalypse light spokane wildfire
4,missing,missing,Typhoon Soudelor kills 28 in China and Taiwan,1,1,8,8,4.75,45,0,36,2,Typhoon Soudelor China Taiwan,Typhoon Soudelor China Taiwan,,kills,4,4,0,1,typhoon soudelor kills in china and taiwan,typhoon soudelor kills china taiwan,typhoon soudelor kill china taiwan,typhoon soudelor kill china taiwan
