# Feature Generation

In [1]:
import pandas as pd
import numpy as np
import re, string

from tqdm import tqdm_notebook as tqdm

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
stopwords = set(stopwords.words("english"))

## Functions for extracting Date Time Features and Cleaning Text Data

In [8]:
def get_datetime(df):
    df['date'] = df['send_date'].str.slice(0,2)
    df['month'] = df['send_date'].str.slice(3,5)
    df['year'] = df['send_date'].str.slice(6,11)
    df['hour'] = df['send_date'].str.slice(11,13)
    df['mins'] = df['send_date'].str.slice(14,16)
    df.drop(['send_date','id'],axis=1,inplace=True)
    col = ['mins', 'hour', 'date', 'month', 'year'] 
    df[col] = df[col].astype('int')
    return df

def concat_camp(df, camp, camp_cols):
    df[camp_cols] = camp[camp['campaign_id'] == 29][camp_cols]
    for i in tqdm(range(29, 81)):
        df.loc[df['campaign_id']==i, camp_cols] = np.array(camp.loc[camp['campaign_id']==i, camp_cols])
    return df

def cleanData(text, stemming = False, lemmatize=False):    
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in text.split()])
    if lemmatize:
        wordnet_lemmatizer = WordNetLemmatizer()
        txt = " ".join([wordnet_lemmatizer.lemmatize(w) for w in text.split()])
    return text

def concat_camp_other(df, camp, cam_cols):
    df[cap_cols] = camp[camp['campaign_id'] == 29][cam_cols]
    for i in tqdm(range(29, 81)):
        df.loc[df['campaign_id']==i, cam_cols] = np.array(camp.loc[camp['campaign_id']==i, cam_cols])
    return df

## Function for extracting Features from Text Data

In [None]:
def get_features(df):
    df['count_sent']=df["subject"].apply(lambda x: len(re.findall("\n",str(x)))+1)
    df['count_word']=df["subject"].apply(lambda x: len(str(x).split()))
    df['count_unique_word']=df["subject"].apply(lambda x: len(set(str(x).split())))
    df['count_letters']=df["subject"].apply(lambda x: len(str(x)))
    df["count_punctuations"] =df["subject"].apply(lambda x: len([c for c in str(x) if c in 
                                                                      string.punctuation]))
    df["count_stopwords"] = df["subject"].apply(lambda x: len([w for w in str(x).lower().split() 
                                                                    if w in stopwords]))
    df["mean_word_len"] = df["subject"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    df['word_unique_percent']=df['count_unique_word']*100/df['count_word']
    df['punct_percent']=df['count_punctuations']*100/df['count_word']
    
    df['email_count_word']=df["email_body"].apply(lambda x: len(str(x).split()))
    df['email_count_unique_word']=df["email_body"].apply(lambda x: len(set(str(x).split())))
    df['email_count_letters']=df["email_body"].apply(lambda x: len(str(x)))
    df["email_count_punctuations"] =df["email_body"].apply(lambda x: len([c for c in str(x) if c in 
                                                                      string.punctuation]))
    df['email_cap_count'] = df['email_body'].apply(lambda x: len(re.findall(r'[A-Z]', x)))
    
    return df

## Day of Week and Time of Day

In [11]:
from datetime import datetime

def get_wday_tday(df):
    day_of_week = lambda x: datetime.strptime(x, "%d-%m-%Y %H:%M").weekday()
    df['day_of_week'] = df['send_date'].map(day_of_week)
    times_of_day = [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5]
    time_of_day = lambda x: times_of_day[datetime.strptime(x, "%d-%m-%Y %H:%M").hour]
    df['time_of_day'] = df['send_date'].map(time_of_day)
    df['time_of_day'] = df['time_of_day'].astype('int')
    df['day_of_week'] = df['day_of_week'].astype('int')
    return df

## Read Train and Test Set

In [2]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
train.head()

Unnamed: 0,id,user_id,campaign_id,send_date,is_open,is_click
0,42_14051,14051,42,01-09-2017 19:55,0,0
1,52_134438,134438,52,02-11-2017 12:53,0,0
2,33_181789,181789,33,24-07-2017 15:15,0,0
3,44_231448,231448,44,05-09-2017 11:36,0,0
4,29_185580,185580,29,01-07-2017 18:01,0,0


## Read Campaign Data

In [13]:
camp = pd.read_csv('input/campaign_data.csv')
camp_cols = ['communication_type', 'total_links', 'no_of_internal_links', 'no_of_images', 'no_of_sections', 
             'email_body', 'subject', 'email_url']
camp = get_features(camp)
camp.head()

Unnamed: 0,campaign_id,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,email_body,subject,email_url
0,29,Newsletter,67,61,12,3,"Dear AVians,\r\n \r\nWe are shaping up a super...",Sneak Peek: A look at the emerging data scienc...,http://r.newsletters.analyticsvidhya.com/7um44...
1,30,Upcoming Events,18,14,7,1,"Dear AVians,\r\n \r\nAre your eager to know wh...",[July] Data Science Expert Meetups & Competiti...,http://r.newsletters.analyticsvidhya.com/7up0e...
2,31,Conference,15,13,5,1,Early Bird Pricing Till August 07  Save upto ...,Last chance to convince your boss before the E...,http://r.newsletters.analyticsvidhya.com/7usym...
3,32,Conference,24,19,7,1,\r\n \r\nHi ?\r\n \r\nBefore I dive into why y...,A.I. & Machine Learning: 5 reasons why you sho...,http://r.newsletters.analyticsvidhya.com/7uthl...
4,33,Others,7,3,1,1,Fireside Chat with DJ Patil - the master is he...,"[Delhi NCR] Fireside Chat with DJ Patil, Forme...",http://r.newsletters.analyticsvidhya.com/7uvlg...


## Generate Features from Train and Test Data

### Date Time Features

In [5]:
train = get_datetime(train)
train = concat_camp(train, camp, camp_cols)
train.head()




Unnamed: 0,user_id,campaign_id,is_open,is_click,date,month,year,hour,mins,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,email_body,subject,email_url
0,14051,42,0,0,1,9,2017,19,55,Newsletter,88.0,79.0,13.0,4.0,"September Newsletter\r\n \r\nDear AVians,\r\n ...",[September] Exciting days ahead with DataHack ...,http://r.newsletters.analyticsvidhya.com/7v3rd...
1,134438,52,0,0,2,11,2017,12,53,Newsletter,67.0,62.0,10.0,4.0,"November Newsletter\r\n \r\nDear AVians,\r\n \...",[Newsletter] Stage for DataHack Summit 2017 is...,http://r.newsletters.analyticsvidhya.com/7vtb2...
2,181789,33,0,0,24,7,2017,15,15,Others,7.0,3.0,1.0,1.0,Fireside Chat with DJ Patil - the master is he...,"[Delhi NCR] Fireside Chat with DJ Patil, Forme...",http://r.newsletters.analyticsvidhya.com/7uvlg...
3,231448,44,0,0,5,9,2017,11,36,Upcoming Events,60.0,56.0,19.0,6.0,"[September Events]\r\n \r\nDear AVians,\r\n \r...","[September] Data Science Hackathons, Meetups a...",http://r.newsletters.analyticsvidhya.com/7veam...
4,185580,29,0,0,1,7,2017,18,1,Newsletter,67.0,61.0,12.0,3.0,"Dear AVians,\r\n \r\nWe are shaping up a super...",Sneak Peek: A look at the emerging data scienc...,http://r.newsletters.analyticsvidhya.com/7um44...


In [6]:
test = get_datetime(test)
test = concat_camp(test, camp, camp_cols)
test.head()




Unnamed: 0,campaign_id,user_id,date,month,year,hour,mins,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,email_body,subject,email_url
0,63,122715,1,2,2018,22,35,Newsletter,68.0,64.0,15.0,5.0,\r\nFebruary 2018 Newsletter\r\n \r\nDear AVia...,"AVbytes, Ultimate 2018 learning path and aweso...",http://r.newsletters.analyticsvidhya.com/7whsu...
1,56,76206,2,1,2018,8,15,Newsletter,42.0,38.0,10.0,4.0,\r\nJanuary 2018 Newsletter\r\n \r\nDear AVian...,[January] Year 2018 - Bigger & Exciting challe...,http://r.newsletters.analyticsvidhya.com/7w3uc...
2,57,96189,5,1,2018,18,25,Upcoming Events,40.0,36.0,15.0,4.0,"[January - Hiring Hackathons, LearnUps, Contes...","[January 2018] Upcoming Hiring Hackathons, Lea...",http://r.newsletters.analyticsvidhya.com/7w43t...
3,56,166917,2,1,2018,8,15,Newsletter,42.0,38.0,10.0,4.0,\r\nJanuary 2018 Newsletter\r\n \r\nDear AVian...,[January] Year 2018 - Bigger & Exciting challe...,http://r.newsletters.analyticsvidhya.com/7w3uc...
4,56,172838,2,1,2018,8,12,Newsletter,42.0,38.0,10.0,4.0,\r\nJanuary 2018 Newsletter\r\n \r\nDear AVian...,[January] Year 2018 - Bigger & Exciting challe...,http://r.newsletters.analyticsvidhya.com/7w3uc...


### Encode Communication Type to Integers

In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(train['communication_type'])
train['communication_type'] = le.transform(train['communication_type'])
test['communication_type'] = le.transform(test['communication_type'])
train.head()

Unnamed: 0,user_id,campaign_id,is_open,is_click,date,month,year,hour,mins,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,email_body,subject,email_url
0,14051,42,0,0,1,9,2017,19,55,3,88.0,79.0,13.0,4.0,"September Newsletter\r\n \r\nDear AVians,\r\n ...",[September] Exciting days ahead with DataHack ...,http://r.newsletters.analyticsvidhya.com/7v3rd...
1,134438,52,0,0,2,11,2017,12,53,3,67.0,62.0,10.0,4.0,"November Newsletter\r\n \r\nDear AVians,\r\n \...",[Newsletter] Stage for DataHack Summit 2017 is...,http://r.newsletters.analyticsvidhya.com/7vtb2...
2,181789,33,0,0,24,7,2017,15,15,4,7.0,3.0,1.0,1.0,Fireside Chat with DJ Patil - the master is he...,"[Delhi NCR] Fireside Chat with DJ Patil, Forme...",http://r.newsletters.analyticsvidhya.com/7uvlg...
3,231448,44,0,0,5,9,2017,11,36,5,60.0,56.0,19.0,6.0,"[September Events]\r\n \r\nDear AVians,\r\n \r...","[September] Data Science Hackathons, Meetups a...",http://r.newsletters.analyticsvidhya.com/7veam...
4,185580,29,0,0,1,7,2017,18,1,3,67.0,61.0,12.0,3.0,"Dear AVians,\r\n \r\nWe are shaping up a super...",Sneak Peek: A look at the emerging data scienc...,http://r.newsletters.analyticsvidhya.com/7um44...


In [None]:
cam_cols = ['total_links', 'no_of_internal_links', 'no_of_images', 'no_of_sections', 'email_body', 'subject'
             , 'email_url', 'count_sent', 'count_word', 'count_unique_word', 'count_letters', 
             'count_punctuations', 'count_stopwords', 'mean_word_len', 'word_unique_percent', 'punct_percent'
             , 'email_count_word', 'email_count_unique_word', 'email_count_letters', 
             'email_count_punctuations', 'email_cap_count']

train = concat_camp_other(train, camp, cam_cols)
test = concat_camp_other(test, camp, cam_cols)

In [4]:
train.to_csv('input/train_feat.csv', index=False)
test.to_csv('input/test_feat.csv', index=False)

### Subscription Period of a User

In [2]:
train = pd.read_csv('input/train_featv3.csv')
train['year'] = 2017
test = pd.read_csv('input/test_featv3.csv')
test['year'] = 2018

In [3]:
ids = pd.read_csv('input/ids.csv')

In [4]:
pd.options.display.max_columns=50
train.head()

Unnamed: 0,user_id,is_open,is_click,date,month,hour,mins,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,email_url,count_sent,count_word,count_unique_word,count_letters,count_punctuations,count_stopwords,mean_word_len,word_unique_percent,punct_percent,email_count_word,email_count_unique_word,email_count_letters,email_count_punctuations,email_cap_count,day_of_week,count_click,count_user,click_confidence,count_is_open,is_open_confidence,email_3_similar,sub_3_similar,campaign_id,year
0,14051,0,0,1,9,19,55,3,88.0,79.0,13.0,4.0,http://r.newsletters.analyticsvidhya.com/7v3rd...,1.0,12.0,12.0,88.0,5.0,2.0,6.416667,100.0,41.666667,238.0,159.0,1382.0,37.0,58.0,4,0.0,9.0,0.0,0.0,0.0,0.0,2.0,42,2017
1,134438,0,0,2,11,12,53,3,67.0,62.0,10.0,4.0,http://r.newsletters.analyticsvidhya.com/7vtb2...,1.0,16.0,16.0,111.0,5.0,3.0,6.0,100.0,31.25,166.0,122.0,971.0,24.0,48.0,3,0.0,4.0,0.0,0.0,0.0,1.0,2.0,52,2017
2,181789,0,0,24,7,15,15,4,7.0,3.0,1.0,1.0,http://r.newsletters.analyticsvidhya.com/7uvlg...,1.0,12.0,12.0,71.0,3.0,1.0,5.0,100.0,25.0,118.0,92.0,739.0,28.0,42.0,0,0.0,7.0,0.0,0.0,0.0,1.0,1.0,33,2017
3,231448,0,0,5,9,11,36,5,60.0,56.0,19.0,6.0,http://r.newsletters.analyticsvidhya.com/7veam...,1.0,10.0,10.0,73.0,3.0,2.0,6.4,100.0,30.0,43.0,38.0,273.0,9.0,9.0,1,0.0,6.0,0.0,0.0,0.0,1.0,1.0,44,2017
4,185580,0,0,1,7,18,1,3,67.0,61.0,12.0,3.0,http://r.newsletters.analyticsvidhya.com/7um44...,1.0,11.0,11.0,55.0,2.0,3.0,4.090909,100.0,18.181818,243.0,176.0,1498.0,37.0,35.0,5,0.0,5.0,0.0,0.0,0.0,0.0,1.0,29,2017


In [5]:
data = pd.concat([train, test], axis=0)
data.head()

Unnamed: 0,campaign_id,click_confidence,communication_type,count_click,count_is_open,count_letters,count_punctuations,count_sent,count_stopwords,count_unique_word,count_user,count_word,date,day_of_week,email_3_similar,email_cap_count,email_count_letters,email_count_punctuations,email_count_unique_word,email_count_word,email_url,hour,is_click,is_open,is_open_confidence,mean_word_len,mins,month,no_of_images,no_of_internal_links,no_of_sections,punct_percent,sub_3_similar,time_of_day,total_links,user_id,word_unique_percent,year
0,42,0.0,3,0.0,0.0,88.0,5.0,1.0,2.0,12.0,9.0,12.0,1,4,0.0,58.0,1382.0,37.0,159.0,238.0,http://r.newsletters.analyticsvidhya.com/7v3rd...,19,0.0,0.0,0.0,6.416667,55,9,13.0,79.0,4.0,41.666667,2.0,,88.0,14051,100.0,2017
1,52,0.0,3,0.0,0.0,111.0,5.0,1.0,3.0,16.0,4.0,16.0,2,3,1.0,48.0,971.0,24.0,122.0,166.0,http://r.newsletters.analyticsvidhya.com/7vtb2...,12,0.0,0.0,0.0,6.0,53,11,10.0,62.0,4.0,31.25,2.0,,67.0,134438,100.0,2017
2,33,0.0,4,0.0,0.0,71.0,3.0,1.0,1.0,12.0,7.0,12.0,24,0,1.0,42.0,739.0,28.0,92.0,118.0,http://r.newsletters.analyticsvidhya.com/7uvlg...,15,0.0,0.0,0.0,5.0,15,7,1.0,3.0,1.0,25.0,1.0,,7.0,181789,100.0,2017
3,44,0.0,5,0.0,0.0,73.0,3.0,1.0,2.0,10.0,6.0,10.0,5,1,1.0,9.0,273.0,9.0,38.0,43.0,http://r.newsletters.analyticsvidhya.com/7veam...,11,0.0,0.0,0.0,6.4,36,9,19.0,56.0,6.0,30.0,1.0,,60.0,231448,100.0,2017
4,29,0.0,3,0.0,0.0,55.0,2.0,1.0,3.0,11.0,5.0,11.0,1,5,0.0,35.0,1498.0,37.0,176.0,243.0,http://r.newsletters.analyticsvidhya.com/7um44...,18,0.0,0.0,0.0,4.090909,1,7,12.0,61.0,3.0,18.181818,1.0,,67.0,185580,100.0,2017


In [6]:
data = data.sort_values(by=['year', 'month', 'date'])
data.head()

Unnamed: 0,campaign_id,click_confidence,communication_type,count_click,count_is_open,count_letters,count_punctuations,count_sent,count_stopwords,count_unique_word,count_user,count_word,date,day_of_week,email_3_similar,email_cap_count,email_count_letters,email_count_punctuations,email_count_unique_word,email_count_word,email_url,hour,is_click,is_open,is_open_confidence,mean_word_len,mins,month,no_of_images,no_of_internal_links,no_of_sections,punct_percent,sub_3_similar,time_of_day,total_links,user_id,word_unique_percent,year
4,29,0.0,3,0.0,0.0,55.0,2.0,1.0,3.0,11.0,5.0,11.0,1,5,0.0,35.0,1498.0,37.0,176.0,243.0,http://r.newsletters.analyticsvidhya.com/7um44...,18,0.0,0.0,0.0,4.090909,1,7,12.0,61.0,3.0,18.181818,1.0,,67.0,185580,100.0,2017
17,29,0.0,3,0.0,4.0,55.0,2.0,1.0,3.0,11.0,7.0,11.0,1,5,0.0,35.0,1498.0,37.0,176.0,243.0,http://r.newsletters.analyticsvidhya.com/7um44...,18,0.0,0.0,0.571429,4.090909,5,7,12.0,61.0,3.0,18.181818,1.0,,67.0,103665,100.0,2017
61,29,0.0,3,0.0,0.0,55.0,2.0,1.0,3.0,11.0,9.0,11.0,1,5,0.0,35.0,1498.0,37.0,176.0,243.0,http://r.newsletters.analyticsvidhya.com/7um44...,18,0.0,0.0,0.0,4.090909,4,7,12.0,61.0,3.0,18.181818,1.0,,67.0,219721,100.0,2017
83,29,0.125,3,1.0,1.0,55.0,2.0,1.0,3.0,11.0,8.0,11.0,1,5,0.0,35.0,1498.0,37.0,176.0,243.0,http://r.newsletters.analyticsvidhya.com/7um44...,18,0.0,0.0,0.125,4.090909,9,7,12.0,61.0,3.0,18.181818,1.0,,67.0,164941,100.0,2017
92,29,0.25,3,1.0,2.0,55.0,2.0,1.0,3.0,11.0,4.0,11.0,1,5,0.0,35.0,1498.0,37.0,176.0,243.0,http://r.newsletters.analyticsvidhya.com/7um44...,18,0.0,0.0,0.5,4.090909,3,7,12.0,61.0,3.0,18.181818,1.0,,67.0,159970,100.0,2017


In [14]:
ids['end_date'] = 0
ids['end_month'] = 0
ids['end_year'] = 0

In [22]:
for i in tqdm(ids.index):
    df = data[data['user_id'] == ids.loc[i, 'ids']]
    ids.loc[i, ['start_year', 'start_month','start_date']] = np.array(df.iloc[0][['year', 'month', 'date']])
    ids.loc[i, ['end_year', 'end_month','end_date']] = np.array(df.iloc[-1][['year', 'month', 'date']])





In [23]:
ids.head()

Unnamed: 0,ids,count_click,count_user,click_confidence,start_year,start_month,start_date,end_date,end_month,end_year
0,14051,0.0,9.0,0.0,2017,7,5,9,3,2018
1,134438,0.0,4.0,0.0,2017,10,2,1,3,2018
2,181789,0.0,7.0,0.0,2017,7,1,7,3,2018
3,231448,0.0,6.0,0.0,2017,8,1,5,3,2018
4,185580,0.0,5.0,0.0,2017,7,1,6,2,2018


In [24]:
from datetime import date

ids['sub_peroid']=0
ids.head()

Unnamed: 0,ids,count_click,count_user,click_confidence,start_year,start_month,start_date,end_date,end_month,end_year,sub_peroid
0,14051,0.0,9.0,0.0,2017,7,5,9,3,2018,0
1,134438,0.0,4.0,0.0,2017,10,2,1,3,2018,0
2,181789,0.0,7.0,0.0,2017,7,1,7,3,2018,0
3,231448,0.0,6.0,0.0,2017,8,1,5,3,2018,0
4,185580,0.0,5.0,0.0,2017,7,1,6,2,2018,0


In [29]:
for i in tqdm(ids.index):
    d0 = date(ids.loc[i, 'start_year'], ids.loc[i, 'start_month'], ids.loc[i, 'start_date'])
    d1 = date(ids.loc[i, 'end_year'], ids.loc[i, 'end_month'], ids.loc[i, 'end_date'])
    delta = d1-d0
    ids.loc[i, 'sub_peroid']=delta.days 




In [37]:
ids.head()

Unnamed: 0,ids,count_click,count_user,click_confidence,start_year,start_month,start_date,end_date,end_month,end_year,sub_peroid
0,14051,0.0,9.0,0.0,2017,7,5,9,3,2018,247
1,134438,0.0,4.0,0.0,2017,10,2,1,3,2018,150
2,181789,0.0,7.0,0.0,2017,7,1,7,3,2018,249
3,231448,0.0,6.0,0.0,2017,8,1,5,3,2018,216
4,185580,0.0,5.0,0.0,2017,7,1,6,2,2018,220


In [30]:
ids.to_csv('input/ids.csv')

In [31]:
train['sub_period'] = 0
test['sub_period'] = 0

In [None]:
for i in tqdm(ids.index):
    train.loc[train['user_id'] == ids.loc[i, 'ids'], 'sub_period'] = ids.loc[i, 'sub_peroid'] 
    test.loc[test['user_id'] == ids.loc[i, 'ids'], 'sub_period'] = ids.loc[i, 'sub_peroid']

182668/|/ 83%|| 182668/220718 [1:07:41<14:06, 44.97it/s]

## Prominent Features Generated
- Date 
- Time (in minutes)
- Day of Week
- Communication Type
- Total Links
- No of Internal Links, No of Images
- Subject - Count of Sentences, Letters, Punctuations and Stopwords
- Subject - Unique Word Percentage
- Subject - Punctuation Percentage
- Email - Count of Word, Punctuation and Capital Letters
- Count Click
- Count User
- Click Confidence
- Count of People Opening the Mail
- Open Confidence
- Email Similarity, Subject Similarity
- Subscription Period
- Communication Type Click Percentage
- Count User Frequency
- Sentiment of Mail

## Save All Features

In [None]:
train.to_csv('input/train_featv3.csv', index=False)
test.to_csv('input/test_featv3.csv', index=False)

# End