In [5]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
pd.options.display.max_colwidth = 200

from sklearn.preprocessing import OneHotEncoder , LabelEncoder
import textblob

import os
os.environ['PYTHONHASHSEED'] = '10000'
np.random.seed(10001)
import random
import tensorflow as tf
random.seed(10002)
session_conf = tf.ConfigProto(intra_op_parallelism_threads=6, inter_op_parallelism_threads=5)
from keras import backend

tf.set_random_seed(10003)
backend.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))

%matplotlib inline

In [93]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
train.shape , test.shape

le = LabelEncoder()
train['sentiment_class']= le.fit_transform(train['sentiment_class'])
enc=OneHotEncoder(sparse=False)
enc.fit(train['sentiment_class'].values.reshape(-1,1))
print('number of classes',enc.n_values_[0])

print('class distribution\n',train['sentiment_class'].value_counts()/train.shape[0])
train.head(5)

number of classes 3
class distribution
 1    0.525811
0    0.237713
2    0.236476
Name: sentiment_class, dtype: float64


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,id,original_text,lang,retweet_count,original_author,sentiment_class
0,1.245025e+18,Happy #MothersDay to all you amazing mothers out there! I know it's hard not being able to see your mothers today but it's on all of us to do what we can to protect the most vulnerable members of ...,en,0,BeenXXPired,1
1,1.245759e+18,Happy Mothers Day Mum - I'm sorry I can't be there to bring you Mothers day flowers & a cwtch - honestly at this point I'd walk on hot coals to be able to. But I'll be there with bells on as soon ...,en,1,FestiveFeeling,1
2,1.246087e+18,"Happy mothers day To all This doing a mothers days work. Today been quiet but Had time to reflect. Dog walk, finish a jigsaw do the garden, learn few more guitar chords, drunk some strawberry gin ...",en,0,KrisAllenSak,0
3,1.244803e+18,Happy mothers day to this beautiful woman...royalty soothes you mummy jeremy and emerald and more #PrayForRoksie #UltimateLoveNG pic.twitter.com/oeetI22Pvv,en,0,Queenuchee,1
4,1.244876e+18,"Remembering the 3 most amazing ladies who made me who I am! My late grandmother iris, mum carol and great grandmother Ethel. Missed but never forgotten! Happy mothers day to all those great mums o...",en,0,brittan17446794,0


In [94]:
train.shape

(3235, 6)

words in test set occurs in train set

In [95]:
from sklearn.feature_extraction.text import CountVectorizer

cv1 = CountVectorizer()
cv1.fit(train["original_text"])

cv2 = CountVectorizer()
cv2.fit(test["original_text"])

print("Train Set Vocabulary Size:", len(cv1.vocabulary_))
print("Test Set Vocabulary Size:", len(cv2.vocabulary_))
print("Number of Words that occur in both:", len(set(cv1.vocabulary_.keys()).intersection(set(cv2.vocabulary_.keys()))))

Train Set Vocabulary Size: 14852
Test Set Vocabulary Size: 7948
Number of Words that occur in both: 3187


In [96]:
train.sentiment_class.value_counts()

1    1701
0     769
2     765
Name: sentiment_class, dtype: int64

In [97]:
length = []
[length.append(len(str(text))) for text in train['original_text']]
train['length'] = length
train.head()

Unnamed: 0,id,original_text,lang,retweet_count,original_author,sentiment_class,length
0,1.245025e+18,Happy #MothersDay to all you amazing mothers out there! I know it's hard not being able to see your mothers today but it's on all of us to do what we can to protect the most vulnerable members of ...,en,0,BeenXXPired,1,252
1,1.245759e+18,Happy Mothers Day Mum - I'm sorry I can't be there to bring you Mothers day flowers & a cwtch - honestly at this point I'd walk on hot coals to be able to. But I'll be there with bells on as soon ...,en,1,FestiveFeeling,1,301
2,1.246087e+18,"Happy mothers day To all This doing a mothers days work. Today been quiet but Had time to reflect. Dog walk, finish a jigsaw do the garden, learn few more guitar chords, drunk some strawberry gin ...",en,0,KrisAllenSak,0,298
3,1.244803e+18,Happy mothers day to this beautiful woman...royalty soothes you mummy jeremy and emerald and more #PrayForRoksie #UltimateLoveNG pic.twitter.com/oeetI22Pvv,en,0,Queenuchee,1,155
4,1.244876e+18,"Remembering the 3 most amazing ladies who made me who I am! My late grandmother iris, mum carol and great grandmother Ethel. Missed but never forgotten! Happy mothers day to all those great mums o...",en,0,brittan17446794,0,254


In [98]:
min(train['length']), max(train['length']), round(sum(train['length'])/len(train['length']))

(73, 728, 227)

In [101]:
len(train[train['length'] > 350])

5

In [102]:
len(train[train['length'] < 100])

4

In [103]:
train['original_text'][train['length'] > 350]

379     Happy Mother's Day to all the wonderful mothers - new, old and never forgotten. #MothersDay Our Rotunda gallery features an oil painting of the Queen Mother. You can explore the portrait on our on...
996     Happy Mothers Day from a Limerick family #ilovelimerick #lovelimerick #lovinlimerick https://www. facebook.com/ilovelimerick/ videos/196400795141787/?__tn__=kC-R&eid=ARD84aop8ara8GJPEbCe17J9_V2KrQ...
1599    Happy #Mothersday to all of the wonderful mums out there! What did you get your mum this year? Hopefully it was something both useful and #beautiful ...something like our Scandi saucepans! Here on...
2432    The answer to yesterday’s nature quiz was buff-tip moth. Happy MOTHers day !! The buff-tip looks just like the twig of a birch tree when it is at rest! Read more here: https://www. bbcwildlife.org...
2448    Happy #MothersDay to this fab #womxn ! She’s the womxn that brings the most love & light into my life. Thank you, #Mama ! I’m SO grateful for you and for everyt

In [104]:
# dropping the outliers
train = train.drop(train['original_text'][train['length'] < 100].index, axis = 0)
# dropping the outliers
train = train.drop(train['original_text'][train['length'] > 350].index, axis = 0)

# Data Augmentation for text

In [105]:
from nltk import sent_tokenize
import json
random.seed(1994)
def tokenize(text):
    '''text: list of text documents'''
    tokenized =  sent_tokenize(text)
    return tokenized

def shuffle_tokenized(text):
    random.shuffle(text)
    newl=list(text)
    shuffled.append(newl)
    return text
df_train=train[['original_text','sentiment_class']]

In [106]:
augmented = []
reps=[]
for ng_rev in df_train[df_train.sentiment_class==2].original_text:
    tok = tokenize(ng_rev)
    shuffled= [tok]
    #print(ng_rev)
    for i in range(2):
    #generate 11 new reviews
        shuffle_tokenized(shuffled[-1])
    for k in shuffled:
        '''create new review by joining the shuffled sentences'''
        s = ' '
        new_rev = s.join(k)
        if new_rev not in augmented:
            augmented.append(new_rev)
        else:
            reps.append(new_rev)
df2=pd.DataFrame({'original_text':augmented,'sentiment_class':[2]*len(augmented)})
print(df2.shape)
df2.head()

(1285, 2)


Unnamed: 0,original_text,sentiment_class
0,Happy Mother’s Day to all you wonderful mums out there! instagram.com/p/B-CUzHjg-Au/ ?igshid=lf3elmlv7q7g … #mothersday #cupcakes #supermums #traceyscakecraft #couturesugarpaste #saracinouk #hinck...,2
1,Show me a mother of two as sexy as Rosie Happy Mothers Day beautiful Rosie @officiallrosie I love you #HappyMothersDayRosie #HappyMothersDay2020 #WhyILoveRoksie #Roksie pic.twitter.com/6oMZGHcIVe,2
2,"To all the mothers, grandmothers, aunts, sisters and women in our lives who care for and love us unconditionally, Happy Mothers’ Day ! #MothersDay pic.twitter.com/ejatt1aH4z",2
3,"Then click the Bell icon to get notifications of new videos. https:// buff.ly/2QAIlzV pic.twitter.com/Askk1nKFvu HAPPY MOTHERS DAY , CHRISTINE | DAILY LIFE For the full video click the link below....",2
4,"Then click the Bell icon to get notifications of new videos. If you enjoy this video, please share it and click Subscribe on YouTube for more like it. https:// buff.ly/2QAIlzV pic.twitter.com/Askk...",2


In [61]:
augmented = []
reps=[]
for ng_rev in df_train[df_train.sentiment_class==0].original_text:
    tok = tokenize(ng_rev)
    shuffled= [tok]
    #print(ng_rev)
    for i in range(2):
    #generate 11 new reviews
        shuffle_tokenized(shuffled[-1])
    for k in shuffled:
        '''create new review by joining the shuffled sentences'''
        s = ' '
        new_rev = s.join(k)
        if new_rev not in augmented:
            augmented.append(new_rev)
        else:
            reps.append(new_rev)
df0=pd.DataFrame({'original_text':augmented,'sentiment_class':[0]*len(augmented)})
print(df0.shape)
df0.head()

(1283, 2)


Unnamed: 0,original_text,sentiment_class
0,"Today been quiet but Had time to reflect. Dog walk, finish a jigsaw do the garden, learn few more guitar chords, drunk some strawberry gin and tonic and watch Lee evens on DVD. My favourite place ...",0
1,"Today been quiet but Had time to reflect. My favourite place to visit. #isolate pic.twitter.com/GZ0xVvF6f9 Dog walk, finish a jigsaw do the garden, learn few more guitar chords, drunk some strawbe...",0
2,"Missed but never forgotten! My late grandmother iris, mum carol and great grandmother Ethel. Remembering the 3 most amazing ladies who made me who I am! Happy mothers day to all those great mums o...",0
3,"Missed but never forgotten! My late grandmother iris, mum carol and great grandmother Ethel. Happy mothers day to all those great mums out there! Remembering the 3 most amazing ladies who made me ...",0
4,"This is the 4th Round game between me and @CastigersJ Live coverage on @Twitter , maybe one day @SkySportsRL or on the OurLeague app Happy Mothers Day to everyone tuning in.",0


In [62]:
augmented = []
reps=[]
for ng_rev in df_train[df_train.sentiment_class==1].original_text:
    tok = tokenize(ng_rev)
    shuffled= [tok]
    #print(ng_rev)
    for i in range(2):
    #generate 11 new reviews
        shuffle_tokenized(shuffled[-1])
    for k in shuffled:
        '''create new review by joining the shuffled sentences'''
        s = ' '
        new_rev = s.join(k)
        if new_rev not in augmented:
            augmented.append(new_rev)
        else:
            reps.append(new_rev)
df1=pd.DataFrame({'original_text':augmented,'sentiment_class':[1]*len(augmented)})
print(df1.shape)
df1.head()

(2863, 2)


Unnamed: 0,original_text,sentiment_class
0,Happy #MothersDay to all you amazing mothers out there! I know it's hard not being able to see your mothers today but it's on all of us to do what we can to protect the most vulnerable members of ...,1
1,https:// photos.app.goo.gl/M3vXBLrsCzD4TE bY7 … Happy Mothers Day Mum - I'm sorry I can't be there to bring you Mothers day flowers & a cwtch - honestly at this point I'd walk on hot coals to be a...,1
2,https:// photos.app.goo.gl/M3vXBLrsCzD4TE bY7 … Love you lots xxx (p.s we need more photos!) Happy Mothers Day Mum - I'm sorry I can't be there to bring you Mothers day flowers & a cwtch - honestl...,1
3,Happy mothers day to this beautiful woman...royalty soothes you mummy jeremy and emerald and more #PrayForRoksie #UltimateLoveNG pic.twitter.com/oeetI22Pvv,1
4,HAPPY MOTHER’S DAY! BRATZ WORLD FAMILIEZ YASMIN & HER MUM! (PORTIA) #bratz #bratzworldfamiliez #bratz2008 #bratzkidz #bratzyasmin #bratzportia #theonlygirlswithapassionforfashion #bratzdolls #brat...,1


In [63]:
df0.shape

(1283, 2)

In [64]:
df0 = df0.append(df1)
df0 = df0.append(df2)
df0.shape

(5436, 2)

In [65]:
df0.sentiment_class.value_counts()

1    2863
2    1290
0    1283
Name: sentiment_class, dtype: int64

In [66]:
train = df0

In [67]:
train.head()

Unnamed: 0,original_text,sentiment_class
0,"Today been quiet but Had time to reflect. Dog walk, finish a jigsaw do the garden, learn few more guitar chords, drunk some strawberry gin and tonic and watch Lee evens on DVD. My favourite place ...",0
1,"Today been quiet but Had time to reflect. My favourite place to visit. #isolate pic.twitter.com/GZ0xVvF6f9 Dog walk, finish a jigsaw do the garden, learn few more guitar chords, drunk some strawbe...",0
2,"Missed but never forgotten! My late grandmother iris, mum carol and great grandmother Ethel. Remembering the 3 most amazing ladies who made me who I am! Happy mothers day to all those great mums o...",0
3,"Missed but never forgotten! My late grandmother iris, mum carol and great grandmother Ethel. Happy mothers day to all those great mums out there! Remembering the 3 most amazing ladies who made me ...",0
4,"This is the 4th Round game between me and @CastigersJ Live coverage on @Twitter , maybe one day @SkySportsRL or on the OurLeague app Happy Mothers Day to everyone tuning in.",0


In [68]:
train[train.sentiment_class==2].shape,train[train.sentiment_class==0].shape,train[train.sentiment_class==1].shape

((1290, 2), (1283, 2), (2863, 2))

In [69]:
train.sentiment_class.value_counts()

1    2863
2    1290
0    1283
Name: sentiment_class, dtype: int64

In [70]:
train['original_text'].values

array(['Today been quiet but Had time to reflect. Dog walk, finish a jigsaw do the garden, learn few more guitar chords, drunk some strawberry gin and tonic and watch Lee evens on DVD. My favourite place to visit. Happy mothers day To all This doing a mothers days work. #isolate pic.twitter.com/GZ0xVvF6f9',
       'Today been quiet but Had time to reflect. My favourite place to visit. #isolate pic.twitter.com/GZ0xVvF6f9 Dog walk, finish a jigsaw do the garden, learn few more guitar chords, drunk some strawberry gin and tonic and watch Lee evens on DVD. Happy mothers day To all This doing a mothers days work.',
       'Missed but never forgotten! My late grandmother iris, mum carol and great grandmother Ethel. Remembering the 3 most amazing ladies who made me who I am! Happy mothers day to all those great mums out there! Love sent to all xxxx pic.twitter.com/xZZZdEybjE',
       ...,
       'We love you NaLindi :Red velvet floral themed cupcakes https://www. instagram.com/p/B-DPvx4DOZF/ 

In [71]:
import string
punctuation=string.punctuation
from nltk.corpus import stopwords
stop = stopwords.words('english')
def transform(df):
    df["has_upper"] = df["original_text"].apply(lambda x: x.lower() != x)
    df['upper'] = df['original_text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
    df["sentence_end"] = df["original_text"].apply(lambda x: x.endswith("."))
    df["after_comma"] = df["original_text"].apply(lambda x: x.startswith(","))
    df["sentence_start"] = df["original_text"].apply(lambda x: "A" <= x[0] <= "Z")
    df["text"] = df["original_text"].apply(lambda x: x.lower())
    df['word_count']=df['original_text'].apply(lambda x: len(str(x).split(" ")))
    df['char_count'] = df['original_text'].str.len()
    def avg_word(sentence):
        words = sentence.split()
        return (sum(len(word) for word in words)/len(words))

    df['avg_word'] = df['original_text'].apply(lambda x: avg_word(x))
    df['stopwords'] = df['original_text'].apply(lambda x: len([x for x in x.split() if x in stop]))
    df['numerics'] = df['original_text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
    df['word_density'] = df['char_count'] / (df['word_count']+1)
    df['punctuation_count'] = df['original_text'].apply(lambda x: len("".join(_ for _ in x if _ in punctuation))) 
    return df

In [72]:
import re
import nltk
import inflect
import contractions
from textblob import TextBlob
from bs4 import BeautifulSoup
import string, unicodedata
from nltk.corpus import stopwords

def remove_urls(raw_text):
    raw_text=raw_text.strip()
    no_ment =re.sub(r'@\w+', '',raw_text )
    link = re.sub(r'https?:\/\/?.*[\r\n]*','',no_ment)
    new_word = re.sub(r'!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~', '', link)
    no_urls1 = re.sub(r'pic.*[\r\n]*','',new_word)
    try:
        no_encoding=no_urls1.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        no_encoding = no_urls1
    return(no_urls1)

def denoise_text(text):
    # Strip html if any. For ex. removing <html>, <p> tags
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    # Replace contractions in the text. For ex. didn't -> did not
    text = contractions.fix(text)
    return text
def remove_stopwords(input_text):
    stopwords_list = stopwords.words('english')
    words = input_text.split()
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["n't", "not", "no"] 
    clean_words = [word for word in input_text if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
    return(" ".join(clean_words))

def lemmatize_verbs(text):
    """Lemmatize verbs in list of tokenized words"""
    words = text.split()
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return((" ".join(lemmas))

In [73]:
train['original_text'] = train['original_text'].apply(denoise_text)
train['original_text']= train['original_text'].apply(remove_urls)
train['original_text'] = train['original_text'].apply(remove_stopwords)
train['original_text'] = train['original_text'].apply(lemmatize_verbs)

array(['Today quiet Had time reflect Dog walk finish jigsaw garden learn guitar chords drunk strawberry gin tonic watch Lee evens DVD My favourite place visit Happy mothers day To This mothers days work isolate pic twitter GZ0xVvF6f9',
       'Today quiet Had time reflect My favourite place visit isolate pic twitter GZ0xVvF6f9 Dog walk finish jigsaw garden learn guitar chords drunk strawberry gin tonic watch Lee evens DVD Happy mothers day To This mothers days work',
       'Missed never forgotten My late grandmother iris mum carol great grandmother Ethel Remembering 3 amazing ladies made I Happy mothers day great mums Love sent xxxx pic twitter xZZZdEybjE',
       ...,
       'We love NaLindi Red velvet floral themed cupcakes instagram p B DPvx4DOZF igshid 1b5l1byb8c8mb HAPPY MOTHERS DAY This one yummy treat yummy mummies meet These inspired truly amazing woman mother',
       'Happy Mother Day Mums Step Mums Aunties Grandmothers amazing hairbyemmabroadbent hairstylist hairdresser hai

In [74]:
train['original_text'] = train['original_text'].apply(denoise_text)
train['original_text'].values

array(['Today been quiet but Had time to reflect. Dog walk, finish a jigsaw do the garden, learn few more guitar chords, drunk some strawberry gin and tonic and watch Lee evens on DVD. My favourite place to visit. Happy mothers day To all This doing a mothers days work. #isolate pic.twitter.com/GZ0xVvF6f9',
       'Today been quiet but Had time to reflect. My favourite place to visit. #isolate pic.twitter.com/GZ0xVvF6f9 Dog walk, finish a jigsaw do the garden, learn few more guitar chords, drunk some strawberry gin and tonic and watch Lee evens on DVD. Happy mothers day To all This doing a mothers days work.',
       'Missed but never forgotten! My late grandmother iris, mum carol and great grandmother Ethel. Remembering the 3 most amazing ladies who made me who I am! Happy mothers day to all those great mums out there! Love sent to all xxxx pic.twitter.com/xZZZdEybjE',
       ...,
       'We love you NaLindi :Red velvet floral themed cupcakes https://www. instagram.com/p/B-DPvx4DOZF/ 