In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

In [3]:
# Location of the tab-separated training data.
tr_f = './Data/train.tsv'
# DataFrame.from_csv was deprecated and later removed from pandas.
# read_csv with index_col=0 keeps the first column (PhraseId) as the index,
# which is what from_csv did by default.
train = pd.read_csv(tr_f, sep='\t', index_col=0)

In [4]:
# Show every phrase row that belongs to sentence 2.
train[train.SentenceId == 2]

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
64,2,"This quiet , introspective and entertaining in...",4
65,2,"This quiet , introspective and entertaining in...",3
66,2,This,2
67,2,"quiet , introspective and entertaining indepen...",4
68,2,"quiet , introspective and entertaining",3
69,2,quiet,2
70,2,", introspective and entertaining",3
71,2,introspective and entertaining,3
72,2,introspective and,3
73,2,introspective,2


In [5]:
# Stop-word filtering is currently switched off: the NLTK English list is
# commented out and an empty list is used instead, so tokenize_stopwords
# drops no words on account of this list.
#english_sw = stopwords.words('english')
english_sw = []

In [6]:
from nltk.tokenize import RegexpTokenizer

def tokenize_stopwords(df):
    """Split one row's phrase into lowercase word tokens.

    Parameters
    ----------
    df : row-like object with a 'Phrase' entry (a DataFrame row when this
        function is used through ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    list of str
        Lowercased runs of word characters, without any token listed in
        ``english_sw`` and without the tokens 'rrb' / 'lrb' (presumably the
        -RRB- / -LRB- bracket markers of the dataset -- confirm against the data).
    """
    # \w+ keeps runs of word characters only, which also drops punctuation.
    tokenizer = RegexpTokenizer(r'\w+')
    # Build the exclusion set once per call; the previous version concatenated
    # a fresh list and lowercased each token twice for every single token.
    excluded = set(english_sw) | {'rrb', 'lrb'}
    lowered = (token.lower() for token in tokenizer.tokenize(df['Phrase']))
    return [token for token in lowered if token not in excluded]

def keep_first(group):
    """Return the 'Phrase' and 'Sentiment' of the first row of `group` as a Series."""
    wanted_columns = ("Phrase", "Sentiment")
    first_values = {column: group[column].iloc[0] for column in wanted_columns}
    return pd.Series(first_values)

In [7]:
# Work on a copy of the training frame.
# The reduction to one row per SentenceId (first row of each group, via
# keep_first) is currently commented out, so every phrase is kept.
full = train.copy()
#full = full.groupby("SentenceId").apply(keep_first)

In [8]:
# Preview the first rows of the working copy.
full.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [9]:
# Tokenize each row's phrase (row-wise apply) and store the token list in a new column.
full['Phrase tokenized'] = full.apply(tokenize_stopwords, axis=1)

In [10]:
# Preview again, now including the 'Phrase tokenized' column.
full.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase tokenized
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,..."
2,1,A series of escapades demonstrating the adage ...,2,"[a, series, of, escapades, demonstrating, the,..."
3,1,A series,2,"[a, series]"
4,1,A,2,[a]
5,1,series,2,[series]


In [11]:
# Inspect the token lists at row positions 60-69.
full['Phrase tokenized'].iloc[60:70]

PhraseId
61                                           [a, story]
62                                              [story]
63                                                   []
64    [this, quiet, introspective, and, entertaining...
65    [this, quiet, introspective, and, entertaining...
66                                               [this]
67    [quiet, introspective, and, entertaining, inde...
68            [quiet, introspective, and, entertaining]
69                                              [quiet]
70                   [introspective, and, entertaining]
Name: Phrase tokenized, dtype: object

In [12]:
# Number of rows in the working frame.
full.shape[0]

156060

### Create a dictionary of unique words, ordered from most to least frequent

In [None]:
# Create unique list: every distinct token, in order of first appearance.
# A companion set makes the "already seen?" test constant-time; testing
# membership against the growing list itself was quadratic in vocabulary size.
uniques = []
seen_words = set()
for tokens in full['Phrase tokenized']:
    for word in tokens:
        if word not in seen_words:
            seen_words.add(word)
            uniques.append(word)
print(len(uniques))

In [None]:
# Create count list: (count, word) pairs, most frequent first.
# The previous nested scan re-read every phrase once per unique word (hence
# its "PLIZ DON'T DO THIS!!!" warning). One counting pass yields the same list.
from collections import Counter

tally = Counter(word for tokens in full['Phrase tokenized'] for word in tokens)
# Descending sort on (count, word), i.e. the same order as sort() + reverse().
counts = sorted(((tally[unique], unique) for unique in uniques), reverse=True)

In [None]:
from collections import Counter
# Flatten all tokenized phrases into one list of words, then tally them.
words = [word for tokens in full['Phrase tokenized'] for word in tokens]
dic1 = Counter(words)
print(len(dic1))

In [None]:
import operator
# Rank the words from most to least frequent.
sorted_words = sorted(dic1.items(), key=operator.itemgetter(1), reverse=True)
# The whole vocabulary is kept here.
maxDictLength = len(dic1)
# Map each word to its rank, starting at 2 (index 1 is the
# out-of-vocabulary marker defined below).
word_dict = {
    pair[0]: position + 2
    for position, pair in enumerate(sorted_words[:maxDictLength])
}
oovf = 1

In [22]:
def words_to_dict(row):
    """Encode the tokens of `row["Phrase tokenized"]` as vocabulary indices.

    Each token becomes a one-element list holding its `word_dict` index, or
    `oovf` when the token is not in `word_dict`.
    """
    encoded = []
    for token in row["Phrase tokenized"]:
        encoded.append([word_dict.get(token, oovf)])
    return encoded

In [None]:
# Encode each row's tokens as vocabulary indices and store them in a new column.
full["Dict values"] = full.apply(words_to_dict, axis=1)

In [None]:
# Preview the frame with the new 'Dict values' column.
full.head()

In [None]:
# Show the index encoding of the row at position 65.
print(full["Dict values"].iloc[65])

## Using a stemmer on the data

In [18]:
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

def stemmer(row):
    """Return the English Snowball stem of every token in `row["Phrase tokenized"]`."""
    stem = SnowballStemmer('english').stem
    return list(map(stem, row["Phrase tokenized"]))

def lem_words(row):
    """Lemmatize the (word, tag) pairs stored in `row["PoS"]`.

    The tag is passed to the lemmatizer when it is truthy; otherwise the
    lemmatizer is called with the word alone.
    """
    w_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, tag in row["PoS"]:
        if tag:
            lemmas.append(w_lemmatizer.lemmatize(word, tag))
        else:
            lemmas.append(w_lemmatizer.lemmatize(word))
    return lemmas

In [14]:
# Stem each row's tokens (row-wise apply) and store the result in a new column.
full["stemmed"] = full.apply(stemmer, axis=1)

In [15]:
# Preview the frame with the new 'stemmed' column.
full.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase tokenized,stemmed
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,...","[a, seri, of, escapad, demonstr, the, adag, th..."
2,1,A series of escapades demonstrating the adage ...,2,"[a, series, of, escapades, demonstrating, the,...","[a, seri, of, escapad, demonstr, the, adag, th..."
3,1,A series,2,"[a, series]","[a, seri]"
4,1,A,2,[a],[a]
5,1,series,2,[series],[seri]


In [16]:
from nltk.corpus import wordnet as wn

def is_noun(tag):
    """Tell whether `tag` is one of the noun tags NN, NNS, NNP or NNPS."""
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return tag in noun_tags


def is_verb(tag):
    """Tell whether `tag` is one of the verb tags VB, VBD, VBG, VBN, VBP or VBZ."""
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return tag in verb_tags


def is_adverb(tag):
    """Tell whether `tag` is one of the adverb tags RB, RBR or RBS."""
    adverb_tags = ('RB', 'RBR', 'RBS')
    return tag in adverb_tags


def is_adjective(tag):
    """Tell whether `tag` is one of the adjective tags JJ, JJR or JJS."""
    adjective_tags = ('JJ', 'JJR', 'JJS')
    return tag in adjective_tags


def penn_to_wn(tag):
    """Map a part-of-speech tag to the matching WordNet constant.

    The predicates are tried in the order adjective, noun, adverb, verb.
    Returns wn.ADJ, wn.NOUN, wn.ADV or wn.VERB for the first predicate that
    accepts `tag`, and None when none of them does.
    """
    # The WordNet attribute is looked up by name only once a predicate
    # matches, so `wn` is not touched for tags that map to nothing.
    dispatch = (
        (is_adjective, 'ADJ'),
        (is_noun, 'NOUN'),
        (is_adverb, 'ADV'),
        (is_verb, 'VERB'),
    )
    for matches, wn_attribute in dispatch:
        if matches(tag):
            return getattr(wn, wn_attribute)
    return None

def pos_tagging(df):
    """Tag the tokens of `df["Phrase tokenized"]` and convert each tag with penn_to_wn.

    Returns a list of (token, converted_tag) tuples in token order.
    """
    tagged_tokens = nltk.pos_tag(df["Phrase tokenized"])
    converted = []
    for tagged in tagged_tokens:
        converted.append((tagged[0], penn_to_wn(tagged[1])))
    return converted

In [17]:
# Tag each row's tokens (row-wise apply), store the (token, tag) pairs in a
# new column, then preview the frame.
full["PoS"] = full.apply(pos_tagging, axis=1)
full.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase tokenized,stemmed,PoS
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,...","[a, seri, of, escapad, demonstr, the, adag, th...","[(a, None), (series, n), (of, None), (escapade..."
2,1,A series of escapades demonstrating the adage ...,2,"[a, series, of, escapades, demonstrating, the,...","[a, seri, of, escapad, demonstr, the, adag, th...","[(a, None), (series, n), (of, None), (escapade..."
3,1,A series,2,"[a, series]","[a, seri]","[(a, None), (series, n)]"
4,1,A,2,[a],[a],"[(a, None)]"
5,1,series,2,[series],[seri],"[(series, n)]"


In [19]:
# Lemmatize each row from its 'PoS' pairs and store the lemmas in a new column.
full["lemmatized"] = full.apply(lem_words, axis=1)

In [20]:
# Lemmatized tokens of the first five rows.
full['lemmatized'].iloc[0:5]

PhraseId
1    [a, series, of, escapade, demonstrate, the, ad...
2    [a, series, of, escapade, demonstrate, the, ad...
3                                          [a, series]
4                                                  [a]
5                                             [series]
Name: lemmatized, dtype: object

In [154]:
# Show every column of the row at position 1 (the second row).
full.iloc[1]

SentenceId                                                          1
Phrase              A series of escapades demonstrating the adage ...
Sentiment                                                           2
Phrase tokenized    [a, series, of, escapades, demonstrating, the,...
stemmed             [a, seri, of, escapad, demonstr, the, adag, th...
PoS                 [(a, None), (series, n), (of, None), (escapade...
lemmatized          [a, series, of, escapade, demonstrate, the, ad...
Dict values         [[3], [337], [4], [6122], [1408], [2], [5914],...
Name: 2, dtype: object

In [76]:
from collections import Counter
import operator

# Flatten the lemmatized phrases into one word list and count occurrences.
words = [word for lemmas in full['lemmatized'] for word in lemmas]
counts = Counter(words)
print(len(counts))

# Index used for out-of-vocabulary words.
OoV = 1
# Number of indices not available to real words: OoV, plus index 0, which is
# presumably left free for sequence padding -- confirm against the model.
nb_extraChar = 2
max_words = 10000
maxDictLength = max_words - nb_extraChar

sorted_words = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index over range(maxDictLength): indexing raised
# IndexError whenever the vocabulary held fewer than maxDictLength words.
word_dict = {
    word: position + nb_extraChar
    for position, (word, _occurrences) in enumerate(sorted_words[:maxDictLength])
}

def words_to_dict(row):
    """Encode the lemmas of `row['lemmatized']` as vocabulary indices.

    Each lemma becomes a one-element list holding its `word_dict` index, or
    `OoV` when the lemma is not in `word_dict`.
    """
    encoded = []
    for lemma in row['lemmatized']:
        encoded.append([word_dict.get(lemma, OoV)])
    return encoded

# x holds the dictionary (index) values of each phrase, y the Sentiment values.
full["Dict values"] = full.apply(words_to_dict, axis=1)
x = np.array(full["Dict values"])
y = np.array(full["Sentiment"])

13467


### Create Train and Test DataSets

In [110]:
from keras.preprocessing import sequence

# Pad / truncate every encoded phrase to a length of 30.
x = sequence.pad_sequences(x, 30)
# Collapse to 2-D (rows, sequence length); presumably needed because every
# index is wrapped in its own one-element list -- confirm.
x = x.reshape((x.shape[0], x.shape[1]))

# Row positions of each of the five sentiment classes.
class_rows = [np.flatnonzero(y == label) for label in range(5)]
min_length = min(len(rows) for rows in class_rows)
print(min_length)

tr_ratio = 0.8
tr_l = int(tr_ratio*min_length)

# Balanced split: draw min_length rows from every class, then cut each draw
# into a train and a test part. Fixes over the previous loop:
#   * rows are drawn from the whole class, not only from its first min_length rows;
#   * test features and both label arrays are kept instead of being discarded;
#   * x_train no longer starts with a spurious all-zero row.
train_rows = []
test_rows = []
for rows in class_rows:
    picked = np.random.permutation(rows)[:min_length]
    train_rows.append(picked[:tr_l])
    test_rows.append(picked[tr_l:])
train_rows = np.concatenate(train_rows)
test_rows = np.concatenate(test_rows)

x_train, y_train = x[train_rows], y[train_rows]
x_test, y_test = x[test_rows], y[test_rows]

7072


In [166]:
# Row positions whose sentiment label is 0.
np.argwhere(y==0)

array([[   101],
       [   103],
       [   157],
       ..., 
       [155970],
       [155971],
       [155973]])

In [168]:
from keras.preprocessing import sequence

# Pad / truncate every encoded phrase to a length of 30, then collapse to 2-D.
x = sequence.pad_sequences(x, 30)
x = x.reshape((x.shape[0], x.shape[1]))

# How many label-2 rows must go for class 2 to match the size of class 3.
to_remove = len(x[y==2]) - len(x[y==3])
print(to_remove)

# NOTE: despite its name, ind_class3 holds the row positions of label 2.
ind_class3 = np.argwhere(y==2)

print(ind_class3)

# Shuffle those positions so the rows to drop are chosen at random.
ind_rm = np.random.permutation(ind_class3)

print(ind_rm)

# Drop the first to_remove shuffled positions from both features and labels.
x_rm = np.delete(x, ind_rm[:to_remove], 0)
y_rm = np.delete(y, ind_rm[:to_remove], 0)

# Remaining number of rows per class, labels 0 to 4.
print(len(x_rm[y_rm==0]), len(x_rm[y_rm==1]), len(x_rm[y_rm==2]), len(x_rm[y_rm==3]), len(x_rm[y_rm==4]))

46655
[[     1]
 [     2]
 [     3]
 ..., 
 [156055]
 [156058]
 [156059]]
[[127810]
 [138758]
 [ 21646]
 ..., 
 [ 51183]
 [ 48928]
 [ 17063]]
7072 27273 32927 32927 9206


In [169]:
from sklearn.utils import class_weight

# 'balanced' class weights for the labels in y.
# `classes` and `y` are passed by keyword: recent scikit-learn releases
# reject them as positional arguments.
class_w = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
print(class_w)

[ 4.41346154  1.14442856  0.39219924  0.94791508  3.39039757]


In [None]:
x = np.array(full["Dict values"])
y = np.array(full["Sentiment"])

# Binary class: sentiments up to 2 become 0, sentiments above 2 become 1.
y[y<=2]=0
y[y>2]=1

# Same labels built directly from the frame, used as a sanity check.
y_2 = np.array(full.Sentiment >=3)

# Number of disagreements between the two encodings (expected: 0).
print(sum(y!=y_2))

t_ratio = 0.8
tr_length = int(t_ratio*x.shape[0])

# Add randomization here
x_train = x[:tr_length]
x_test = x[tr_length:]
# Labels are sliced from y; they were previously sliced from x, which made
# the targets a copy of the features.
y_train = y[:tr_length]
y_test = y[tr_length:]

In [None]:
# Print the first ten phrases labelled 2, each preceded by a blank line.
# A plain loop replaces Series.apply, which was used only for its print side
# effect and left a Series of None as the cell output.
for phrase in full.loc[full.Sentiment == 2, "Phrase"].iloc[:10]:
    print("\n" + phrase)

#print("-------------------------------------------")
#full[full.Sentiment == 3]["Phrase"].iloc[:5].apply(print)

In [None]:
# Number of phrases per sentiment class, as a one-column frame named 'Count'.
# size() gives one count per class whatever columns `full` carries. The
# previous count() + drop('Phrase') + single-name rename raised a length
# mismatch as soon as more than one column was left after the drop.
ratios = full.groupby('Sentiment').size().to_frame('Count')
ratios

In [None]:
# Boolean label: True where Sentiment is 3 or more, False otherwise.
full["Logical Sentiment"] = full.Sentiment >= 3

In [None]:
# Preview the frame, including the boolean 'Logical Sentiment' column.
full.head()