In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

In [2]:
# Path to the tab-separated training file.
tr_f = './Data/train.tsv'
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 reproduces its default of using the first
# column (PhraseId) as the index.
train = pd.read_csv(tr_f, sep='\t', index_col=0)

In [3]:
# Show every phrase row whose SentenceId is 2.
sentence_two_mask = train["SentenceId"] == 2
train.loc[sentence_two_mask]

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
64,2,"This quiet , introspective and entertaining in...",4
65,2,"This quiet , introspective and entertaining in...",3
66,2,This,2
67,2,"quiet , introspective and entertaining indepen...",4
68,2,"quiet , introspective and entertaining",3
69,2,quiet,2
70,2,", introspective and entertaining",3
71,2,introspective and entertaining,3
72,2,introspective and,3
73,2,introspective,2


In [5]:
# Stop-word filtering is currently switched off: the NLTK English list below
# is commented out and an empty list is used instead, so tokenize_stopwords
# only drops the hard-coded 'rrb' / 'lrb' tokens.
#english_sw = stopwords.words('english')
english_sw = []

In [6]:
from nltk.tokenize import RegexpTokenizer

def tokenize_stopwords(df):
    """Tokenize one row's phrase into lowercase word tokens.

    Parameters
    ----------
    df : mapping-like row
        A single row (as passed by ``DataFrame.apply(..., axis=1)``) that
        provides the raw text under the ``'Phrase'`` key.

    Returns
    -------
    list of str
        The lowercased runs of word characters found in the phrase, in
        order, minus anything listed in the module-level ``english_sw`` and
        minus the tokens ``'rrb'`` and ``'lrb'`` (presumably the residue of
        -RRB- / -LRB- bracket placeholders -- TODO confirm against the data).
    """
    # Build the exclusion set once per call; the previous version rebuilt a
    # concatenated list and re-scanned it for every single token.
    excluded = set(english_sw)
    excluded.update(('rrb', 'lrb'))

    # Tokenize and remove punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    # Lowercase each token exactly once, then filter.
    lowered = (token.lower() for token in tokenizer.tokenize(df['Phrase']))
    return [token for token in lowered if token not in excluded]

def keep_first(group):
    """Reduce a group of rows to the phrase and sentiment of its first row.

    Returns a Series with the entries ``"Phrase"`` and ``"Sentiment"`` taken
    from position 0 of the corresponding columns of ``group``.
    """
    first_values = {
        column: group[column].iloc[0] for column in ("Phrase", "Sentiment")
    }
    return pd.Series(first_values)

In [7]:
# Work on a copy so the raw `train` frame stays untouched.
# The per-sentence reduction via keep_first (one row per SentenceId) is
# currently disabled below, so `full` still contains every phrase row.
full = train.copy()
#full = full.groupby("SentenceId").apply(keep_first)

In [8]:
# Preview the first rows of the working copy before any columns are added.
full.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [9]:
# Apply tokenize_stopwords row by row (axis=1) and store each resulting
# token list in a new 'Phrase tokenized' column.
full['Phrase tokenized'] = full.apply(tokenize_stopwords, axis=1)

In [10]:
# Preview the frame again to check the new 'Phrase tokenized' column.
full.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase tokenized
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,..."
2,1,A series of escapades demonstrating the adage ...,2,"[a, series, of, escapades, demonstrating, the,..."
3,1,A series,2,"[a, series]"
4,1,A,2,[a]
5,1,series,2,[series]


In [11]:
# Spot-check the token lists at positions 60 through 69.
preview_rows = slice(60, 70)
full['Phrase tokenized'].iloc[preview_rows]

PhraseId
61                                           [a, story]
62                                              [story]
63                                                   []
64    [this, quiet, introspective, and, entertaining...
65    [this, quiet, introspective, and, entertaining...
66                                               [this]
67    [quiet, introspective, and, entertaining, inde...
68            [quiet, introspective, and, entertaining]
69                                              [quiet]
70                   [introspective, and, entertaining]
Name: Phrase tokenized, dtype: object

In [12]:
# Total number of phrase rows in the working frame.
len(full)

156060

### Create a dictionary of unique words, ordered from most to least frequent

In [None]:
# Create unique list
# dict keys keep first-seen order and give constant-time membership checks;
# the previous `word not in uniques` test rescanned the whole list for every
# token, which is quadratic over the corpus.
uniques = list(dict.fromkeys(
    word for tokens in full['Phrase tokenized'] for word in tokens
))
print(len(uniques))

In [None]:
# Create count list
# The earlier version rescanned every token of every phrase once per unique
# word. A single Counter pass produces the same frequencies.
from collections import Counter

token_frequency = Counter(
    word for tokens in full['Phrase tokenized'] for word in tokens
)

# (count, word) pairs for every entry of `uniques`, most frequent first;
# ties are ordered by word in descending order, as sort() + reverse() did.
counts = sorted(
    ((token_frequency[unique], unique) for unique in uniques),
    reverse=True,
)

In [13]:
from collections import Counter

# Flatten all token lists into one list, then count occurrences per word.
words = []
for tokens in full['Phrase tokenized']:
    words.extend(tokens)
dic1 = Counter(words)
print(len(dic1))

15273


In [14]:
import operator

# (word, count) pairs from dic1, highest count first.
sorted_words = sorted(dic1.items(), key=operator.itemgetter(1), reverse=True)
maxDictLength = len(dic1)

# Map each word to an integer index: the most frequent word gets 2, the next
# gets 3, and so on.
word_dict = {
    word: index for index, (word, _count) in enumerate(sorted_words, start=2)
}

# Index used by words_to_dict for any token missing from word_dict.
# NOTE(review): index 0 is never assigned here -- presumably reserved for
# padding; confirm against the downstream model code.
oovf = 1

In [15]:
def words_to_dict(row):
    """Encode a row's tokens as dictionary indices.

    Each token in ``row["Phrase tokenized"]`` is looked up in the
    module-level ``word_dict``; tokens that are not present fall back to
    ``oovf``. Every index is wrapped in its own one-element list, so the
    result has the form ``[[i1], [i2], ...]``.
    """
    encoded = []
    for token in row["Phrase tokenized"]:
        encoded.append([word_dict.get(token, oovf)])
    return encoded

In [16]:
# Encode every row's token list with words_to_dict (row-wise, axis=1) and
# keep the result in a new "Dict values" column.
full["Dict values"] = full.apply(words_to_dict, axis=1)

In [17]:
# Preview the frame to check the new "Dict values" column.
full.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase tokenized,Dict values
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,...","[[3], [322], [4], [14244], [6060], [2], [6740]..."
2,1,A series of escapades demonstrating the adage ...,2,"[a, series, of, escapades, demonstrating, the,...","[[3], [322], [4], [14244], [6060], [2], [6740]..."
3,1,A series,2,"[a, series]","[[3], [322]]"
4,1,A,2,[a],[[3]]
5,1,series,2,[series],[[322]]


In [18]:
# Print the encoded index lists of the row at position 65.
print(full["Dict values"].iloc[65])

[[19]]


## Using a stemmer on the data

In [39]:
from nltk.stem import SnowballStemmer

def stemmer(row):
    """Stem every token of a row with the English Snowball stemmer.

    Parameters
    ----------
    row : mapping-like row
        Must provide a list of tokens under ``"Phrase tokenized"``.

    Returns
    -------
    list of str
        The stemmed tokens, in the original order.
    """
    # Build the stemmer lazily on first use and keep it on the function, so
    # a row-wise DataFrame.apply does not construct a new one for every row.
    eng_stemmer = getattr(stemmer, "_eng_stemmer", None)
    if eng_stemmer is None:
        eng_stemmer = SnowballStemmer('english')
        stemmer._eng_stemmer = eng_stemmer
    return [eng_stemmer.stem(word) for word in row["Phrase tokenized"]]

In [42]:
# Stem each row's token list (row-wise, axis=1) into a new "stemmed" column.
full["stemmed"] = full.apply(stemmer, axis=1)

In [43]:
# Preview the frame to check the new "stemmed" column.
full.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase tokenized,Dict values,lemmatized,stemmed
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,...","[[3], [322], [4], [14244], [6060], [2], [6740]...","[a, seri, of, escapad, demonstr, the, adag, th...","[a, seri, of, escapad, demonstr, the, adag, th..."
2,1,A series of escapades demonstrating the adage ...,2,"[a, series, of, escapades, demonstrating, the,...","[[3], [322], [4], [14244], [6060], [2], [6740]...","[a, seri, of, escapad, demonstr, the, adag, th...","[a, seri, of, escapad, demonstr, the, adag, th..."
3,1,A series,2,"[a, series]","[[3], [322]]","[a, seri]","[a, seri]"
4,1,A,2,[a],[[3]],[a],[a]
5,1,series,2,[series],[[322]],[seri],[seri]


In [44]:
from collections import Counter

# Count occurrences per stem across all rows.
# Note: this rebinds `words` and `dic1`, replacing the unstemmed versions.
words = []
for stems in full['stemmed']:
    words.extend(stems)
dic1 = Counter(words)
print(len(dic1))

10493


### Create Train and Test DataSets

In [None]:
# Features: one entry per phrase holding its list of dictionary indices.
# Labels: the sentiment score of each phrase.
x = np.array(full["Dict values"])
y = np.array(full["Sentiment"])

# Binary class: scores <= 2 become 0, scores > 2 become 1.
# np.array copied the column, so `full` itself is not modified.
y[y<=2]=0
y[y>2]=1

# Independent derivation of the same labels, used only as a sanity check.
y_2 = np.array(full.Sentiment >=3)

# Number of rows where the two derivations disagree.
print(np.count_nonzero(y != y_2))

# Fraction of rows that goes into the training split.
t_ratio = 0.8
tr_length = int(t_ratio*x.shape[0])

# TODO: add randomization here -- the split is currently positional, so
# phrases of the same sentence stay adjacent.
x_train = x[:tr_length]
x_test = x[tr_length:]
# Fixed: the labels were previously sliced from `x`, which made
# y_train / y_test copies of the features instead of the targets.
y_train = y[:tr_length]
y_test = y[tr_length:]

In [None]:
# Print the first ten phrases with Sentiment == 2, each preceded by a blank
# line. A plain loop replaces .apply(print), which was used only for its side
# effect and left a Series of None as the cell output. .loc avoids chained
# indexing.
for phrase in full.loc[full.Sentiment == 2, "Phrase"].iloc[:10]:
    print("\n" + phrase)

#print("-------------------------------------------")
#full[full.Sentiment == 3]["Phrase"].iloc[:5].apply(print)

In [None]:
# Number of phrase rows per sentiment class.
# groupby(...).count() returns one count column per remaining column of
# `full`, so assigning the single label ['Count'] fails with a length
# mismatch once earlier cells have added columns. size() always yields
# exactly one count per class, and no in-place mutation is needed.
ratios = full.groupby('Sentiment').size().to_frame('Count')
ratios

In [None]:
# Boolean label column: True where the sentiment score is 3 or higher.
full["Logical Sentiment"] = full.Sentiment >= 3

In [None]:
# Preview the frame to check the new "Logical Sentiment" column.
full.head()