In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

In [2]:
# Tab-separated training data, located relative to the notebook.
tr_f = './Data/train.tsv'
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 reproduces its default of using the first column
# (PhraseId) as the index. from_csv also defaulted to parse_dates=True, which
# is not needed here because the index holds integer ids.
train = pd.read_csv(tr_f, sep='\t', index_col=0)

In [3]:
# Show every phrase row that belongs to sentence 2.
train.loc[train["SentenceId"] == 2]

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
64,2,"This quiet , introspective and entertaining in...",4
65,2,"This quiet , introspective and entertaining in...",3
66,2,This,2
67,2,"quiet , introspective and entertaining indepen...",4
68,2,"quiet , introspective and entertaining",3
69,2,quiet,2
70,2,", introspective and entertaining",3
71,2,introspective and entertaining,3
72,2,introspective and,3
73,2,introspective,2


In [4]:
# NLTK's English stop-word list; read as a global by tokenize_stopwords.
english_sw = stopwords.words('english')

In [9]:
from nltk.tokenize import RegexpTokenizer

def tokenize_stopwords(df):
    """Tokenize one row's phrase into lowercase words with stop words removed.

    Parameters
    ----------
    df : mapping-like
        A single row (as passed by ``DataFrame.apply(..., axis=1)``) exposing
        the raw text under the ``'Phrase'`` key.

    Returns
    -------
    list of str
        Lowercased word tokens in their original order, excluding every entry
        of the global ``english_sw`` list and the tokens ``'rrb'``/``'lrb'``.
    """
    # Build the exclusion set once per call. The original concatenated a new
    # list and scanned it linearly for every single token.
    # 'rrb'/'lrb' are presumably leftovers of -RRB-/-LRB- bracket placeholders
    # in the corpus -- TODO confirm against the raw data.
    excluded = set(english_sw)
    excluded.update(('rrb', 'lrb'))

    # Matching runs of word characters drops punctuation as a side effect.
    tokenizer = RegexpTokenizer(r'\w+')
    lowered = (token.lower() for token in tokenizer.tokenize(df['Phrase']))
    return [token for token in lowered if token not in excluded]

def keep_first(group):
    """Reduce a group of rows to the Phrase and Sentiment of its first row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain "Phrase" and "Sentiment" columns and
        at least one row.

    Returns
    -------
    pandas.Series
        Two entries, "Phrase" and "Sentiment", taken positionally from row 0.
    """
    first_values = {}
    for column in ("Phrase", "Sentiment"):
        first_values[column] = group[column].iloc[0]
    return pd.Series(first_values)

In [10]:
# Collapse each SentenceId group to its first row via keep_first. In this
# dataset the first row of a group appears to be the complete sentence.
# The work is done on a copy, so `train` itself is left untouched.
full = train.copy().groupby("SentenceId").apply(keep_first)

In [11]:
# Preview the first rows: one row per SentenceId with its Phrase and Sentiment.
full.head()

Unnamed: 0_level_0,Phrase,Sentiment
SentenceId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,1
2,"This quiet , introspective and entertaining in...",4
3,"Even fans of Ismail Merchant 's work , I suspe...",1
4,A positively thrilling combination of ethnogra...,3
5,Aggressive self-glorification and a manipulati...,1


In [12]:
# Row-wise apply (axis=1): each row is passed to tokenize_stopwords and the
# returned token list is stored in the new 'Phrase tokenized' column.
full['Phrase tokenized'] = full.apply(tokenize_stopwords, axis=1)

In [13]:
# Preview the frame again, now including the 'Phrase tokenized' column.
full.head()

Unnamed: 0_level_0,Phrase,Sentiment,Phrase tokenized
SentenceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,A series of escapades demonstrating the adage ...,1,"[series, escapades, demonstrating, adage, good..."
2,"This quiet , introspective and entertaining in...",4,"[quiet, introspective, entertaining, independe..."
3,"Even fans of Ismail Merchant 's work , I suspe...",1,"[even, fans, ismail, merchant, work, suspect, ..."
4,A positively thrilling combination of ethnogra...,3,"[positively, thrilling, combination, ethnograp..."
5,Aggressive self-glorification and a manipulati...,1,"[aggressive, self, glorification, manipulative..."


In [14]:
# Show the complete token list of the first sentence (head() truncates it).
full['Phrase tokenized'].iloc[0]

['series',
 'escapades',
 'demonstrating',
 'adage',
 'good',
 'goose',
 'also',
 'good',
 'gander',
 'occasionally',
 'amuses',
 'none',
 'amounts',
 'much',
 'story']

In [15]:
# Number of rows in `full`, i.e. one per SentenceId group.
full.shape[0]

8529

### Create a dictionary of unique words, ordered from most to least frequent

In [16]:
# Collect every distinct token in first-seen order.
# A companion set makes the membership test constant-time. The original
# tested `word not in uniques` against the growing list, which is quadratic
# in the vocabulary size.
uniques = []
seen_words = set()
for tokens in full['Phrase tokenized']:
    for word in tokens:
        if word not in seen_words:
            seen_words.add(word)
            uniques.append(word)
print(len(uniques))

15065


In [None]:
# Build a (count, word) list by brute force.
# WARNING: do not run this cell. For every entry in `uniques` it rescans every
# token of every sentence, so the work grows with len(uniques) * total tokens.
# collections.Counter produces the same tallies in a single pass.
counts = []
for unique in uniques:
    count = 0              # Initialize the count to zero.
    for i in range(full.shape[0]):
        for word in full['Phrase tokenized'].iloc[i]:     # Iterate over the words.
            if word == unique:   # Is this word equal to the current unique?
                count += 1         # If so, increment the count.
    counts.append((count, unique))
    
# Ascending sort on (count, word), then reverse() puts the most frequent first.
counts.sort()
counts.reverse()

In [17]:
from collections import Counter
# Flatten the per-sentence token lists into one list (sentence order, then
# token order), then tally how often each word occurs.
words = [word for tokens in full['Phrase tokenized'] for word in tokens]
dic1 = Counter(words)
# Size of the vocabulary (number of distinct words).
print(len(dic1))

15065


In [23]:
import operator
# (word, count) pairs ordered by descending count. sorted() is stable, so
# words with equal counts keep the order they have in dic1.
sorted_words = sorted(dic1.items(), reverse=True, key=operator.itemgetter(1))
maxDictLength = len(dic1)
# Map each word to an integer id by frequency rank: the most frequent word
# gets 3, the next 4, and so on. Ids 0-2 are left unused here -- presumably
# reserved for special tokens (e.g. padding/unknown); TODO confirm.
word_dict = {
    word: word_id
    for word_id, (word, _count) in enumerate(sorted_words, start=3)
}

In [20]:
def words_to_dict(row):
    """Translate a row's tokens into their integer ids.

    Parameters
    ----------
    row : mapping-like
        A single row exposing a list of tokens under "Phrase tokenized".

    Returns
    -------
    list
        The value stored in the global ``word_dict`` for each token, in the
        same order as the tokens.

    Raises
    ------
    KeyError
        If a token is not a key of ``word_dict``.
    """
    ids = []
    for token in row["Phrase tokenized"]:
        ids.append(word_dict[token])
    return ids

In [21]:
# Row-wise apply (axis=1): store each sentence's list of word ids, as returned
# by words_to_dict, in the new "Dict values" column.
full["Dict values"] = full.apply(words_to_dict, axis=1)

In [22]:
# Preview the frame again, now including the "Dict values" column.
full.head()

Unnamed: 0_level_0,Phrase,Sentiment,Phrase tokenized,Dict values
SentenceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,A series of escapades demonstrating the adage ...,1,"[series, escapades, demonstrating, adage, good...","[217, 9501, 5158, 4390, 6, 3427, 55, 6, 10700,..."
2,"This quiet , introspective and entertaining in...",4,"[quiet, introspective, entertaining, independe...","[551, 4992, 90, 3258, 116, 1947]"
3,"Even fans of Ismail Merchant 's work , I suspe...",1,"[even, fans, ismail, merchant, work, suspect, ...","[9, 191, 10161, 2433, 23, 2751, 19, 67, 10, 87..."
4,A positively thrilling combination of ethnogra...,3,"[positively, thrilling, combination, ethnograp...","[2827, 3727, 1277, 11224, 1289, 3813, 13408, 6..."
5,Aggressive self-glorification and a manipulati...,1,"[aggressive, self, glorification, manipulative...","[4674, 68, 7383, 905, 8581]"


In [None]:
# Print the first ten sentences whose Sentiment is 2, each preceded by a blank
# line. A plain loop replaces Series.apply(print): apply was used only for its
# side effect and additionally rendered a Series of None as the cell output.
for phrase in full.loc[full["Sentiment"] == 2, "Phrase"].iloc[:10]:
    print("\n" + phrase)

#print("-------------------------------------------")
#full[full.Sentiment == 3]["Phrase"].iloc[:5].apply(print)

In [None]:
# Number of sentences per Sentiment class, as a one-column frame named 'Count'.
# groupby(...).size() yields exactly one count per class no matter how many
# other columns `full` carries. The previous count() / drop('Phrase') / rename
# sequence required exactly one remaining column; with both 'Phrase tokenized'
# and 'Dict values' present, assigning the single name 'Count' raises a
# length-mismatch ValueError.
ratios = full.groupby('Sentiment').size().to_frame('Count')
ratios

In [None]:
# Add a boolean column: True where Sentiment is 3 or higher, False otherwise.
full["Logical Sentiment"] = full.Sentiment >= 3

In [None]:
# Preview the frame again, now including the "Logical Sentiment" column.
full.head()