In [1]:
import pandas as pd
import numpy as np

from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer


from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
# Locations of the competition files (relative paths, resolved against the
# working directory the notebook is run from).
train_path, test_path, sub_path = (
    'dataset/train.tsv',
    'dataset/test.tsv',
    'dataset/sampleSubmission.csv',
)

# The phrase files are tab-separated; the sample submission is comma-separated.
train, test = (pd.read_csv(path, sep='\t') for path in (train_path, test_path))
sub = pd.read_csv(sub_path, sep=',')

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
# Display every training row whose SentenceId is 2.
train[train['SentenceId'].eq(2)]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
63,64,2,"This quiet , introspective and entertaining in...",4
64,65,2,"This quiet , introspective and entertaining in...",3
65,66,2,This,2
66,67,2,"quiet , introspective and entertaining indepen...",4
67,68,2,"quiet , introspective and entertaining",3
68,69,2,quiet,2
69,70,2,", introspective and entertaining",3
70,71,2,introspective and entertaining,3
71,72,2,introspective and,3
72,73,2,introspective,2


## Check data information

In [4]:
# Count the (non-null) phrases attached to each sentence id, then report the
# mean of those counts for both splits.
train_phrases_per_sentence = train.groupby('SentenceId')['Phrase'].count()
test_phrases_per_sentence = test.groupby('SentenceId')['Phrase'].count()

print(
    'Average count of phrases per sentence in train is {0:.0f}.'.format(
        train_phrases_per_sentence.mean()
    )
)
print(
    'Average count of phrases per sentence in test is {0:.0f}.'.format(
        test_phrases_per_sentence.mean()
    )
)

Average count of phrases per sentence in train is 18.
Average count of phrases per sentence in test is 20.


In [5]:
# Size of each split: number of phrase rows versus number of distinct sentence ids.
train_rows, train_sentences = len(train), len(train['SentenceId'].unique())
test_rows, test_sentences = len(test), len(test['SentenceId'].unique())

print(
    'Number of phrases in train: {}. Number of sentences in train: {}.'.format(
        train_rows, train_sentences
    )
)
print(
    'Number of phrases in test: {}. Number of sentences in test: {}.'.format(
        test_rows, test_sentences
    )
)

Number of phrases in train: 156060. Number of sentences in train: 8529.
Number of phrases in test: 66292. Number of sentences in test: 3310.


In [6]:
# Number of whitespace-separated tokens in each phrase; the printed figure is
# the mean of these per-phrase token counts.
train_word_counts = train['Phrase'].map(lambda phrase: len(phrase.split()))
test_word_counts = test['Phrase'].map(lambda phrase: len(phrase.split()))

print(
    'Average word length of phrases in train is {0:.0f}.'.format(
        np.mean(train_word_counts)
    )
)
print(
    'Average word length of phrases in test is {0:.0f}.'.format(
        np.mean(test_word_counts)
    )
)

Average word length of phrases in train is 7.
Average word length of phrases in test is 7.


## Frequency of N-gram words

In [7]:
# Five most frequent word trigrams over all phrases labelled Sentiment == 4.
# Tokens come from a plain whitespace split, so stop words and punctuation
# tokens are kept.

is_top_sentiment = train['Sentiment'] == 4
text = ' '.join(train.loc[is_top_sentiment, 'Phrase'].values)
text_trigrams = list(ngrams(text.split(), 3))
Counter(text_trigrams).most_common(5)

[(('one', 'of', 'the'), 199),
 (('of', 'the', 'year'), 103),
 (('.', 'is', 'a'), 87),
 (('of', 'the', 'best'), 80),
 (('of', 'the', 'most'), 70)]

In [8]:
# Five most frequent word trigrams over phrases labelled Sentiment == 4,
# after removing English stop words.

text = ' '.join(train.loc[train.Sentiment == 4, 'Phrase'].values)

# Build the stop-word collection once, as a set. The previous version called
# stopwords.words('english') inside the comprehension, i.e. once per token,
# and then did a linear membership scan over the returned list each time.
# The filtered tokens (and therefore the trigram counts) are unchanged.
english_stopwords = set(stopwords.words('english'))

# Membership is checked on the token exactly as written (no lower-casing).
text = [i for i in text.split() if i not in english_stopwords]
text_trigrams = [i for i in ngrams(text, 3)]
Counter(text_trigrams).most_common(5)

[((',', 'funny', ','), 33),
 (('one', 'year', "'s"), 28),
 (('year', "'s", 'best'), 26),
 (('movies', 'ever', 'made'), 19),
 ((',', 'solid', 'cast'), 19)]

# Preprocessing and modelling

In [9]:
# TF-IDF features over unigrams and bigrams, with tokens produced by NLTK's
# TweetTokenizer instead of the vectorizer's built-in tokenisation.
tokenizer = TweetTokenizer()
vectorizer = TfidfVectorizer(tokenizer=tokenizer.tokenize, ngram_range=(1, 2))

# The vectorizer is fitted on the phrases of both splits (text only, no labels),
# then each split is transformed separately.
full_text = [*train['Phrase'].values, *test['Phrase'].values]
vectorizer.fit(full_text)
train_vectorized, test_vectorized = (
    vectorizer.transform(frame['Phrase']) for frame in (train, test)
)

# Target labels for the training split.
y = train['Sentiment']

# One-vs-rest wrapper around a logistic regression with default settings.
logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)



In [10]:
%%time
# Fit the one-vs-rest logistic regression on the full TF-IDF training matrix.
# The %%time cell magic reports how long the fit takes; the fitted estimator's
# repr is displayed because the call is the last expression in the cell.
ovr.fit(train_vectorized, y)



CPU times: user 13.3 s, sys: 504 ms, total: 13.8 s
Wall time: 14.7 s


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [11]:
# 3-fold cross-validated accuracy of the one-vs-rest logistic regression
# (n_jobs=-1 lets scikit-learn run the folds in parallel).
scores = cross_val_score(
    ovr,
    train_vectorized,
    y,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
mean_accuracy, accuracy_std = scores.mean() * 100, scores.std() * 100
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(mean_accuracy, accuracy_std))

Cross-validation mean accuracy 56.55%, std 0.07.


In [12]:
%%time
svc = LinearSVC(dual=False)
scores = cross_val_score(svc, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 56.51%, std 0.68.
CPU times: user 97.1 ms, sys: 61.9 ms, total: 159 ms
Wall time: 45.7 s


In [13]:
# Fit both classifiers on the complete training matrix. cross_val_score works
# on clones, so the estimators themselves still need an explicit fit.
# The loop produces no cell output.
for estimator in (ovr, svc):
    estimator.fit(train_vectorized, y)



# Predict the competition test data

In [15]:

# Submission step (currently disabled). Uncomment the three statements below to
# predict sentiments for the vectorized test phrases with the fitted
# one-vs-rest logistic regression, store them in the sample-submission frame,
# and write that frame to pred_Logistic.csv without the index column.

# pred = ovr.predict(test_vectorized)

# sub['Sentiment'] = pred

# sub.to_csv("pred_Logistic.csv", index=False)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)