In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("data/train.tsv",sep='\t',index_col ='PhraseId' )

print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [3]:
test = pd.read_csv('data/test.tsv',sep='\t',index_col='PhraseId')

print(test.shape)
train.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


## Preprocessing

In [5]:
train['Phrase(origin)'] = train['Phrase'].copy()

print(train.shape)
train[['Phrase','Phrase(origin)']].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [8]:
test['Phrase(origin)'] = test['Phrase'].copy()

print(train.shape)
test[['Phrase','Phrase(origin)']].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


## Clean Test

In [7]:
def clean_text(phrase):
    phrase = phrase.replace("doesn't","does not")
    phrase = phrase.replace("ca n't","can not")
    phrase = phrase.replace(" n't "," not ")
    
    return phrase

train['Phrase'] = train["Phrase"].apply(clean_text)

print(train.shape)
train[['Phrase','Phrase(origin)']].head()
    

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [9]:
test['Phrase'] = test['Phrase'].apply(clean_text)

print(test.shape)
test.head()

(66292, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
156061,8545,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,8545,An,An
156064,8545,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


## Vectorize phrases

In [99]:
from sklearn.feature_extraction.text import CountVectorizer
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# binary=True/False
# lowercase=True/False
# ngram_range=(1, 1)
# stop_words=None

vectorizer = CountVectorizer(max_features=20000,ngram_range=(1,2))
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [100]:
vectorizer.fit(train['Phrase'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [123]:
aa = vectorizer.get_feature_names()
aa.index('diappointments')

# help(list)

ValueError: 'diappointments' is not in list

In [101]:
x_train = vectorizer.transform(train['Phrase'])
x_train

<156060x20000 sparse matrix of type '<class 'numpy.int64'>'
	with 1322766 stored elements in Compressed Sparse Row format>

In [42]:
words_list = []
vocabulary = vectorizer.get_feature_names()

for i in range(x_train.shape[0])
    vector = x_train[i].toarray().reshape(-1)
    vocabulary_pairs = list(zip(vocabulary, vector))

    words = [word for word, count in vocabulary_pairs if count != 0]
    
    words_list.append(words)
    
words_list

[['also',
  'amounts',
  'amounts to',
  'but',
  'but none',
  'for',
  'for the',
  'good',
  'good for',
  'goose',
  'is',
  'is also',
  'is good',
  'much',
  'much of',
  'none',
  'none of',
  'occasionally',
  'of',
  'of story',
  'of which',
  'series',
  'series of',
  'some',
  'some of',
  'story',
  'that',
  'the',
  'to',
  'to much',
  'what',
  'what is',
  'which'],
 ['for',
  'for the',
  'good',
  'good for',
  'goose',
  'is',
  'is good',
  'of',
  'series',
  'series of',
  'that',
  'the',
  'what',
  'what is'],
 ['series'],
 [],
 ['series'],
 ['for',
  'for the',
  'good',
  'good for',
  'goose',
  'is',
  'is good',
  'of',
  'that',
  'the',
  'what',
  'what is'],
 ['of'],
 ['for',
  'for the',
  'good',
  'good for',
  'goose',
  'is',
  'is good',
  'that',
  'the',
  'what',
  'what is'],
 [],
 ['for',
  'for the',
  'good',
  'good for',
  'goose',
  'is',
  'is good',
  'that',
  'the',
  'what',
  'what is'],
 ['the'],
 [],
 ['the'],
 ['the'],
 [],

In [43]:
x_test = vectorizer.transform(test['Phrase'])
x_test

<66292x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 421216 stored elements in Compressed Sparse Row format>

In [44]:
y_train=train['Sentiment']

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

## Score

In [45]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(random_state=37)
model

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [47]:
from sklearn.cross_validation import cross_val_predict

y_predict = cross_val_predict(model,x_train,y_train,cv=5)



In [50]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_train,y_predict)
print("Score = {0:.5f}".format(score))

Score = 0.57656


In [84]:
import numpy as np

result = train.copy()
result['Sentiment(predict)'] = y_predict
result['Difference(Phrase)'] = np.abs(y_train - y_predict)

print(result.shape)
result.head()

(156060, 6)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,A series of escapades demonstrating the adage ...,1,A series of escapades demonstrating the adage ...,1,0
2,1,A series of escapades demonstrating the adage ...,2,A series of escapades demonstrating the adage ...,2,0
3,1,A series,2,A series,2,0
4,1,A,2,A,2,0
5,1,series,2,series,2,0


In [56]:
sentiment = result.groupby("SentenceId")["Difference(Phrase)"].mean()
print(sentiment.shape)
sentiment.head()

(8529,)


SentenceId
1    0.222222
2    0.444444
3    0.314286
4    0.675000
5    0.800000
Name: Difference(Phrase), dtype: float64

In [97]:
def find_sentiment(sentence_id):
    return sentiment.loc[sentence_id]

def find_words(phraseId_id):
    return words_list[phraseId_id-1]

result["Difference(Sentence)"] = result["SentenceId"].apply(find_sentiment)
result["Idx"] = result.index
result["Keywords"] = result['Idx'].apply(find_words)
result = result.sort_values(by="Difference(Sentence)",ascending=False)

print(result.shape)
result.head()

(156060, 9)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase),Difference(Sentence),Idx,Keywords
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
82700,4270,one of the biggest disappointments of the year,1,one of the biggest disappointments of the year,4,3,2.625,82700,"[biggest, of, of the, one, one of, the, the bi..."
82703,4270,the biggest disappointments,1,the biggest disappointments,2,1,2.625,82703,"[biggest, the, the biggest]"
82702,4270,the biggest disappointments of the year,1,the biggest disappointments of the year,2,1,2.625,82702,"[biggest, of, of the, the, the biggest, the ye..."
82701,4270,of the biggest disappointments of the year,0,of the biggest disappointments of the year,2,2,2.625,82701,"[biggest, of, of the, the, the biggest, the ye..."
82699,4270,is one of the biggest disappointments of the year,0,is one of the biggest disappointments of the year,4,4,2.625,82699,"[biggest, is, is one, of, of the, one, one of,..."


In [96]:
words_list[4270]

['diversion',
 'eight',
 'eight legged',
 'entertaining',
 'freaks',
 'legged',
 'legged freaks',
 'makes',
 'perfectly',
 'summer']

In [98]:
result[0:1000].to_csv("result.csv")

In [78]:
vocabulary = vectorizer.get_feature_names()
vocabulary[0:3]

['000', '10', '10 minutes']

In [79]:
pd.DataFrame(vocabulary,columns=["word"]).to_csv("vocabulary.csv")

In [80]:
result[result['Phrase'].str.contains("can not recommend")]

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase),Difference(Sentence),Keywords
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
79349,4087,I can not recommend it .,0,I ca n't recommend it .,2,2,2.0,"[an, an artist, artist, artist who, is, is sim..."
80730,4158,can not recommend it enough,4,ca n't recommend it enough,2,2,2.0,"[all, enveloping, experience, movie, movie exp..."
79350,4087,can not recommend it .,0,ca n't recommend it .,2,2,2.0,"[an, an artist, artist, artist who, is, is sim..."
80729,4158,can not recommend it enough .,4,ca n't recommend it enough .,2,2,2.0,"[all, enveloping, experience, movie, movie exp..."
80728,4158,simply can not recommend it enough .,4,simply ca n't recommend it enough .,0,4,2.0,"[all, enveloping, experience, movie, movie exp..."
80727,4158,I simply can not recommend it enough .,3,I simply ca n't recommend it enough .,0,3,2.0,"[all, enveloping, experience, movie, movie exp..."
22224,998,"I admire it and yet can not recommend it , bec...",1,"I admire it and yet can not recommend it , bec...",3,2,1.058824,[horror]
22230,998,"can not recommend it , because it overstays it...",1,"can not recommend it , because it overstays it...",3,2,1.058824,[horror]
22226,998,"admire it and yet can not recommend it , becau...",2,"admire it and yet can not recommend it , becau...",3,1,1.058824,[horror]
22225,998,"admire it and yet can not recommend it , becau...",1,"admire it and yet can not recommend it , becau...",3,2,1.058824,[horror]
