In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB as MNB
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Read the tab-separated train and test files from the working directory,
# one DataFrame per file.
X_train, X_test = (
    pd.read_csv(tsv_path, sep="\t") for tsv_path in ("train.tsv", "test.tsv")
)

In [4]:
# Sanity check of the training frame: (rows, columns), then the column names.
print(X_train.shape, X_train.columns.tolist(), sep="\n")

(156060, 4)
['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']


In [5]:
# Distinct values found in the Sentiment column of the training data.
categories = pd.unique(X_train['Sentiment'])

In [6]:
# Learn a bag-of-words vocabulary from the training phrases (default
# CountVectorizer settings) and encode every phrase as a row of token counts.
train_phrases = X_train['Phrase']
count_vector = CountVectorizer()
X_train_count = count_vector.fit_transform(train_phrases)

In [7]:
# Dimensions of the count matrix: one row per phrase, one column per vocabulary term.
n_phrases, n_terms = X_train_count.shape
print((n_phrases, n_terms))

(156060, 15240)


In [8]:
# Show the stored (row, column) -> count entries for the first training phrase.
first_phrase_counts = X_train_count[0]
print(first_phrase_counts)

  (0, 12857)	1
  (0, 8807)	1
  (0, 13681)	1
  (0, 593)	1
  (0, 9085)	1
  (0, 1879)	1
  (0, 602)	1
  (0, 9204)	1
  (0, 14888)	2
  (0, 12424)	1
  (0, 5595)	1
  (0, 529)	1
  (0, 5837)	1
  (0, 5323)	2
  (0, 5821)	2
  (0, 7217)	2
  (0, 14871)	1
  (0, 13503)	1
  (0, 288)	1
  (0, 13505)	3
  (0, 3490)	1
  (0, 4577)	1
  (0, 9227)	4
  (0, 11837)	1


In [9]:
# Column index that the fitted vectorizer assigned to the token "movie".
count_vector.vocabulary_.get('movie')

8791

In [10]:
# Print the vectorizer's configuration as reported by its string form.
print('{}'.format(count_vector))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [11]:
# List every token in the learned vocabulary.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; prefer the new method when it exists and
# fall back to the old one so this cell runs on either side of that change.
# NOTE(review): this displays the whole vocabulary (thousands of entries);
# consider slicing, e.g. feature_names[:20], if only a preview is needed.
if hasattr(count_vector, 'get_feature_names_out'):
    # The new API returns an array; convert so the display stays a plain list.
    feature_names = list(count_vector.get_feature_names_out())
else:
    feature_names = count_vector.get_feature_names()
feature_names

['000',
 '10',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '10th',
 '11',
 '110',
 '112',
 '12',
 '120',
 '127',
 '129',
 '12th',
 '13',
 '13th',
 '14',
 '140',
 '146',
 '15',
 '15th',
 '16',
 '163',
 '168',
 '170',
 '1790',
 '18',
 '1899',
 '19',
 '1915',
 '1920',
 '1930s',
 '1933',
 '1937',
 '1938',
 '1940s',
 '1950',
 '1950s',
 '1952',
 '1953',
 '1957',
 '1958',
 '1959',
 '1960',
 '1960s',
 '1962',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1975',
 '1979',
 '1980',
 '1980s',
 '1984',
 '1986',
 '1987',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1997',
 '1998',
 '1999',
 '19th',
 '20',
 '2000',
 '2001',
 '2002',
 '20th',
 '21',
 '21st',
 '22',
 '24',
 '2455',
 '25',
 '26',
 '270',
 '295',
 '30',
 '300',
 '3000',
 '30s',
 '37',
 '3d',
 '40',
 '40s',
 '42',
 '451',
 '48',
 '4ever',
 '4th',
 '4w',
 '50',
 '500',
 '50s',
 '51',
 '51st',
 '52',
 '53',
 '5ths',
 '60',
 '60s',
 '65',
 '65th',
 '66',
 '70',
 '70s',
 '71',
 '72',
 '75',
 '77',
 '78',
 '7th',
 '8

In [12]:
# Term-frequency-only transformer (IDF re-weighting switched off), fitted on
# the training count matrix.
tf_only = TfidfTransformer(use_idf=False)
tf_transformer = tf_only.fit(X_train_count)

In [13]:
# Apply the fitted term-frequency transformer to the training counts.
X_train_tf = tf_transformer.transform(X=X_train_count)

In [14]:
# TF-IDF transformer; use_idf=True is the library default, spelled out here to
# contrast with the TF-only transformer created earlier.
tfidf_transformer = TfidfTransformer(use_idf=True)

In [15]:
# Fit the IDF weights on the training counts, then re-weight those same counts.
X_train_tfidf = tfidf_transformer.fit(X_train_count).transform(X_train_count)

In [17]:
# Compare several classifiers on the TF-IDF features using cross-validated accuracy.
# NOTE(review): the TF-IDF weights were fitted on the full training set before
# the folds are made, and phrases that share a SentenceId can land in different
# folds, so these scores are probably optimistic -- consider wrapping the
# vectorizer in a Pipeline and splitting by SentenceId. TODO confirm.
CV = 2  # number of cross-validation folds
y_train = X_train['Sentiment']
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    # Seeded like the other estimators: LinearSVC's solver draws on a random
    # number generator, so an unseeded run is not guaranteed to be repeatable.
    LinearSVC(random_state=0),
    MNB(),
    LogisticRegression(random_state=0),
]

for model in models:
    # One accuracy value per fold.
    accuracy = cross_val_score(model, X_train_tfidf, y_train,
                               scoring='accuracy', cv=CV)
    print(str(model),":",accuracy)
    print("\n\n")

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False) : [0.50993836 0.50995143]



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0) : [0.58038472 0.57417114]



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) : [0.55471543 0.55223058]



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          ve

In [18]:
# Fit the two classifiers used for the submissions on the complete TF-IDF
# training matrix.
train_labels = X_train['Sentiment']
classf = MNB().fit(X_train_tfidf, train_labels)
# random_state pins LinearSVC's internal random number generator so that
# refitting on the same data reproduces the same model.
LSVC = LinearSVC(random_state=0).fit(X_train_tfidf, train_labels)

In [19]:
# Encode the test phrases with the vocabulary already learned from the
# training set (transform only; the vectorizer is not refitted here).
test_phrases = X_test['Phrase']
X_test_counts = count_vector.transform(test_phrases)

In [20]:
# Re-weight the test counts with the TF-IDF transformer fitted on the training data.
X_test_tfidf = tfidf_transformer.transform(X=X_test_counts)

In [21]:
# Predict a sentiment label for every test phrase: Naive Bayes first, then LinearSVC.
predicted, predicted_LSVC = (
    fitted_model.predict(X_test_tfidf) for fitted_model in (classf, LSVC)
)

In [59]:
# Write the Naive Bayes predictions as a two-column CSV (PhraseId,Sentiment).
# The previous hand-built '{}, {}' row format put a space after the comma, so
# every Sentiment field started with a blank and did not match the header's
# layout; letting pandas serialise the rows keeps header and data consistent.
submission_nb = X_test[['PhraseId']].assign(Sentiment=predicted)
submission_nb.to_csv('Sentiment_Naive_Bayes2.csv', index=False)

In [23]:
# Write the LinearSVC predictions as a two-column CSV (PhraseId,Sentiment).
# The previous hand-built '{}, {}' row format put a space after the comma, so
# every Sentiment field started with a blank and did not match the header's
# layout; letting pandas serialise the rows keeps header and data consistent.
submission_lsvc = X_test[['PhraseId']].assign(Sentiment=predicted_LSVC)
submission_lsvc.to_csv('Sentiment_LSVC.csv', index=False)