In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

## Data Preparation

In [6]:
df = pd.read_csv('comments_rating.csv', sep='\t',index_col = False)
del df['Unnamed: 0']
df.head()

Unnamed: 0,ID,Comments,Rating
0,7202016,Cute and cozy place. Perfect location to every...,95.0
1,7202016,Kelly has a great room in a very central locat...,95.0
2,7202016,"Very spacious apartment, and in a great neighb...",95.0
3,7202016,Close to Seattle Center and all it has to offe...,95.0
4,7202016,Kelly was a great host and very accommodating ...,95.0


In [7]:
df['Rating'].describe()

count    84829.000000
mean        94.896368
std          4.084989
min         20.000000
25%         93.000000
50%         96.000000
75%         98.000000
max        100.000000
Name: Rating, dtype: float64

In [30]:
def get_class(Rating):
    if Rating >= 98.0:
        return 'positive'
    elif Rating >= 95.0 and Rating < 98.0:
        return 'neutral'
    else:
        return 'negative'

In [31]:
df['category'] = df['Rating'].apply(get_class)

In [32]:
df.head()

Unnamed: 0,ID,Comments,Rating,category
0,7202016,Cute and cozy place. Perfect location to every...,95.0,neutral
1,7202016,Kelly has a great room in a very central locat...,95.0,neutral
2,7202016,"Very spacious apartment, and in a great neighb...",95.0,neutral
3,7202016,Close to Seattle Center and all it has to offe...,95.0,neutral
4,7202016,Kelly was a great host and very accommodating ...,95.0,neutral


In [33]:
#split data into a training and a test set
df['Comments'] = df['Comments'].values.astype('U')
X_train, X_test, y_train, y_test = train_test_split(df['Comments'], df['category'], random_state=0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(63636, 34328)

## linear support vector machine (SVM)

In [34]:
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=5, random_state=42)),
 ])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.64559468250601049

## Naïve Bayes Classifier

In [35]:
#make the vectorizer => transformer => classifier easier to work with, scikit-learn provides a Pipeline class that behaves like a compound classifier:
text_clf2 = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
])

In [36]:
text_clf2 = text_clf2.fit(X_train, y_train)

In [37]:
#Evaluation of the performance on the test set
predicted2 = text_clf2.predict(X_test)
np.mean(predicted2 == y_test)

0.63602507896101446