In [None]:
# https://realpython.com/python-keras-text-classification/
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Tab-separated labelled-sentence files, keyed by a short name for where each came from.
filepath_dict = {'yelp':   'data/yelp_labelled.txt',
                 'amazon': 'data/amazon_cells_labelled.txt',
                 'imdb':   'data/imdb_labelled.txt'}

# One frame per file. `names=` supplies the column labels, so the first line of
# each file is read as data rather than as a header. Each frame is tagged with
# the key of the file it was read from.
df_list = [
    pd.read_csv(filepath, names=['sentence', 'label'], sep='\t').assign(source=source)
    for source, filepath in filepath_dict.items()
]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every file contributes its own 0..n-1 labels, so label-based lookups such as
# df.loc[0] would return one row per file instead of a single row.
df = pd.concat(df_list, ignore_index=True)
print(df.iloc[0])

In [None]:
# Restrict to the rows tagged 'yelp' and pull the text and label columns out.
is_yelp = df['source'] == 'yelp'
df_yelp = df[is_yelp]

yelp_sentences = df_yelp['sentence'].values
yelp_y = df_yelp['label'].values

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split the same on every run.
(yelp_sentences_train,
 yelp_sentences_test,
 yelp_y_train,
 yelp_y_test) = train_test_split(
    yelp_sentences, yelp_y, test_size=0.25, random_state=1000)

In [None]:
# The vocabulary is learned from the training sentences only; the test
# sentences are then encoded against that same vocabulary.
yelp_vectorizer = CountVectorizer().fit(yelp_sentences_train)

yelp_X_test = yelp_vectorizer.transform(yelp_sentences_test)
yelp_X_train = yelp_vectorizer.transform(yelp_sentences_train)

# Bare expression so the notebook displays the training matrix's repr.
yelp_X_train

In [None]:
# Fit a logistic-regression model on the Yelp training matrix, then measure
# accuracy on the held-out Yelp test matrix.
yelp_classifier = LogisticRegression().fit(yelp_X_train, yelp_y_train)

yelp_predictions = yelp_classifier.predict(yelp_X_test)
yelp_score = accuracy_score(yelp_y_test, yelp_predictions)

print("Accuracy:", yelp_score)

In [3]:
# Train and score one bag-of-words logistic-regression model per source.
#
# A single CountVectorizer instance is re-fit on every iteration, and
# `classifier` is rebound on every iteration. After the loop both names refer
# to the objects fit on the LAST source iterated, which is what any later cell
# reusing `vectorizer` / `classifier` will see.
vectorizer = CountVectorizer()
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    # Same 75/25 split and seed for every source so the scores are comparable.
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    # fit_transform learns the vocabulary and encodes the training sentences in
    # one pass; a separate fit() followed by transform() tokenises them twice
    # and yields the same matrix.
    X_train = vectorizer.fit_transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487


In [None]:
# Classify one hand-written sentence with whatever `vectorizer` and
# `classifier` are currently bound in the kernel. Both names are rebound inside
# the per-source training loops, so this uses the model that was fit last.
positive_review = ['I love this']
review_transformed = vectorizer.transform(positive_review)
review_result = classifier.predict(review_transformed)

predicted_label = review_result[0]
print(predicted_label)
# A predicted label of 1 is reported as positive; anything else as negative.
if predicted_label == 1:
    print('Positive review')
else:
    print('Negative Review')

In [None]:
# Same check as for the positive example, using the `vectorizer` and
# `classifier` currently bound in the kernel (the last model that was fit).
negative_review = ['I hate this']
review_transformed = vectorizer.transform(negative_review)
review_result = classifier.predict(review_transformed)

predicted_label = review_result[0]
print(predicted_label)
# A predicted label of 1 is reported as positive; anything else as negative.
if predicted_label == 1:
    print('Positive review')
else:
    print('Negative Review')

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Repeat the per-source experiment with TF-IDF features instead of raw counts.
#
#   max_df=0.90          ignore terms that occur in more than 90% of the documents
#   min_df=2             ignore terms that occur in fewer than 2 documents
#   max_features=1000    cap the vocabulary at 1000 terms
#   stop_words='english' drop scikit-learn's built-in English stop words
#
# As with the count-based loop, `vectorizer` and `classifier` are rebound here
# and end up referring to the objects fit on the last source iterated.
vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    # Same 75/25 split and seed for every source so the scores are comparable.
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    # fit_transform learns the vocabulary/IDF weights and encodes the training
    # sentences in one pass; a separate fit() followed by transform() analyses
    # them twice and yields the same matrix.
    X_train = vectorizer.fit_transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7600
Accuracy for amazon data: 0.7720
Accuracy for imdb data: 0.7005
