# Text Classification
 - Reference tutorial: https://realpython.com/python-keras-text-classification/

## 1. Choosing a Data Set

In [2]:
import pandas as pd

# Each source name maps to a tab-separated file whose rows are read as
# (sentence, label) pairs.
filepath_dict = {
    'yelp': 'dataset/yelp_labelled.txt',
    'amazon': 'dataset/amazon_cells_labelled.txt',
    'imdb': 'dataset/imdb_labelled.txt',
}

# Read every file into its own frame and tag its rows with the source name.
df_list = []
for source, filepath in filepath_dict.items():
    frame = pd.read_csv(filepath, sep='\t', names=['sentence', 'label'])
    df_list.append(frame.assign(source=source))

# Stack the per-source frames; each frame keeps its own row index.
df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [8]:
# Show the concatenated DataFrame (sentence, label, source columns).
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


### Check: CountVectorizer

In [19]:
# Tiny toy corpus used to try out CountVectorizer.
sentences = [
    'John likes ice cream',
    'John hates chocolate.',
    'John likes John',
]

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the vocabulary from the toy sentences. lowercase=False preserves case,
# so 'John' stays capitalised in the vocabulary.
# min_df is left at its default of 1: an integer min_df of 0 is rejected by the
# parameter validation in recent scikit-learn releases, and a cutoff of 1 keeps
# the same terms because every vocabulary term occurs in at least one document.
vectorizer = CountVectorizer(lowercase=False)
vectorizer.fit(sentences)
# Mapping of each term to its column index in the count matrix.
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [21]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an ndarray, so convert
# it to a plain list to keep the same list display as before.
vectorizer.get_feature_names_out().tolist()

['John', 'chocolate', 'cream', 'hates', 'ice', 'likes']

In [22]:
# Dense count matrix: one row per sentence, one column per vocabulary term.
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0],
       [2, 0, 0, 0, 0, 1]])

## 2. Defining a Baseline Model

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Keep only the rows whose source is 'yelp'.
df_yelp = df.loc[df['source'] == 'yelp']

In [33]:
# Pull the raw text and the labels out of the Yelp frame as NumPy arrays.
sentences = df_yelp['sentence'].to_numpy()
y = df_yelp['label'].to_numpy()

In [38]:
# Hold out 25% of the Yelp sentences for testing; the fixed random_state
# makes the split repeatable.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=100,
)

In [39]:
# Peek at the first sentence of the training split.
sentences_train[0]

'We will not be coming back.'

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
# Learn the vocabulary from the training sentences only, using the
# vectorizer's default settings.
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

CountVectorizer()

In [54]:
# Number of sentences in the training split.
len(sentences_train)

750

In [53]:
# Vectorize both splits with the vocabulary learned from the training data.
X_train, X_test = (
    vectorizer.transform(split) for split in (sentences_train, sentences_test)
)
# Display the sparse training matrix summary.
X_train

<750x1724 sparse matrix of type '<class 'numpy.int64'>'
	with 7422 stored elements in Compressed Sparse Row format>

In [62]:
# Print the stored entries of the first training row as (row, column) count pairs.
print(X_train[0])

  (0, 112)	1
  (0, 136)	1
  (0, 304)	1
  (0, 1012)	1
  (0, 1655)	1
  (0, 1681)	1


In [61]:
# Densify the first training row and read the count stored in column 112.
print(X_train[0].toarray()[0, 112])

1


In [52]:
# Map the first row's stored columns back to their vocabulary terms.
vectorizer.inverse_transform(X_train[0])

[array(['back', 'be', 'coming', 'not', 'we', 'will'], dtype='<U17')]

## 3. LogisticRegression

In [63]:
from sklearn.linear_model import LogisticRegression

In [70]:
# Logistic-regression classifier constructed with its default settings.
classifier = LogisticRegression()

In [71]:
# Train on the vectorized training sentences and their labels.
classifier.fit(X_train, y_train)

LogisticRegression()

In [72]:
# Evaluate the fitted classifier on the held-out test split.
score = classifier.score(X_test, y_test)
print(f"Accuracy: {score}")

Accuracy: 0.808


In [75]:
# Repeat the baseline experiment independently for every data source.
for source in df['source'].unique():
    df_source = df.loc[df['source'] == source]
    sentences, y = df_source['sentence'].values, df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences,
        y,
        test_size=0.25,
        random_state=1000,
    )

    # The vocabulary is learned from the training split only; the test split
    # is encoded with that same vocabulary.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    # fit() returns the estimator itself, so construction and training chain.
    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print(f'Accuracy for {source} data: {score:.4f}')

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487
