In [62]:
import nltk
import itertools
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier

In [2]:
data = pd.read_csv('complaints.csv', sep='\t')
data.head()
y = data["PRODUCT_ID"]
X = data["cleaned_text"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

### Voting classifier

In [28]:
clf1 = LogisticRegression(multi_class='multinomial', random_state=15, max_iter=150)
clf2 = SGDClassifier(loss='log', max_iter=1000)
clf3 = MultinomialNB()

eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('sgd', clf2), ('mnb', clf3)], voting='soft')

voting = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features=500)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)), 
    ('clf', eclf),
    ])
voting = voting.fit(X_train, y_train)

In [29]:
predictions = voting.predict(X_test)
print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

Precision:   0.68
Recall:   0.68
F1-measure:   0.68
Accuracy:   0.67


### Bagging

In [56]:
clf1 = LogisticRegression(multi_class='multinomial', random_state=3, max_iter=90)
clf2 = ExtraTreesClassifier(n_estimators=30, random_state=34, max_depth=24)
clf3 = GaussianNB()

In [57]:
bagging_1 = BaggingClassifier(base_estimator=clf1, n_estimators=30)
bagging_2 = BaggingClassifier(base_estimator=clf2, n_estimators=30)
bagging_3 = BaggingClassifier(base_estimator=clf3, n_estimators=30)

In [58]:
bagging = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features=200)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)), 
    ('clf', bagging_1),
    ])
bagging = bagging.fit(X_train, y_train)
predictions = bagging.predict(X_test)
print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Precision:   0.65
Recall:   0.65
F1-measure:   0.65
Accuracy:   0.65


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [59]:
bagging = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features=200)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)), 
    ('clf', bagging_2),
    ])
bagging = bagging.fit(X_train, y_train)
predictions = bagging.predict(X_test)
print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

Precision:   0.72
Recall:   0.71
F1-measure:   0.71
Accuracy:   0.71


In [60]:
bagging = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features=200)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)), 
    ('clf', bagging_3),
    ])
bagging = bagging.fit(X_train, y_train)
predictions = bagging.predict(X_test)
print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

Precision:   0.55
Recall:   0.55
F1-measure:   0.53
Accuracy:   0.54


### Boosting

In [63]:
clf1 = DecisionTreeClassifier(criterion='entropy', max_depth=1)
clf2 = LogisticRegression(multi_class='multinomial', random_state=53, max_iter=150)
clf3 = ExtraTreesClassifier(n_estimators=25, random_state=4, max_depth=18)

In [65]:
ab1 = AdaBoostClassifier(base_estimator=clf1, n_estimators=1)   
ab2 = AdaBoostClassifier(base_estimator=clf2, n_estimators=5)  
ab3 = AdaBoostClassifier(base_estimator=clf3, n_estimators=15)   

In [66]:
ab = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features=200)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)), 
    ('clf', ab1),
    ])
ab = ab.fit(X_train, y_train)
predictions = ab.predict(X_test)
print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

Precision:   0.24
Recall:   0.30
F1-measure:   0.21
Accuracy:   0.32


  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
ab = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features=200)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)), 
    ('clf', ab2),
    ])
ab = ab.fit(X_train, y_train)
predictions = ab.predict(X_test)
print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

Precision:   0.20
Recall:   0.34
F1-measure:   0.23
Accuracy:   0.36


  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
ab = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_features=200)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)), 
    ('clf', ab3),
    ])
ab = ab.fit(X_train, y_train)
predictions = ab.predict(X_test)
print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

Precision:   0.71
Recall:   0.70
F1-measure:   0.70
Accuracy:   0.70
