https://www.kaggle.com/dushyantv/consumer_complaints

In [None]:
import pandas as pd

In [None]:
# Load the consumer-complaints CSV (relative path) into a DataFrame.
df = pd.read_csv('../dataset/Consumer_Complaints.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
from io import StringIO

In [None]:
# Show the column labels of the frame; two of them are selected in the next cell.
df.columns

In [None]:
# Keep only the product label and the complaint text, dropping rows that
# have no complaint text.
col = ['Product', 'Consumer Complaint']
# .copy() makes the subset an independent frame, so the column assignment
# below does not write into a slice of the original data (this is what
# triggers pandas' SettingWithCopyWarning).
df = df.loc[df['Consumer Complaint'].notnull(), col].copy()
df.columns = ['Product', 'Consumer_complaint_narrative']
# Encode each distinct product as an integer id.
df['category_id'] = df['Product'].factorize()[0]
# One row per (Product, category_id) pair, ordered by id.
category_id_df = df[['Product', 'category_id']].drop_duplicates().sort_values('category_id')
# Lookup tables in both directions between product name and integer id.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Product']].values)
df.head()

In [None]:
# Imbalanced classes: bar chart of the number of complaints per product.
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8, 6))
complaints_per_product = df.groupby('Product')['Consumer_complaint_narrative'].count()
complaints_per_product.plot.bar(ylim=0, ax=ax)
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

sublinear_tf 设为 True，从而对词频取对数（用 1 + log(tf) 代替原始词频）。

min_df 是一个词要被保留所必须出现的最少文档数。

norm 设为 l2，以确保我们所有特征向量的欧几里德范数为 1。

ngram_range 设为 (1, 2)，表示我们想要考虑 unigrams 和 bigrams。

stop_words 设为 "english"，以删除常见的英文停用词（"a"、"the" 等），从而减少噪音特征的数量。

In [None]:
# Build TF-IDF features over unigrams and bigrams of the complaint text.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)
# Fit the vocabulary and materialise the document-term matrix as a dense array.
tfidf_matrix = tfidf.fit_transform(df['Consumer_complaint_narrative'])
features = tfidf_matrix.toarray()
# The integer category ids are the classification targets.
labels = df['category_id']
features.shape

In [None]:
# Find the terms most correlated with each product (chi-squared test).
from sklearn.feature_selection import chi2
import numpy as np
N = 2
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
# The vocabulary does not depend on the product, so fetch it once here
# instead of on every loop iteration.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())
for Product, category_id in sorted(category_to_id.items()):
    # Score every feature against "is this product" vs. "is any other product".
    features_chi2 = chi2(features, labels == category_id)
    # Ascending sort: the highest-scoring terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Product))
    print(" . Most correlated unigrams:\n. {}".format(
        '\n. '.join(unigrams[-N:])))
    print(" . Most correlated bigrams:\n. {}".format(
        '\n. '.join(bigrams[-N:])))

In [None]:
# Naive Bayes classifier: the multinomial variant is the one best suited to word counts.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
# Split the raw complaint text and the product names into train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    df['Consumer_complaint_narrative'],
    df['Product'],
    random_state=0,
)
# Term counts, with the vocabulary learned from the training split only.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
# Re-weight the counts with TF-IDF before fitting the classifier.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [None]:
# With the model fitted on the training set, make a prediction.
# clf was trained on TF-IDF-weighted counts, so new text has to go through
# the same two steps (counts -> TF-IDF) before predict(); feeding raw counts
# would give the classifier features on a different scale than it was fitted on.
print(clf.predict(tfidf_transformer.transform(count_vect.transform(
    ["This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."]))))

In [None]:
# Look up the rows whose complaint text is exactly this sentence, to see the recorded product.
debt_complaint_text = "This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."
df.loc[df['Consumer_complaint_narrative'] == debt_complaint_text]

In [None]:
# clf was trained on TF-IDF-weighted counts, so apply the fitted
# tfidf_transformer after count_vect instead of predicting on raw counts.
print(clf.predict(tfidf_transformer.transform(count_vect.transform(["I am disputing the inaccurate information the Chex-Systems has on my credit report. I initially submitted a police report on XXXX/XXXX/16 and Chex Systems only deleted the items that I mentioned in the letter and not all the items that were actually listed on the police report. In other words they wanted me to say word for word to them what items were fraudulent. The total disregard of the police report and what accounts that it states that are fraudulent. If they just had paid a little closer attention to the police report I would not been in this position now and they would n't have to research once again. I would like the reported information to be removed : XXXX XXXX XXXX"]))))

In [None]:
# Look up the rows whose complaint text is exactly this passage, to see the recorded product.
credit_report_complaint_text = "I am disputing the inaccurate information the Chex-Systems has on my credit report. I initially submitted a police report on XXXX/XXXX/16 and Chex Systems only deleted the items that I mentioned in the letter and not all the items that were actually listed on the police report. In other words they wanted me to say word for word to them what items were fraudulent. The total disregard of the police report and what accounts that it states that are fraudulent. If they just had paid a little closer attention to the police report I would not been in this position now and they would n't have to research once again. I would like the reported information to be removed : XXXX XXXX XXXX"
df.loc[df['Consumer_complaint_narrative'] == credit_report_complaint_text]

* Logistic 回归
* （多项式）朴素贝叶斯
* 线性支持向量机
* 随机森林

In [None]:
# Benchmark four classifiers on the TF-IDF features with cross-validation
# and plot the per-fold accuracies.
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5  # number of cross-validation folds
# One (model_name, fold_idx, accuracy) record per model and fold.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(
        model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        # Must sit inside the inner loop so that every fold is recorded.
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
# Box plot for the spread, with the individual fold scores overlaid.
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [None]:
# Model evaluation: fit LinearSVC on a train split and plot the confusion
# matrix of its predictions on the held-out split.
from sklearn.metrics import confusion_matrix
model = LinearSVC()
# df.index is split alongside the data so that test rows can be traced back to df.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features, labels, df.index, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Pass the full, ordered list of category ids so that row/column i always
# corresponds to category id i, even if some class is missing from the
# test split; the tick labels below rely on that ordering.
conf_mat = confusion_matrix(
    y_test, y_pred, labels=category_id_df.category_id.values)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax,
            xticklabels=category_id_df.Product.values, yticklabels=category_id_df.Product.values)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# A large number of misclassifications still remain: for every
# (actual, predicted) pair that is confused at least 10 times, print the
# count and display the affected complaints.
from IPython.display import display
for predicted in category_id_df.category_id:
    for actual in category_id_df.category_id:
        if predicted != actual and conf_mat[actual, predicted] >= 10:
            print("'{}' predicted as '{}' : {} examples.".format(
                id_to_category[actual], id_to_category[predicted], conf_mat[actual, predicted]))
            # Select the test rows with this actual/predicted combination and
            # map them back to df through the index labels kept from the split.
            confused = (y_test == actual) & (y_pred == predicted)
            display(df.loc[indices_test[confused],
                           ['Product', 'Consumer_complaint_narrative']])
            print('')

In [None]:
# Refit the model on all the data and list, for each category, the terms
# with the largest model coefficients (this ranks by the fitted weights,
# not by chi-squared scores).
model.fit(features, labels)
N = 2
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
# Fetched once, since the vocabulary is the same for every category.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())
for Product, category_id in sorted(category_to_id.items()):
    # Ascending sort of this category's coefficients; reversed below so the
    # largest weights come first.
    indices = np.argsort(model.coef_[category_id])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in reversed(
        feature_names) if len(v.split(' ')) == 1][:N]
    bigrams = [v for v in reversed(
        feature_names) if len(v.split(' ')) == 2][:N]
    print("# '{}':".format(Product))
    print(" . Top unigrams:\n . {}".format('\n . '.join(unigrams)))
    print(" . Top bigrams:\n . {}".format('\n . '.join(bigrams)))

In [None]:
# Print the classification report for each class.
from sklearn import metrics
# Give the category ids and their names explicitly (both taken from
# category_id_df, which is ordered by id) so that each name is tied to its
# id and the report does not fail when the test split lacks a class.
print(metrics.classification_report(
    y_test, y_pred,
    labels=category_id_df.category_id.values,
    target_names=category_id_df.Product.values))