# Example: Consumer complaints

> Dataset: https://www.kaggle.com/dushyantv/consumer_complaints

In [None]:
# Relative path to the complaints CSV that the next cell loads with pandas.
data_path = "data/Consumer_Complaints.csv"

In [None]:
import pandas as pd

# Load the raw CSV. NOTE: on_bad_lines="skip" drops malformed rows without
# raising or warning, so the row count may be lower than the file's line count.
df = pd.read_csv(data_path, on_bad_lines="skip")
# Preview the first three rows.
df.head(3)

In [None]:
# (rows, columns) of the raw frame as loaded.
df.shape

In [None]:
# Column dtypes and non-null counts; memory_usage="deep" asks pandas to measure
# the real size of object (string) columns instead of estimating it.
df.info(memory_usage="deep")

In [None]:
# Narrow the frame to the two columns used from here on: the product label
# and the free-text complaint.
col = ['Product', 'Consumer Complaint']
df = df.loc[:, col]

In [None]:
# Preview the two-column frame.
df.head()

In [None]:
# Number of distinct values in the 'Product' column.
len(df['Product'].unique())

In [None]:
# The distinct 'Product' values themselves, in order of first appearance.
df['Product'].unique()

In [None]:
# Keep only rows that actually contain complaint text. The explicit .copy()
# makes the filtered frame an independent object, so the column assignment
# below does not write into a slice of the previous frame (which pandas
# reports as SettingWithCopyWarning).
df = df[pd.notnull(df['Consumer Complaint'])].copy()
df.columns = ['product', 'consumer_complaint']
# Integer code per product, numbered in order of first appearance.
df['category_id'] = df['product'].factorize()[0]

# One row per (product, category_id) pair, ordered by id.
category_id_df = df[['product', 'category_id']].drop_duplicates().sort_values('category_id')

# Lookup tables in both directions: product name -> id and id -> product name.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'product']].values)
df.head()

In [None]:
# (rows, columns) after dropping rows with no complaint text and adding category_id.
df.shape

In [None]:
# Dtypes, non-null counts and deep memory footprint of the cleaned frame.
df.info(memory_usage="deep")

In [None]:
# Down-sample to a 10% random subset of rows (fixed seed for reproducibility).
# NOTE(review): this overwrites `df` with its own sample, so re-running only this
# cell shrinks the frame again; re-run from the load cell to restore the full data.
df = df.sample(frac=0.1, random_state=0)

In [None]:
# (rows, columns) of the 10% sample.
df.shape

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many complaints each product has in the sampled frame.
fig, ax = plt.subplots(figsize=(8,6))

complaints_per_product = df.groupby('product')['consumer_complaint'].count()
complaints_per_product.plot.bar(ylim=0, ax=ax)
plt.show()

**Text Representation**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams:
#   sublinear_tf=True  -> term frequency is scaled as 1 + log(tf)
#   min_df=20          -> ignore terms that occur in fewer than 20 documents
#   norm='l2'          -> each document vector is scaled to unit Euclidean length
#   stop_words         -> drop scikit-learn's built-in English stop-word list
#   encoding           -> only consulted when the input is bytes/files, not str
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=20, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# Keep the result as the sparse matrix that fit_transform returns. Calling
# .toarray() would allocate a dense (n_documents x n_terms) float array, and
# the consumers of `features` here (.shape and sklearn's chi2) accept sparse input.
features = tfidf.fit_transform(df["consumer_complaint"])

In [None]:
# Integer class id for every row of `features` (same row order as `df`).
labels = df["category_id"]

# (n_documents, n_terms) of the TF-IDF matrix.
features.shape

In [None]:
from sklearn.feature_selection import chi2
import numpy as np

# How many top-scoring terms to print per n-gram size.
N = 2

# The vocabulary and each term's word count do not depend on the category,
# so compute them once instead of on every loop iteration.
feature_names = np.array(tfidf.get_feature_names_out())
words_per_term = np.array([len(name.split(' ')) for name in feature_names], dtype=int)

for product, category_id in sorted(category_to_id.items()):
    # One-vs-rest target: True for rows belonging to this product.
    is_category = labels == category_id
    if not is_category.any():
        # category_to_id was built before the frame was down-sampled, so a rare
        # product can have no rows left. chi2 against a constant target has no
        # meaningful scores, so skip instead of printing arbitrary terms.
        print("# '{}': no samples in the current data, skipped".format(product))
        continue

    # chi2 returns (scores, p-values); rank terms by ascending score.
    scores = chi2(features, is_category)[0]
    indices = np.argsort(scores)
    ranked_names = feature_names[indices]
    ranked_word_counts = words_per_term[indices]
    unigrams = ranked_names[ranked_word_counts == 1]
    bigrams = ranked_names[ranked_word_counts == 2]
    print("# '{}':".format(product))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Inputs are the raw complaint texts; targets are the product names (strings).
complaint_texts = df['consumer_complaint']
product_names = df['product']

# Default split proportions, fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    complaint_texts,
    product_names,
    random_state=0,
)

# Fit a default-configured TF-IDF vocabulary on the training texts only and
# vectorize them in the same step.
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)

In [None]:
# Train a multinomial Naive Bayes classifier (default settings) on the TF-IDF
# training matrix; fit() returns the estimator itself, which is kept as `clf`.
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [None]:
# Sanity check: vectorize one hand-written complaint with the fitted vocabulary
# and print the product the classifier predicts for it.
print(clf.predict(tfidf_vect.transform(["""This company refuses to provide me verification and validation of 
                                         debt per my right under the FDCPA. 
                                         I do not believe this debt is mine."""])))

In [None]:
# Vectorize the held-out texts with the vocabulary fitted on the training split
# (transform only, no refitting).
X_test_tfidf = tfidf_vect.transform(X_test)

# Mean accuracy of the classifier on the held-out split.
clf.score(X_test_tfidf, y_test)

In [None]:
# Predicted product name for every held-out complaint.
y_pred = clf.predict(X_test_tfidf)

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Pin the class order explicitly. Without `labels=`, confusion_matrix orders its
# rows/columns by the *sorted* labels found in y_test/y_pred, whereas
# category_id_df lists products in order of first appearance. Using one array
# for both the matrix and the tick labels keeps them aligned, and keeps the
# matrix square over all known classes even if one is missing from the split.
class_names = category_id_df["product"].values
conf_mat = confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots(figsize=(10,10))
# Rows are actual classes, columns are predicted classes; cells are raw counts.
sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax,
            xticklabels=class_names, yticklabels=class_names)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()